In [None]:
import pandas as pd
import numpy as np

# Load the churn dataset and turn it into a table that holds only encoded
# (numeric / indicator) columns plus the 0/1 'Churn' label.
df_churn = pd.read_csv("https://raw.githubusercontent.com/rujual/telco_churn_pipeline/master/Data1.csv")

# Columns in which a single-space string is treated as a missing value.
empty_cols = ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

# Blank (" ") cells become NaN so the dropna() below removes those rows.
df_churn[empty_cols] = df_churn[empty_cols].replace(" ", np.nan)

# The identifier carries no predictive signal; rows with missing values are dropped.
df_churn = df_churn.drop(columns=['customerID']).dropna()

# A column that contained blank strings is read as text and stays text after
# the NaN replacement. Convert the measurement columns explicitly instead of
# relying on the estimator to coerce strings to floats.
# NOTE(review): assumes these three are the only numeric-as-text candidates.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in numeric_cols:
    df_churn[col] = pd.to_numeric(df_churn[col])

# Yes/No flags -> 1/0.
binary_cols = ['Partner','Dependents','PhoneService','PaperlessBilling']
yes_no_codes = {"Yes": 1, "No": 0}
for col in binary_cols:
    df_churn[col] = df_churn[col].replace(yes_no_codes)

# Encoding column 'gender'
df_churn['gender'] = df_churn['gender'].replace({"Male": 1, "Female": 0})

# Multi-valued categoricals are one-hot encoded. Each indicator column is named
# "<column>#<value>"; the source columns are removed and the indicators are
# appended after the remaining columns, in the order listed here.
category_cols = ['PaymentMethod','MultipleLines','InternetService','OnlineSecurity',
               'OnlineBackup','DeviceProtection',
               'TechSupport','StreamingTV','StreamingMovies','Contract']

df_churn = pd.get_dummies(df_churn, columns=category_cols, prefix_sep="#", drop_first=False)

# Label: Yes/No -> 1/0.
df_churn['Churn'] = df_churn['Churn'].replace(yes_no_codes)


import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import json
import os

# Train a random-forest churn classifier on a train/test split and print the
# confusion matrix of its predictions on the held-out part.
df_churn = df_churn.dropna()

# NOTE(review): n_est is not referenced below; the forest is built with 50 trees.
n_est = 100

# Label vector and feature matrix.
y1 = df_churn['Churn']
X1 = df_churn.drop(columns=['Churn'])

X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

# max_features='auto' was deprecated and later removed from scikit-learn; for
# classifiers it meant 'sqrt', so 'sqrt' keeps the same model on every version.
rfc_best = RandomForestClassifier(
    random_state=42,
    max_features='sqrt',
    n_estimators=50,
    max_depth=8,
    criterion='gini',
)

rfc_best.fit(X_train, y_train)
y_test_pred = rfc_best.predict(X_test)
rf_score = rfc_best.score(X_test, y_test)  # mean accuracy on the test split
conf = confusion_matrix(y_test, y_test_pred)
print(conf)

In [None]:
# Reshape the confusion matrix into long form: one (target, predicted, count)
# row per pair of labels, walking the matrix row by row.
vocab = list(y_test.unique())  # labels in order of first appearance in y_test
cm = confusion_matrix(y_test, y_test_pred, labels=vocab)

# cm is square with one row/column per entry of vocab, so indexing by the
# label positions visits every cell exactly once.
data = [
    (actual_label, predicted_label, cm[row, col])
    for row, actual_label in enumerate(vocab)
    for col, predicted_label in enumerate(vocab)
]

df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
df_cm

In [None]:
# Write the long-form confusion matrix as bare data rows. The column names are
# supplied by the 'schema' entry of the UI metadata, so a header row would be
# read as an extra (invalid) data row by a consumer that follows that schema.
# This also matches the ROC export in this notebook, which writes header=False.
# NOTE(review): confirm against the Kubeflow Pipelines output-viewer docs.
df_cm.to_csv('Conf_mat.csv', columns=['target', 'predicted', 'count'], header=False, index=False)

In [None]:
# UI metadata describing the confusion-matrix CSV: its column schema, where the
# file lives, and the class labels.
cm_schema = [
    {'name': 'target', 'type': 'CATEGORY'},
    {'name': 'predicted', 'type': 'CATEGORY'},
    {'name': 'count', 'type': 'NUMBER'},
]

# Labels are written as strings so they match the text in the CSV
# (e.g. boolean values must appear as "True"/"False").
cm_labels = [str(label) for label in vocab]

metadata = {
    'outputs': [
        {
            'type': 'confusion_matrix',
            'format': 'csv',
            'schema': cm_schema,
            'source': 'gs://mlopstest/Conf_mat.csv',
            'labels': cm_labels,
        }
    ]
}

# Written with the built-in open(); file_io.FileIO was the earlier alternative.
with open('metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


# Reduced UI metadata for the confusion-matrix CSV: only type, format and
# source are written here (no 'schema' and no 'labels' entry).
cm_output = {
    'type': 'confusion_matrix',
    'format': 'csv',
    'source': 'gs://mlopstest/Conf_mat.csv',
}
metadata = {'outputs': [cm_output]}

with open('metadata.json', 'w+') as metadata_file:
    json.dump(metadata, metadata_file)

# Accuracy on the held-out split, stored as a single percentage-formatted metric.
accuracy = accuracy_score(y_test, y_test_pred)
accuracy_metric = {
    'name': 'accuracy-score',
    'numberValue': accuracy,
    'format': "PERCENTAGE",
}
metrics = {'metrics': [accuracy_metric]}

# Written to a local file (an earlier variant targeted '/mlpipeline-metrics.json').
with open('metrics.json', 'w+') as metrics_file:
    json.dump(metrics, metrics_file)

In [None]:
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# A program to generate ROC data out of prediction results.
# Usage:
# python roc.py  \
#   --predictions=gs://bradley-playground/sfpd/predictions/part-* \
#   --trueclass=ACTION \
#   --output=gs://bradley-playground/sfpd/roc/ \


import argparse
import json
import os
from urllib.parse import urlparse
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score
from tensorflow.python.lib.io import file_io


def main(argv=None):
    """Compute ROC data from prediction files and write the result artifacts.

    Reads every prediction CSV matching ``--predictions`` (column names are
    taken from the ``schema.json`` located in the same directory), derives a
    0/1 ``target`` column, and writes:

    * ``roc.csv`` (fpr, tpr, thresholds; no header, no index) under ``--output``;
    * ``/mlpipeline-ui-metadata.json`` describing that CSV;
    * ``/mlpipeline-metrics.json`` containing the ROC AUC score.

    Args:
        argv: Optional list of command-line arguments. ``None`` lets argparse
            fall back to ``sys.argv[1:]``.

    Raises:
        ValueError: if the schema has no "target" column and no
            ``--target_lambda`` was supplied, or if ``--true_score_column``
            is not a column of the schema.
    """
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions', type=str, help='GCS path of prediction file pattern.')
    parser.add_argument('--trueclass', type=str, default='true',
                      help='The name of the class as true value. If missing, assuming it is ' +
                           'binary classification and default to "true".')
    parser.add_argument('--true_score_column', type=str, default='true',
                      help='The name of the column for positive prob. If missing, assuming it is ' +
                           'binary classification and defaults to "true".')
    parser.add_argument('--target_lambda', type=str,
                      help='a lambda function as a string to determine positive or negative.' +
                           'For example, "lambda x: x[\'a\'] and x[\'b\']". If missing, ' +
                           'input must have a "target" column.')
    parser.add_argument('--output', type=str, help='GCS path of the output directory.')
    # Honour the argv parameter (it used to be ignored in favour of sys.argv).
    args = parser.parse_args(argv)

    # `urlparse` is the function imported from urllib.parse, so it is called
    # directly. A non-empty scheme (e.g. "gs") marks a remote location.
    storage_service_scheme = urlparse(args.output).scheme
    on_cloud = bool(storage_service_scheme)

    # Local output directories are created on demand; remote ones are left alone.
    if not on_cloud:
        os.makedirs(args.output, exist_ok=True)

    # The schema file sits next to the prediction files and lists their columns.
    schema_file = os.path.join(os.path.dirname(args.predictions), 'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]

    if not args.target_lambda and 'target' not in names:
        raise ValueError('There is no "target" column, and target_lambda is not provided.')

    if args.true_score_column not in names:
        raise ValueError('Cannot find column name "%s"' % args.true_score_column)

    # Prediction files carry no header row; column names come from the schema.
    dfs = []
    for path in file_io.get_matching_files(args.predictions):
        with file_io.FileIO(path, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))

    df = pd.concat(dfs)

    if args.target_lambda:
        # SECURITY: eval() executes arbitrary code taken from the command line.
        # Only run this with a trusted --target_lambda value.
        df['target'] = df.apply(eval(args.target_lambda), axis=1)
    else:
        # Rows whose target equals --trueclass are positives (1), all others 0.
        df['target'] = df['target'].apply(lambda x: 1 if x == args.trueclass else 0)

    fpr, tpr, thresholds = roc_curve(df['target'], df[args.true_score_column])
    roc_auc = roc_auc_score(df['target'], df[args.true_score_column])
    df_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    roc_file = os.path.join(args.output, 'roc.csv')

    with file_io.FileIO(roc_file, 'w') as f:
        df_roc.to_csv(f, columns=['fpr', 'tpr', 'thresholds'], header=False, index=False)

    # UI metadata pointing at the ROC CSV and naming its columns.
    metadata = {
    'outputs': [{
      'type': 'roc',
      'format': 'csv',
      'schema': [
        {'name': 'fpr', 'type': 'NUMBER'},
        {'name': 'tpr', 'type': 'NUMBER'},
        {'name': 'thresholds', 'type': 'NUMBER'},
      ],
      'source': roc_file
    }]
    }

    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    metrics = {
    'metrics': [{
      'name': 'roc-auc-score',
      'numberValue':  roc_auc,
    }]
    }

    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)

# Entry point: run the ROC generator only when this code is executed directly.
if __name__ == "__main__":
    main()

In [1]:
from google.cloud import storage

In [2]:
 # """Uploads a file to the bucket."""
# Create the Cloud Storage client. It needs credentials to be discoverable,
# e.g. through the GOOGLE_APPLICATION_CREDENTIALS environment variable;
# otherwise the constructor raises DefaultCredentialsError.
storage_client = storage.Client()

# The bucket name is a string literal; a bare identifier here would raise
# NameError. 'mlopstest' is the bucket used in the gs:// artifact paths of this notebook.
bucket = storage_client.get_bucket('mlopstest')

DefaultCredentialsError: Could not automatically determine credentials. Please set GOOGLE_APPLICATION_CREDENTIALS or explicitly create credentials and re-run the application. For more information, please see https://cloud.google.com/docs/authentication/getting-started

In [None]:
# Upload the locally written confusion-matrix CSV to the bucket.
# These two names were never defined in the notebook. The values below match
# the file written by df_cm.to_csv('Conf_mat.csv', ...) and the
# 'gs://mlopstest/Conf_mat.csv' source recorded in the metadata.
# NOTE(review): adjust if a different object path is intended.
source_file_name = 'Conf_mat.csv'
destination_blob_name = 'Conf_mat.csv'

blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)

print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name))

In [None]:
#code to generate artifacts

import json
import os
from sklearn.metrics import confusion_matrix, accuracy_score

# Rebuild the long-form confusion matrix, then emit the metadata and metrics
# JSON files for it.
vocab = list(y_test.unique())  # labels in order of first appearance in y_test
cm = confusion_matrix(y_test, y_test_pred, labels=vocab)

# One (target, predicted, count) record per matrix cell, row by row.
data = [
    (actual_label, predicted_label, cm[row, col])
    for row, actual_label in enumerate(vocab)
    for col, predicted_label in enumerate(vocab)
]
df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])

# The CSV export stays disabled in this cell:
#df_cm.to_csv(conf_matr, columns=['target', 'predicted', 'count'], header=False, index=False)

cm_schema = [
    {'name': 'target', 'type': 'CATEGORY'},
    {'name': 'predicted', 'type': 'CATEGORY'},
    {'name': 'count', 'type': 'NUMBER'},
]

# Labels are written as strings so they match the text in the CSV
# (e.g. boolean values must appear as "True"/"False").
cm_labels = [str(label) for label in vocab]

metadata = {
    'outputs': [
        {
            'type': 'confusion_matrix',
            'format': 'csv',
            'schema': cm_schema,
            'source': 'conf_matr',
            'labels': cm_labels,
        }
    ]
}

print("meteadata: ",metadata)
with open("metadata_out.json", 'w+') as metadata_file:
    json.dump(metadata, metadata_file)

# Accuracy on the held-out split, stored as a single percentage-formatted metric.
accuracy = accuracy_score(y_test, y_test_pred)
accuracy_metric = {
    'name': 'accuracy-score',
    'numberValue': accuracy,
    'format': "PERCENTAGE",
}
metrics = {'metrics': [accuracy_metric]}

with open('metrics_out.json', 'w+') as metrics_file:
    json.dump(metrics, metrics_file)

print("\n\n\nmetrics: ",metrics)