In [45]:
import sklearn
import pandas as pd
import numpy as np
import boto3
import pprint
import os
import time

from numpy.random import RandomState
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV

In [26]:
# Load the raw table from a CSV expected in the notebook's working directory.
df = pd.read_csv('Churn_Modelling.csv')

In [27]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [40]:
# List the raw frame's column labels.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [42]:
# Keep CreditScore through Exited; RowNumber, CustomerId and Surname are
# left out of the working frame.
churn_data = df.loc[:, [
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Exited',
]]

In [43]:
# Preview the first rows of the selected columns.
churn_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [50]:
# (rows, columns) of the selected subset.
churn_data.shape

(10000, 11)

In [46]:
# One-hot encode the columns at positions 1 and 2 of `churn_data`
# (Geography and Gender in the selection order used above) and pass every
# other column through unchanged.
columnTransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 2])],
    remainder='passthrough',
)

In [47]:
# Fit the transformer on `churn_data` and store the result as a NumPy array
# of strings.
# `np.str` was only an alias for the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError. The builtin
# yields the same unicode dtype.
# NOTE(review): with remainder='passthrough' the `Exited` column ends up last
# (index 13). Some SageMaker built-in algorithms expect the CSV label in the
# first column; confirm against the training job that consumes these files.
data = np.array(columnTransformer.fit_transform(churn_data), dtype=str)

In [48]:
# (rows, columns) of the encoded array; the 11 input columns become 14.
data.shape

(10000, 14)

In [51]:
# Echo the encoded array; NumPy abbreviates the middle rows/columns with '...'.
data

array([['1.0', '0.0', '0.0', ..., '1.0', '101348.88', '1.0'],
       ['0.0', '0.0', '1.0', ..., '1.0', '112542.58', '0.0'],
       ['1.0', '0.0', '0.0', ..., '0.0', '113931.57', '1.0'],
       ...,
       ['1.0', '0.0', '0.0', ..., '1.0', '42085.58', '1.0'],
       ['0.0', '1.0', '0.0', ..., '0.0', '92888.52', '1.0'],
       ['1.0', '0.0', '0.0', ..., '0.0', '38190.78', '0.0']], dtype='<U32')

In [52]:
# Wrap the encoded array in a DataFrame. No column names are supplied, so the
# columns get default integer labels, and every value stays a string because
# `data` was built with a string dtype.
df_final = pd.DataFrame(data)

In [53]:
# Preview the encoded frame (columns are labelled 0-13).
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,1.0,0.0,619.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0
1,0.0,0.0,1.0,1.0,0.0,608.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0
2,1.0,0.0,0.0,1.0,0.0,502.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0
3,1.0,0.0,0.0,1.0,0.0,699.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0.0
4,0.0,0.0,1.0,1.0,0.0,850.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0.0


In [63]:
# Seed the generator so the split is identical on every Restart & Run All;
# an unseeded RandomState() draws a different 70% each time, which makes the
# uploaded files and any downstream metrics irreproducible.
rng = RandomState(42)

# Draw 70% of the rows for training.
train = df_final.sample(frac=0.7, random_state=rng)

# Everything not drawn into `train` is held out (split again further down).
test = df_final.loc[~df_final.index.isin(train.index)]

In [64]:
# Seed the generator so the validation/test partition is reproducible;
# an unseeded RandomState() would pick different rows on every run.
rng = RandomState(42)

# Half of the held-out rows become the validation set.
validation = test.sample(frac=0.5, random_state=rng)

# The remaining held-out rows stay as the final test set.
test = test.loc[~test.index.isin(validation.index)]

In [65]:
# (rows, columns) of the training split, sampled with frac=0.7.
train.shape

(7000, 14)

In [66]:
# (rows, columns) of the test split left after removing the validation rows.
test.shape

(1500, 14)

In [67]:
# (rows, columns) of the validation split, sampled with frac=0.5 from the hold-out rows.
validation.shape

(1500, 14)

In [69]:
# Write the training split as bare CSV: no header row and no index column.
train.to_csv('train.csv', header=False, index=False)

In [70]:
# Write the test split as bare CSV: no header row and no index column.
test.to_csv('test.csv', header=False, index=False)

In [71]:
# Write the validation split as bare CSV: no header row and no index column.
validation.to_csv('validation.csv', header=False, index=False)

In [68]:
# Preview the training split; row labels are the rows' labels in `df_final`.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
4649,0.0,1.0,0.0,0.0,1.0,670.0,31.0,1.0,142631.54,2.0,1.0,1.0,175894.24,0.0
4918,1.0,0.0,0.0,1.0,0.0,717.0,36.0,2.0,99472.76,2.0,1.0,0.0,94274.72,1.0
9894,1.0,0.0,0.0,1.0,0.0,521.0,77.0,6.0,0.0,2.0,1.0,1.0,49054.1,0.0
4375,0.0,1.0,0.0,0.0,1.0,850.0,41.0,8.0,60880.68,1.0,1.0,0.0,31825.84,0.0
5038,1.0,0.0,0.0,1.0,0.0,612.0,63.0,2.0,126473.33,1.0,0.0,1.0,147545.65,0.0


In [72]:
# Destination bucket for every artefact written by this notebook.
bucket_name = 'saurav-ml-sagemaker'

# Key prefixes, one per data split (each ends with '/').
training_folder = 'churnmodel/training/'
validation_folder = 'churnmodel/validation/'
test_folder = 'churnmodel/test/'

# Full s3:// URIs derived from the bucket name and the prefixes above.
s3_model_output_location = 's3://{0}/churnmodel/model'.format(bucket_name)
s3_training_file_location, s3_validation_file_location, s3_test_file_location = (
    's3://{0}/{1}'.format(bucket_name, prefix)
    for prefix in (training_folder, validation_folder, test_folder)
)

In [73]:
# Show the resolved S3 locations, one per line, as a sanity check.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://saurav-ml-sagemaker/churnmodel/model
s3://saurav-ml-sagemaker/churnmodel/training/
s3://saurav-ml-sagemaker/churnmodel/validation/
s3://saurav-ml-sagemaker/churnmodel/test/


In [74]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 bucket.

    A new boto3 session is opened on every call, an S3 client is created
    from it, and the client's ``upload_file`` does the transfer.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination bucket.
        key: Object key to store the file under inside the bucket.

    Returns:
        None.
    """
    s3_client = boto3.session.Session().client('s3')
    s3_client.upload_file(filename, bucket, key)

In [75]:
# Upload the training CSV under the training prefix of the bucket.
write_to_s3(filename='train.csv', bucket=bucket_name, key=training_folder + 'train.csv')

In [76]:
# Upload the test CSV under the test prefix of the bucket.
write_to_s3(filename='test.csv', bucket=bucket_name, key=test_folder + 'test.csv')

In [77]:
# Upload the validation CSV under the validation prefix of the bucket.
write_to_s3(filename='validation.csv', bucket=bucket_name, key=validation_folder + 'validation.csv')

In [22]:
# NOTE(review): the saved output below carries execution count 22 and shows
# the raw named columns (RowNumber ... Exited), so it appears to come from an
# earlier run in which `train` was sampled from the raw frame. With `train`
# built from the encoded `df_final`, a fresh top-to-bottom run would show
# columns 0-13 here instead.
train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6706,6707,15732235,Kuykendall,662,France,Male,64,0,98848.19,1,0,1,42730.12,0
1669,1670,15582259,Campbell,567,France,Female,37,7,0.0,2,1,1,28690.9,0
626,627,15626900,Kung,427,France,Male,29,1,141325.56,1,1,1,93839.3,0
2071,2072,15697214,Korovin,686,Spain,Female,36,5,0.0,2,1,1,152979.14,0
4103,4104,15693337,Perry,683,Spain,Male,41,0,148863.17,1,1,1,163911.32,0


In [23]:
# Count missing values per column of `train`.
# NOTE(review): the saved output (execution count 23) lists the raw column
# names, so it appears to predate the encoded `train`; re-run to refresh it.
train.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [24]:
# Print dtypes, non-null counts and memory usage of `train`.
# NOTE(review): the saved output (execution count 24) describes the raw
# int/float/object columns, so it appears to predate the encoded `train`,
# whose values are all strings; re-run to refresh it.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 6706 to 7791
Data columns (total 14 columns):
RowNumber          7000 non-null int64
CustomerId         7000 non-null int64
Surname            7000 non-null object
CreditScore        7000 non-null int64
Geography          7000 non-null object
Gender             7000 non-null object
Age                7000 non-null int64
Tenure             7000 non-null int64
Balance            7000 non-null float64
NumOfProducts      7000 non-null int64
HasCrCard          7000 non-null int64
IsActiveMember     7000 non-null int64
EstimatedSalary    7000 non-null float64
Exited             7000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 820.3+ KB
