In [1]:
# Data Analysis
import pandas as pd
import numpy as np

# Machine Learning Models  
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


# Data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Evaluation Metric
from sklearn.metrics import f1_score as f1


# Deployment 
import pickle

In [2]:
df = pd.read_csv('BankChurners.csv')
df.drop('CLIENTNUM', axis = 1,inplace = True)
df.drop(df.columns[-2:], axis=1, inplace = True)

In [3]:
df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  object 
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

In [5]:
df.describe()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [6]:
df.shape

(10127, 20)

In [7]:
#Getting all the variables from the dataset which are of non-number values.
categorical_features= [feature for feature in df.columns if df[feature].dtypes == 'O' and feature !='Attrition_Flag']
categorical_features

['Gender',
 'Education_Level',
 'Marital_Status',
 'Income_Category',
 'Card_Category']

In [64]:
df.Attrition_Flag = df.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0})

from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()

# apply le on categorical feature columns
df[categorical_features] = df[categorical_features].apply(lambda col: le.fit_transform(col))    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(df[categorical_features])

#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=df.index)

#Extract only the columns that didnt need to be encoded
data_other_cols = df.drop(columns=categorical_features)

#Concatenate the two dataframes : 
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [8]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(drop='first')

transformed_data = onehotencoder.fit_transform(df[['Gender']])

# the above transformed_data is an array so convert it to dataframe
encoded_data = pd.DataFrame(transformed_data, index=df.index)
encoded_data

# now concatenate the original data and the encoded data using pandas
concatenated_data = pd.concat([df, encoded_data], axis=1)

In [9]:
encoded_data.iloc[3]

0    
Name: 3, dtype: object

In [56]:
df['Income_Category'].unique()

array(['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K',
       '$120K +', 'Unknown'], dtype=object)

In [66]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

transformed_data1 = ohe.fit_transform(df[['Gender']]).toarray()
transformed_data2 = ohe.fit_transform(df[['Education_Level']]).toarray()
transformed_data3 = ohe.fit_transform(df[['Marital_Status']]).toarray()
transformed_data4 = ohe.fit_transform(df[['Income_Category']]).toarray()
transformed_data5 = ohe.fit_transform(df[['Card_Category']]).toarray()
# the above transformed_data is an array so convert it to dataframe
encoded_data1 = pd.DataFrame(transformed_data1, index=df.index,columns=['male','female'])
encoded_data2 = pd.DataFrame(transformed_data2, index=df.index,columns=['High School', 'Graduate', 'Uneducated', 'Unknown', 'College',
       'Post-Graduate', 'Doctorate'])
encoded_data3 = pd.DataFrame(transformed_data3, index=df.index,columns=['Married','Single', 'Unknown', 'Divorced'])
encoded_data4 = pd.DataFrame(transformed_data4, index=df.index,columns=['$60K - $80K','Less than $40K', '$80K - $120K', '$40K - $60K',
       '$120K +', 'Unknown'])
encoded_data5 = pd.DataFrame(transformed_data5, index=df.index,columns=['Blue','Gold', 'Silver', 'Platinum'])
# now concatenate the original data and the encoded data using pandas
concatenated_data = pd.concat([df, encoded_data1,encoded_data2,encoded_data3,encoded_data4,encoded_data5], axis=1)

In [67]:
# ohe.fit_transform(df[['Card_Category']]).toarray()
concatenated_data.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,$60K - $80K,Less than $40K,$80K - $120K,$40K - $60K,$120K +,Unknown,Blue,Gold,Silver,Platinum
0,0,45,M,3,High School,Married,$60K - $80K,Blue,39,5,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
concatenated_data.

In [None]:
# df.Attrition_Flag = df.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0})
# df.Gender = df.Gender.replace({'F':1,'M':0})
# df = pd.concat([df,pd.get_dummies(df['Education_Level'])#.drop(columns=['Unknown'])],axis=1)
# df = pd.concat([df,pd.get_dummies(df['Income_Category']).drop(columns=['Unknown'])],axis=1)
# df = pd.concat([df,pd.get_dummies(df['Marital_Status']).drop(columns=['Unknown'])],axis=1)
# df = pd.concat([df,pd.get_dummies(df['Card_Category']).drop(columns=['Platinum'])],axis=1)
# df.drop(columns = ['Education_Level','Income_Category','Marital_Status','Card_Category'],inplace=True)

In [68]:
df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,0,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,0,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,0,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,0,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [None]:
df.drop(df.columns[-11:],axis=1) # never used 

In [None]:
oversample = SMOTE()   # upsampling only for training 
X, y = oversample.fit_resample(df[df.columns[1:]], df[df.columns[0]])
usampled_df = X.assign(Churn = y)
usampled_df.head()

In [None]:
ohe_data =usampled_df[usampled_df.columns[15:-1]].copy()

usampled_df = usampled_df.drop(columns=usampled_df.columns[15:-1])

print(usampled_df.shape)
usampled_df.head()

In [None]:
N_COMPONENTS = 4

pca_model = PCA(n_components = N_COMPONENTS )
pc = pca_model.fit(ohe_data)
pc_matrix = pc.transform(ohe_data)

In [None]:
pc_matrix

In [None]:
# Creating a Pickle for PCA

with open('PCA.pickle', 'wb') as pickle_out:
    pickle.dump(pc, pickle_out)

In [None]:
usampled_df_with_pcs = pd.concat([usampled_df,pd.DataFrame(pc_matrix,columns=['PC-{}'.format(i) for i in range(0,N_COMPONENTS)])],axis=1)

In [None]:
X_features = ['Total_Trans_Ct','PC-3','PC-1','PC-0','PC-2','Total_Ct_Chng_Q4_Q1','Total_Relationship_Count']

X = usampled_df_with_pcs[X_features]
y = usampled_df_with_pcs['Churn']

In [None]:
train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=42,test_size=0.2)

In [None]:
rf = RandomForestClassifier(n_estimators=100,random_state=42)
rf.fit(train_x,train_y)

In [None]:
rf.score(test_x,test_y)

In [None]:
f1_cross_val_scores = cross_val_score(rf,train_x,train_y,cv=5,scoring='f1')

In [None]:
# Creating Pickle for Pipeline

with open('model.pickle', 'wb') as pickle_out:
    pickle.dump(rf, pickle_out)

In [None]:
# Creating Pickle for Pipeline

with open('model.pickle', 'wb') as pickle_out:
    pickle.dump(rf, pickle_out)

## Training

In [None]:
rf.fit(train_x,train_y)
rf_prediction = rf.predict(test_x)

In [None]:
np.round(f1(rf_prediction,test_y),2)

## Inference 

In [None]:
ohe_data.iloc[0,:]

In [None]:
PCA = pickle.load(open("PCA.pickle", "rb"))

encoded_data = ohe_data.iloc[0,:]
test_pc = PCA.transform([encoded_data])

In [None]:
test_pc

In [None]:
concat_values = pd.concat([pd.DataFrame(usampled_df.iloc[0,:]).T,pd.DataFrame(test_pc,columns=['PC-{}'.format(i) for i in range(0,N_COMPONENTS)])],axis=1)

In [None]:
test = concat_values[X_features]

In [None]:
model = pickle.load(open("model.pickle", "rb"))
model.predict(test)