In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max.columns", None)

In [2]:
churn = pd.read_csv('../data/mini-case-studies/churn.csv', sep=',')

In [3]:
churn.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [4]:
data_size = churn.shape
print(data_size)

(3333, 21)


In [5]:
churn_col_names = list(churn.columns)
print(churn_col_names)

['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']


In [6]:
print(churn.describe())

       Account Length    Area Code  VMail Message     Day Mins    Day Calls  \
count     3333.000000  3333.000000    3333.000000  3333.000000  3333.000000   
mean       101.064806   437.182418       8.099010   179.775098   100.435644   
std         39.822106    42.371290      13.688365    54.467389    20.069084   
min          1.000000   408.000000       0.000000     0.000000     0.000000   
25%         74.000000   408.000000       0.000000   143.700000    87.000000   
50%        101.000000   415.000000       0.000000   179.400000   101.000000   
75%        127.000000   510.000000      20.000000   216.400000   114.000000   
max        243.000000   510.000000      51.000000   350.800000   165.000000   

        Day Charge     Eve Mins    Eve Calls   Eve Charge   Night Mins  \
count  3333.000000  3333.000000  3333.000000  3333.000000  3333.000000   
mean     30.562307   200.980348   100.114311    17.083540   200.872037   
std       9.259435    50.713844    19.922625     4.310668    50.57

In [7]:
churn['State'].unique()

array(['KS', 'OH', 'NJ', 'OK', 'AL', 'MA', 'MO', 'LA', 'WV', 'IN', 'RI',
       'IA', 'MT', 'NY', 'ID', 'VT', 'VA', 'TX', 'FL', 'CO', 'AZ', 'SC',
       'NE', 'WY', 'HI', 'IL', 'NH', 'GA', 'AK', 'MD', 'AR', 'WI', 'OR',
       'MI', 'DE', 'UT', 'CA', 'MN', 'SD', 'NC', 'WA', 'NM', 'NV', 'DC',
       'KY', 'ME', 'MS', 'TN', 'PA', 'CT', 'ND'], dtype=object)

In [8]:
# Identifying the outcome/target variable:
# Churn = True means customer will churn. 
churn_target = churn['Churn?'] 
print(churn_target)

0       False.
1       False.
2       False.
3       False.
4       False.
         ...  
3328    False.
3329    False.
3330    False.
3331    False.
3332    False.
Name: Churn?, Length: 3333, dtype: object


In [9]:
# Phone number : unique number (might not influence prediction)
# Churn? : target variable (not required in feature set)
cols_to_drop = ['Phone', 'Churn?']
# axis=1 depicts drop along columns
churn_feature = churn.drop(cols_to_drop, axis=1)
print(churn_feature)

     State  Account Length  Area Code Int'l Plan VMail Plan  VMail Message  \
0       KS             128        415         no        yes             25   
1       OH             107        415         no        yes             26   
2       NJ             137        415         no         no              0   
3       OH              84        408        yes         no              0   
4       OK              75        415        yes         no              0   
...    ...             ...        ...        ...        ...            ...   
3328    AZ             192        415         no        yes             36   
3329    WV              68        415         no         no              0   
3330    RI              28        510         no         no              0   
3331    CT             184        510        yes         no              0   
3332    TN              74        415         no        yes             25   

      Day Mins  Day Calls  Day Charge  Eve Mins  Eve Calls  Eve

In [10]:
churn_categorical = churn.select_dtypes(include=[object])
print(churn_categorical)

     State     Phone Int'l Plan VMail Plan  Churn?
0       KS  382-4657         no        yes  False.
1       OH  371-7191         no        yes  False.
2       NJ  358-1921         no         no  False.
3       OH  375-9999        yes         no  False.
4       OK  330-6626        yes         no  False.
...    ...       ...        ...        ...     ...
3328    AZ  414-4276         no        yes  False.
3329    WV  370-3271         no         no  False.
3330    RI  328-8230         no         no  False.
3331    CT  364-6381        yes         no  False.
3332    TN  400-4344         no        yes  False.

[3333 rows x 5 columns]


In [11]:
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feature[yes_no_cols] = churn_feature[yes_no_cols] == 'yes'
print(churn_feature)

     State  Account Length  Area Code  Int'l Plan  VMail Plan  VMail Message  \
0       KS             128        415       False        True             25   
1       OH             107        415       False        True             26   
2       NJ             137        415       False       False              0   
3       OH              84        408        True       False              0   
4       OK              75        415        True       False              0   
...    ...             ...        ...         ...         ...            ...   
3328    AZ             192        415       False        True             36   
3329    WV              68        415       False       False              0   
3330    RI              28        510       False       False              0   
3331    CT             184        510        True       False              0   
3332    TN              74        415       False        True             25   

      Day Mins  Day Calls  Day Charge  

In [12]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
churn_feature['Area Code'] = label_encoder.fit_transform(churn_feature['Area Code'])
print(churn_feature)

     State  Account Length  Area Code  Int'l Plan  VMail Plan  VMail Message  \
0       KS             128          1       False        True             25   
1       OH             107          1       False        True             26   
2       NJ             137          1       False       False              0   
3       OH              84          0        True       False              0   
4       OK              75          1        True       False              0   
...    ...             ...        ...         ...         ...            ...   
3328    AZ             192          1       False        True             36   
3329    WV              68          1       False       False              0   
3330    RI              28          2       False       False              0   
3331    CT             184          2        True       False              0   
3332    TN              74          1       False        True             25   

      Day Mins  Day Calls  Day Charge  

In [13]:
print('Churn data size before one hot encoding', churn_feature.shape)
print('No of unique states', len(churn_feature['State'].unique()))
# Give the feature and columns to one hot encode in 'columns' and column rename prefix in 'prefix'
churn_dumm = pd.get_dummies(churn_feature, columns=["State"], prefix=["State"])
print('Churn data size after one hot encoding',churn_dumm.shape)
# converting to numpy matrix
churn_matrix = churn_dumm.values.astype(np.float)

Churn data size before one hot encoding (3333, 19)
No of unique states 51
Churn data size after one hot encoding (3333, 69)


In [14]:
from sklearn.impute import SimpleImputer
# Missing values replaced by mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean', fill_value=None, verbose=0, copy=True)
# Fit to data, then transform it
churn_matrix = imp.fit_transform(churn_matrix)

In [15]:
from sklearn.preprocessing import StandardScaler
# Standardize the data by removing the mean and scaling to unit variance
scaler = StandardScaler()
# Fit to data, then transform it.
churn_matrix = scaler.fit_transform(churn_matrix)

In [16]:
seed=7 # To generate same sequence of random numbers
from sklearn.model_selection import train_test_split
# Splitting the data for training and testing (90% train, 10% test)
train_data, test_data, train_label, test_label = train_test_split(churn_matrix, churn_target, test_size=.1, random_state=seed)

In [17]:
from sklearn.tree import DecisionTreeClassifier
# Initializing decision tree classifier
classifier = DecisionTreeClassifier(random_state=seed)
# Model training
classifier = classifier.fit(train_data, train_label)
# After being fitted, the model can then be used to predict the output.
churn_predicted_target = classifier.predict(test_data)
# Evaluating the classifier
score = classifier.score(test_data, test_label)
print('Decision Tree Classifier : ', score)

Decision Tree Classifier :  0.9101796407185628


In [18]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier = classifier.fit(train_data, train_label)
churn_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Naive Bayes : ', score)

Naive Bayes :  0.5748502994011976


In [19]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)

SGD classifier :  0.781437125748503


In [20]:
from sklearn.svm import SVC
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SVM Classifier : ', score)

SVM Classifier :  0.8473053892215568


In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10, random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target = classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Random Forest Classifier : ', score)

Random Forest Classifier :  0.8682634730538922


In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=5, n_estimators=15, max_features=60, random_state=seed)
classifier = classifier.fit(train_data, train_label)
score = classifier.score(test_data, test_label)
print('Random Forest classification after model tuning : ', score)

Random Forest classification after model tuning :  0.937125748502994


In [23]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=7)
sss.get_n_splits(churn_matrix,churn_target)
print(sss.get_n_splits(churn_matrix,churn_target))
print(sss)

1
StratifiedShuffleSplit(n_splits=1, random_state=7, test_size=0.1,
            train_size=None)


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
classifiers = [
    DecisionTreeClassifier(),
    GaussianNB(),
    SGDClassifier(loss='modified_huber', shuffle=True),
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(),
    OneVsRestClassifier(svm.LinearSVC()),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10),
    AdaBoostClassifier(),
   ]
for clf in classifiers:
    score=0
    for train_index, test_index in sss.split(churn_matrix, churn_target):
        X_train, X_test = churn_matrix[train_index], churn_matrix[test_index]
        y_train, y_test = churn_target[train_index], churn_target[test_index]
        clf.fit(X_train, y_train)
        score = score + clf.score(X_test, y_test)
    print(score)

0.9041916167664671
0.6137724550898204
0.7934131736526946
0.8562874251497006
0.8652694610778443




0.8652694610778443
0.8772455089820359
0.8832335329341318


In [25]:
from sklearn.metrics import accuracy_score
print('Accuracy Score', accuracy_score(test_label, churn_predicted_target))  

Accuracy Score 0.8682634730538922


In [26]:
from sklearn.metrics import confusion_matrix
print('Confusion Matrix', confusion_matrix(test_label, churn_predicted_target))

Confusion Matrix [[282   1]
 [ 43   8]]


In [27]:
from sklearn.metrics import classification_report
target_names = ['False', 'True']
print(classification_report(test_label, churn_predicted_target, target_names=target_names))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93       283
        True       0.89      0.16      0.27        51

    accuracy                           0.87       334
   macro avg       0.88      0.58      0.60       334
weighted avg       0.87      0.87      0.83       334



## END