In [1]:
import pandas as pd
import numpy as np

In [2]:
churn_data=pd.read_csv('telco.csv')

In [3]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [4]:
churn_data=churn_data.loc[:,'gender':'Churn']

In [5]:
churn_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
churn_data['TotalCharges']=churn_data['tenure']*churn_data['MonthlyCharges']

In [7]:
churn_data['TotalCharges']=churn_data['TotalCharges'].infer_objects()

In [8]:
churn_data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

#### Label Encoding

In [9]:
#creating a dataframe containing categorical attributes only
categorical_df=churn_data.loc[:,'gender':'TotalCharges'].select_dtypes(object)
#fetching the attributes:
categorical_attributes=categorical_df.columns.tolist()
#fetching the index of each categorical attribute:
categorical_indices=[]
for attribute in categorical_attributes:
    categorical_indices.append(churn_data.columns.get_loc(attribute))
#label encoding the indices:
from sklearn.preprocessing import LabelEncoder
encoder_object=LabelEncoder()
for attribute in categorical_attributes:
    churn_data.loc[:,attribute]=encoder_object.fit_transform(churn_data.loc[:,attribute])

In [10]:
categorical_indices

[0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [11]:
churn_data.loc[:,'Churn']=encoder_object.fit_transform(churn_data.loc[:,'Churn'])

#### One Hot Encoding:

In [12]:
from sklearn.preprocessing import OneHotEncoder
encoder_object2=OneHotEncoder(categorical_features=categorical_indices)
churn_data=encoder_object2.fit_transform(churn_data).toarray()
churn_data=pd.DataFrame(data=churn_data)
splits_df=pd.DataFrame(data={'index':categorical_indices,'attributes':categorical_attributes,'splits':encoder_object2.n_values_})
splits_df

Unnamed: 0,attributes,index,splits
0,gender,0,2
1,Partner,2,2
2,Dependents,3,2
3,PhoneService,5,2
4,MultipleLines,6,3
5,InternetService,7,3
6,OnlineSecurity,8,3
7,OnlineBackup,9,3
8,DeviceProtection,10,3
9,TechSupport,11,3


#### Creating the training set and the testing set:

In [13]:
churn_data[45].value_counts()

0.0    5174
1.0    1869
Name: 45, dtype: int64

In [14]:
negative_df=churn_data[churn_data[45]==0].sample(n=1000,replace=False)
positive_df=churn_data[churn_data[45]==1].sample(n=1000,replace=False)
training_data=pd.concat([negative_df,positive_df])
training_data=training_data.reindex(np.random.permutation(training_data.index))
X_train=training_data.iloc[:,0:45].values
Y_train=training_data.iloc[:,45].values

In [15]:
churn_data=churn_data.drop(training_data.index)
X_test=churn_data.iloc[:,0:45].values
Y_test=churn_data.iloc[:,45].values

#### Standardizing the data :

In [16]:
from sklearn.preprocessing import StandardScaler
standardizer=StandardScaler()
X_train=standardizer.fit_transform(X_train)
X_test=standardizer.transform(X_test)

#### Applying PCA:

In [17]:
from sklearn.decomposition import PCA
pca_obj=PCA(0.95)
X_train=pca_obj.fit_transform(X_train)
X_test=pca_obj.transform(X_test)
print('num_components:',pca_obj.n_components_)

num_components: 18


In [18]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RFC
svm_clf=RFC()
svm_clf.fit(X_train,Y_train)
Y_pred=svm_clf.predict(X_test)
print(accuracy_score(Y_test,Y_pred))
confusion_matrix(Y_test,Y_pred)

0.7326987904025382


array([[3109, 1065],
       [ 283,  586]], dtype=int64)

In [19]:
def neural_network (node_count,layer_count,optimizer):
    import keras
    from keras.models import Sequential
    from keras.layers import Dense
    classifier=Sequential()
    input_layer=Dense(input_dim=18,units=node_count,kernel_initializer='uniform',activation='relu')
    classifier.add(input_layer)
    for count in range(layer_count):
        hidden_layer=Dense(units=node_count,kernel_initializer='uniform',activation='relu')
        classifier.add(hidden_layer)
    output_layer=Dense(units=1,kernel_initializer='uniform',activation='sigmoid')
    classifier.add(output_layer)
    classifier.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return classifier

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
neural_clf=KerasClassifier(build_fn=neural_network)
hyperparams={'node_count':[3,5,7,9,11],
            'layer_count':[2,3,4,5,6],
            'optimizer':['adam','rmsprop']}
from sklearn.model_selection import GridSearchCV
grid_object=GridSearchCV(estimator=neural_clf,param_grid=hyperparams,cv=5,scoring='accuracy')
grid_object.fit(X_train,Y_train)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
  32/1600 [..............................] - ETA: 2:36 - loss: 0.6931 - acc: 0.6250