In [1]:
import pandas as pd
import numpy as np
import string as str
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,mean_squared_error as MSE

import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('churn_clean.csv')
df = df.drop(['City','State','County','Customer_id','Interaction','UID','Job','Gender'],axis=1)

In [2]:
df.isna().sum()

CaseOrder               0
Zip                     0
Lat                     0
Lng                     0
Population              0
Area                    0
TimeZone                0
Children                0
Age                     0
Income                  0
Marital                 0
Churn                   0
Outage_sec_perweek      0
Email                   0
Contacts                0
Yearly_equip_failure    0
Techie                  0
Contract                0
Port_modem              0
Tablet                  0
InternetService         0
Phone                   0
Multiple                0
OnlineSecurity          0
OnlineBackup            0
DeviceProtection        0
TechSupport             0
StreamingTV             0
StreamingMovies         0
PaperlessBilling        0
PaymentMethod           0
Tenure                  0
MonthlyCharge           0
Bandwidth_GB_Year       0
Item1                   0
Item2                   0
Item3                   0
Item4                   0
Item5       

In [3]:
#strips leading and trailing spaces
string_list = list(df.select_dtypes(include = {'object'}))
for i in string_list:
    df[i] = df[i].str.strip()
    
# remove outliers where z score is < 3
col_num_names = list(df.select_dtypes(include = {'float64','int64'}))


for i in col_num_names:
    z = pd.DataFrame(np.abs(stats.zscore(df[i])))
    df_clean = df[(z<3).all(axis=1)]
    print(df_clean[i].name,"min is ", df_clean[i].min(),"max is ",df_clean[i].max(),"\n")

CaseOrder min is  1 max is  10000 

Zip min is  601 max is  99929 

Lat min is  24.58549 max is  53.87601 

Lng min is  -135.33619 max is  -65.66785 

Population min is  0 max is  52967 

Children min is  0 max is  8 

Age min is  18 max is  89 

Income min is  348.67 max is  124025.1 

Outage_sec_perweek min is  1.144796 max is  18.85173 

Email min is  3 max is  21 

Contacts min is  0 max is  3 

Yearly_equip_failure min is  0 max is  2 

Tenure min is  1.00025934 max is  71.99928 

MonthlyCharge min is  79.97886 max is  290.160419 

Bandwidth_GB_Year min is  155.5067148 max is  7158.98153 

Item1 min is  1 max is  6 

Item2 min is  1 max is  6 

Item3 min is  1 max is  6 

Item4 min is  1 max is  6 

Item5 min is  1 max is  6 

Item6 min is  1 max is  6 

Item7 min is  1 max is  6 

Item8 min is  1 max is  6 



In [4]:
#population cannot be zero.
count = (df_clean['Population']==0).sum()
print(count)
#less than 1 percent of the data so we can remove the zeros from population
df_clean.drop(df_clean[df_clean['Population'] == 0].index, inplace = True)


new_df = pd.get_dummies(df_clean)
new_df.info()

97
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9888 entries, 0 to 9999
Data columns (total 92 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   CaseOrder                                9888 non-null   int64  
 1   Zip                                      9888 non-null   int64  
 2   Lat                                      9888 non-null   float64
 3   Lng                                      9888 non-null   float64
 4   Population                               9888 non-null   int64  
 5   Children                                 9888 non-null   int64  
 6   Age                                      9888 non-null   int64  
 7   Income                                   9888 non-null   float64
 8   Outage_sec_perweek                       9888 non-null   float64
 9   Email                                    9888 non-null   int64  
 10  Contacts                                 9888

In [5]:
new_df.to_csv("clean_d209_2.csv")

In [6]:
X = new_df.drop(["Churn_Yes"],1)
y = new_df["Churn_Yes"]

feature_names = X.columns

skbest = SelectKBest(score_func = f_classif, k='all')
X_new = skbest.fit_transform(X, y)

p_values = pd.DataFrame({'Feature': X.columns, 'p_value':skbest.pvalues_}).sort_values('p_value')
p_values[p_values['p_value'] < .05]

Unnamed: 0,Feature,p_value
0,CaseOrder,0.0
56,Churn_No,0.0
14,Bandwidth_GB_Year,0.0
12,Tenure,0.0
13,MonthlyCharge,5.4e-322
84,StreamingMovies_Yes,5.139599e-189
83,StreamingMovies_No,5.139599e-189
59,Contract_Month-to-month,8.886073e-162
82,StreamingTV_Yes,1.190927e-118
81,StreamingTV_No,1.190927e-118


In [7]:
features_to_keep = p_values['Feature'][p_values['p_value'] < .05]
# Print the name of the selected features
features_to_keep.tolist()

['CaseOrder',
 'Churn_No',
 'Bandwidth_GB_Year',
 'Tenure',
 'MonthlyCharge',
 'StreamingMovies_Yes',
 'StreamingMovies_No',
 'Contract_Month-to-month',
 'StreamingTV_Yes',
 'StreamingTV_No',
 'Contract_Two Year',
 'Contract_One year',
 'Multiple_No',
 'Multiple_Yes',
 'InternetService_DSL',
 'Techie_No',
 'Techie_Yes',
 'InternetService_Fiber Optic',
 'DeviceProtection_Yes',
 'DeviceProtection_No',
 'OnlineBackup_Yes',
 'OnlineBackup_No',
 'InternetService_None',
 'PaymentMethod_Electronic Check',
 'Phone_No',
 'Phone_Yes']

In [8]:
new_df = pd.DataFrame().assign(
Bandwidth_GB_Year=new_df['Bandwidth_GB_Year'],
 MonthlyCharge=new_df['MonthlyCharge'],
 Tenure=new_df[ 'Tenure'],
 StreamingMovies_Yes=new_df[ 'StreamingMovies_Yes'],
 Contract_Month_to_month=new_df['Contract_Month-to-month'],
 StreamingTV_Yes=new_df[ 'StreamingTV_Yes'],
 Contract_One_year=new_df[ 'Contract_One year'],
 Multiple_Yes=new_df[ 'Multiple_Yes'],
 InternetService_DSL=new_df[ 'InternetService_DSL'],
 Techie_Yes=new_df[ 'Techie_Yes'],
 InternetService_Fiber_Optic=new_df[ 'InternetService_Fiber Optic'],
 DeviceProtection_Yes=new_df[ 'DeviceProtection_Yes'],
 OnlineBackup_Yes=new_df[ 'OnlineBackup_No'],
 PaymentMethod_Electronic_Check=new_df[ 'PaymentMethod_Electronic Check'],
 Phone_Yes=new_df[ 'Phone_No'])

In [9]:
vif_data = pd.DataFrame()
vif_data["feature"] = new_df.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(new_df.values, i)
                          for i in range(len(new_df.columns))]
print(vif_data)

                           feature          VIF
0                Bandwidth_GB_Year  1385.326273
1                    MonthlyCharge    52.393213
2                           Tenure  1080.364940
3              StreamingMovies_Yes     5.007248
4          Contract_Month_to_month     3.134521
5                  StreamingTV_Yes     4.682157
6                Contract_One_year     1.829678
7                     Multiple_Yes     2.951217
8              InternetService_DSL     7.418001
9                       Techie_Yes     1.202585
10     InternetService_Fiber_Optic     4.909326
11            DeviceProtection_Yes     2.171166
12                OnlineBackup_Yes     2.185680
13  PaymentMethod_Electronic_Check     1.508921
14                       Phone_Yes     1.104523


In [10]:
# remove variables with multicolinearity > 5
new_df = new_df.drop(['Bandwidth_GB_Year','MonthlyCharge','StreamingMovies_Yes','InternetService_DSL'],axis = 1)
vif_data = pd.DataFrame()
vif_data["feature"] = new_df.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(new_df.values, i)
                          for i in range(len(new_df.columns))]
print(vif_data)

                           feature       VIF
0                           Tenure  2.249796
1          Contract_Month_to_month  2.402912
2                  StreamingTV_Yes  1.810730
3                Contract_One_year  1.552797
4                     Multiple_Yes  1.712093
5                       Techie_Yes  1.183183
6      InternetService_Fiber_Optic  1.662932
7             DeviceProtection_Yes  1.660620
8                 OnlineBackup_Yes  1.965003
9   PaymentMethod_Electronic_Check  1.437789
10                       Phone_Yes  1.096674


In [11]:
X = new_df
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9888 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Tenure                          9888 non-null   float64
 1   Contract_Month_to_month         9888 non-null   uint8  
 2   StreamingTV_Yes                 9888 non-null   uint8  
 3   Contract_One_year               9888 non-null   uint8  
 4   Multiple_Yes                    9888 non-null   uint8  
 5   Techie_Yes                      9888 non-null   uint8  
 6   InternetService_Fiber_Optic     9888 non-null   uint8  
 7   DeviceProtection_Yes            9888 non-null   uint8  
 8   OnlineBackup_Yes                9888 non-null   uint8  
 9   PaymentMethod_Electronic_Check  9888 non-null   uint8  
 10  Phone_Yes                       9888 non-null   uint8  
dtypes: float64(1), uint8(10)
memory usage: 251.1 KB


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
train_size = 0.8, test_size=0.2, random_state=15, stratify=y)

In [13]:
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

In [14]:
param_dist = {'n_estimators': [10,50,100],
              'max_features':[2,3,4],
              'max_depth': [8,None]}

rfc = RandomForestClassifier()

rand_search = RandomizedSearchCV(rfc, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5)

rand_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5,
                   param_distributions={'max_depth': [8, None],
                                        'max_features': [2, 3, 4],
                                        'n_estimators': [10, 50, 100]})

In [15]:
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'n_estimators': 50, 'max_features': 4, 'max_depth': 8}


In [16]:
best_rf = rand_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8367037411526794


In [17]:
print("Mean square error: ",MSE(y_test,y_pred))

Mean square error:  0.16329625884732052
