In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns   
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score ,GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier ,RandomForestClassifier

In [2]:
data = pd.read_csv("Churn.csv")

In [3]:
data.duplicated().sum()

0

In [4]:
data.isna().sum()

Account_Length    0
Vmail_Message     0
Day_Mins          0
Eve_Mins          0
Night_Mins        0
Intl_Mins         0
CustServ_Calls    0
Churn             0
Intl_Plan         0
Vmail_Plan        0
Day_Calls         0
Day_Charge        0
Eve_Calls         0
Eve_Charge        0
Night_Calls       0
Night_Charge      0
Intl_Calls        0
Intl_Charge       0
State             0
Area_Code         0
Phone             0
dtype: int64

In [5]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Account_Length,3333.0,101.064806,39.822106,1.0,74.0,101.0,127.0,243.0
Vmail_Message,3333.0,8.09901,13.688365,0.0,0.0,0.0,20.0,51.0
Day_Mins,3333.0,179.775098,54.467389,0.0,143.7,179.4,216.4,350.8
Eve_Mins,3333.0,200.980348,50.713844,0.0,166.6,201.4,235.3,363.7
Night_Mins,3333.0,200.872037,50.573847,23.2,167.0,201.2,235.3,395.0
Intl_Mins,3333.0,10.237294,2.79184,0.0,8.5,10.3,12.1,20.0
CustServ_Calls,3333.0,1.562856,1.315491,0.0,1.0,1.0,2.0,9.0
Day_Calls,3333.0,100.435644,20.069084,0.0,87.0,101.0,114.0,165.0
Day_Charge,3333.0,30.562307,9.259435,0.0,24.43,30.5,36.79,59.64
Eve_Calls,3333.0,100.114311,19.922625,0.0,87.0,100.0,114.0,170.0


In [6]:
data["State"].unique()

array(['KS', 'OH', 'NJ', 'OK', 'AL', 'MA', 'MO', 'LA', 'WV', 'IN', 'RI',
       'IA', 'MT', 'NY', 'ID', 'VT', 'VA', 'TX', 'FL', 'CO', 'AZ', 'SC',
       'NE', 'WY', 'HI', 'IL', 'NH', 'GA', 'AK', 'MD', 'AR', 'WI', 'OR',
       'MI', 'DE', 'UT', 'CA', 'MN', 'SD', 'NC', 'WA', 'NM', 'NV', 'DC',
       'KY', 'ME', 'MS', 'TN', 'PA', 'CT', 'ND'], dtype=object)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Account_Length  3333 non-null   int64  
 1   Vmail_Message   3333 non-null   int64  
 2   Day_Mins        3333 non-null   float64
 3   Eve_Mins        3333 non-null   float64
 4   Night_Mins      3333 non-null   float64
 5   Intl_Mins       3333 non-null   float64
 6   CustServ_Calls  3333 non-null   int64  
 7   Churn           3333 non-null   object 
 8   Intl_Plan       3333 non-null   object 
 9   Vmail_Plan      3333 non-null   object 
 10  Day_Calls       3333 non-null   int64  
 11  Day_Charge      3333 non-null   float64
 12  Eve_Calls       3333 non-null   int64  
 13  Eve_Charge      3333 non-null   float64
 14  Night_Calls     3333 non-null   int64  
 15  Night_Charge    3333 non-null   float64
 16  Intl_Calls      3333 non-null   int64  
 17  Intl_Charge     3333 non-null   f

In [8]:
data.columns

Index(['Account_Length', 'Vmail_Message', 'Day_Mins', 'Eve_Mins', 'Night_Mins',
       'Intl_Mins', 'CustServ_Calls', 'Churn', 'Intl_Plan', 'Vmail_Plan',
       'Day_Calls', 'Day_Charge', 'Eve_Calls', 'Eve_Charge', 'Night_Calls',
       'Night_Charge', 'Intl_Calls', 'Intl_Charge', 'State', 'Area_Code',
       'Phone'],
      dtype='object')

In [9]:
minss =[ 'Day_Mins', 'Eve_Mins', 'Night_Mins',
       'Intl_Mins']

In [10]:
data[minss]

Unnamed: 0,Day_Mins,Eve_Mins,Night_Mins,Intl_Mins
0,265.1,197.4,244.7,10.0
1,161.6,195.5,254.4,13.7
2,243.4,121.2,162.6,12.2
3,299.4,61.9,196.9,6.6
4,166.7,148.3,186.9,10.1
...,...,...,...,...
3328,156.2,215.5,279.1,9.9
3329,231.1,153.4,191.3,9.6
3330,180.8,288.8,191.9,14.1
3331,213.8,159.6,139.2,5.0


In [11]:
data["total_mins"] = data[minss].sum(axis=1)

In [12]:
data["average_mins"]=data[minss].mean(axis=1)

In [13]:
calls=['Day_Calls','Eve_Calls', 'Night_Calls', 'Intl_Calls']

In [14]:
calls

['Day_Calls', 'Eve_Calls', 'Night_Calls', 'Intl_Calls']

In [15]:
data["average_calls"]=data[calls].mean(axis=1)

In [16]:
data["total_calls"]=data[calls].sum(axis=1)

In [17]:
charge = ['Day_Charge','Eve_Charge',
       'Night_Charge','Intl_Charge']

In [18]:
charge

['Day_Charge', 'Eve_Charge', 'Night_Charge', 'Intl_Charge']

In [19]:
data["mean_charge"]=data[charge].mean(axis=1)

In [20]:
data["total_charge"] = data[charge].sum(axis=1)  

In [21]:
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3323,3324,3325,3326,3327,3328,3329,3330,3331,3332
Account_Length,128,107,137,84,75,118,121,147,117,141,...,117,159,78,96,79,192,68,28,184,74
Vmail_Message,25,26,0,0,0,0,24,0,0,37,...,0,0,0,0,0,36,0,0,0,25
Day_Mins,265.1,161.6,243.4,299.4,166.7,223.4,218.2,157.0,184.5,258.6,...,118.4,169.8,193.4,106.6,134.7,156.2,231.1,180.8,213.8,234.4
Eve_Mins,197.4,195.5,121.2,61.9,148.3,220.6,348.5,103.1,351.6,222.0,...,249.3,197.7,116.9,284.8,189.7,215.5,153.4,288.8,159.6,265.9
Night_Mins,244.7,254.4,162.6,196.9,186.9,203.9,212.6,211.8,215.8,326.4,...,227.0,193.7,243.3,178.9,221.4,279.1,191.3,191.9,139.2,241.4
Intl_Mins,10.0,13.7,12.2,6.6,10.1,6.3,7.5,7.1,8.7,11.2,...,13.6,11.6,9.3,14.9,11.8,9.9,9.6,14.1,5.0,13.7
CustServ_Calls,1,1,0,2,3,0,3,0,1,0,...,5,1,2,1,2,2,3,2,2,0
Churn,no,no,no,no,no,no,no,no,no,no,...,yes,no,no,no,no,no,no,no,no,no
Intl_Plan,no,no,no,yes,yes,yes,no,yes,no,yes,...,no,no,no,no,no,no,no,no,yes,no
Vmail_Plan,yes,yes,no,no,no,no,yes,no,no,yes,...,no,no,no,no,no,yes,no,no,no,yes


In [22]:
data =data.drop("Phone",axis=1)

In [23]:
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3323,3324,3325,3326,3327,3328,3329,3330,3331,3332
Account_Length,128,107,137,84,75,118,121,147,117,141,...,117,159,78,96,79,192,68,28,184,74
Vmail_Message,25,26,0,0,0,0,24,0,0,37,...,0,0,0,0,0,36,0,0,0,25
Day_Mins,265.1,161.6,243.4,299.4,166.7,223.4,218.2,157.0,184.5,258.6,...,118.4,169.8,193.4,106.6,134.7,156.2,231.1,180.8,213.8,234.4
Eve_Mins,197.4,195.5,121.2,61.9,148.3,220.6,348.5,103.1,351.6,222.0,...,249.3,197.7,116.9,284.8,189.7,215.5,153.4,288.8,159.6,265.9
Night_Mins,244.7,254.4,162.6,196.9,186.9,203.9,212.6,211.8,215.8,326.4,...,227.0,193.7,243.3,178.9,221.4,279.1,191.3,191.9,139.2,241.4
Intl_Mins,10.0,13.7,12.2,6.6,10.1,6.3,7.5,7.1,8.7,11.2,...,13.6,11.6,9.3,14.9,11.8,9.9,9.6,14.1,5.0,13.7
CustServ_Calls,1,1,0,2,3,0,3,0,1,0,...,5,1,2,1,2,2,3,2,2,0
Churn,no,no,no,no,no,no,no,no,no,no,...,yes,no,no,no,no,no,no,no,no,no
Intl_Plan,no,no,no,yes,yes,yes,no,yes,no,yes,...,no,no,no,no,no,no,no,no,yes,no
Vmail_Plan,yes,yes,no,no,no,no,yes,no,no,yes,...,no,no,no,no,no,yes,no,no,no,yes


In [24]:
data["Area_Code"].value_counts()

415    1655
510     840
408     838
Name: Area_Code, dtype: int64

In [25]:
data["Churn"] = data["Churn"].map({"no":0 ,"yes":1})

In [26]:
data["Intl_Plan"] = data["Intl_Plan"].map({"no":0 ,"yes":1})

In [27]:
data["Vmail_Plan"] = data["Vmail_Plan"].map({"no":0 ,"yes":1})

In [28]:
data[["Churn","Intl_Plan","Vmail_Plan"]]

Unnamed: 0,Churn,Intl_Plan,Vmail_Plan
0,0,0,1
1,0,0,1
2,0,0,0
3,0,1,0
4,0,1,0
...,...,...,...
3328,0,0,1
3329,0,0,0
3330,0,0,0
3331,0,1,0


In [29]:
lc=LabelEncoder()

In [30]:
data.loc[:,"State"] = lc.fit_transform(data['State'])

  data.loc[:,"State"] = lc.fit_transform(data['State'])


In [31]:
data["State"].value_counts()

49    106
23     84
34     83
1      80
48     78
35     78
37     78
50     77
45     77
6      74
22     73
13     73
46     73
43     72
44     72
15     71
20     70
16     70
27     68
31     68
26     68
5      66
33     66
47     66
39     65
19     65
25     65
3      64
9      63
24     63
32     62
21     62
28     62
29     61
36     61
8      61
40     60
41     60
17     59
14     58
30     56
2      55
10     54
7      54
11     53
42     53
0      52
18     51
38     45
12     44
4      34
Name: State, dtype: int64

In [32]:
data.dtypes

Account_Length      int64
Vmail_Message       int64
Day_Mins          float64
Eve_Mins          float64
Night_Mins        float64
Intl_Mins         float64
CustServ_Calls      int64
Churn               int64
Intl_Plan           int64
Vmail_Plan          int64
Day_Calls           int64
Day_Charge        float64
Eve_Calls           int64
Eve_Charge        float64
Night_Calls         int64
Night_Charge      float64
Intl_Calls          int64
Intl_Charge       float64
State               int64
Area_Code           int64
total_mins        float64
average_mins      float64
average_calls     float64
total_calls         int64
mean_charge       float64
total_charge      float64
dtype: object

In [33]:
data.columns

Index(['Account_Length', 'Vmail_Message', 'Day_Mins', 'Eve_Mins', 'Night_Mins',
       'Intl_Mins', 'CustServ_Calls', 'Churn', 'Intl_Plan', 'Vmail_Plan',
       'Day_Calls', 'Day_Charge', 'Eve_Calls', 'Eve_Charge', 'Night_Calls',
       'Night_Charge', 'Intl_Calls', 'Intl_Charge', 'State', 'Area_Code',
       'total_mins', 'average_mins', 'average_calls', 'total_calls',
       'mean_charge', 'total_charge'],
      dtype='object')

In [34]:
x=data.drop(["Churn"],axis=1)

In [35]:
y= data["Churn"]

In [36]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=134)

In [37]:
ms=MinMaxScaler()

In [38]:
x_train.loc[:,:] = ms.fit_transform(x_train)
x_test.loc[:,:] = ms.fit_transform(x_test)

  x_train.loc[:,:] = ms.fit_transform(x_train)
  x_test.loc[:,:] = ms.fit_transform(x_test)


In [39]:
nb = GaussianNB()

In [40]:
nb.fit(x_train,y_train)
y_pred = nb.predict(x_test)

In [41]:
accuracy_score(y_test,y_pred)

0.8620689655172413

In [42]:
cv_results =cross_val_score(nb,x_train,y_train,cv=5)

In [43]:
cv_results.mean()

0.8480834229258456

In [44]:
lda = LinearDiscriminantAnalysis()

In [45]:
cv_results =cross_val_score(lda,x_train,y_train,cv=5)

In [46]:
cv_results.mean()

0.8525841291256475

In [47]:
lr= LogisticRegression()

In [48]:
cv_results =cross_val_score(lr,x_train,y_train,cv=5)

In [49]:
cv_results.mean()

0.8649619495330649

In [50]:
sv=SVC()

In [51]:
cv_results =cross_val_score(sv,x_train,y_train,cv=5)
cv_results.mean()

0.9099774437675233

In [52]:
dt = DecisionTreeClassifier()

In [53]:
cv_results = cross_val_score(dt, x, y, cv = 5, scoring="accuracy")
cv_results.mean()

0.9492956224590406

In [54]:
bg = BaggingClassifier(base_estimator=dt, n_estimators= 200)

In [55]:
cv_results = cross_val_score(bg, x, y, cv = 5, scoring="accuracy")
cv_results.mean()



0.9765986376181279

In [56]:
rf =RandomForestClassifier()

In [57]:
cv_results = cross_val_score(rf, x, y, cv = 5, scoring="accuracy")
cv_results.mean()

0.977499088293691

In [58]:
params_rf = {'n_estimators': [50,100,150],
             'max_depth': [8, 16,24],           
             'max_features': ['log2', 'sqrt']}

In [59]:
grid_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv = 5, scoring="accuracy")

In [60]:
grid_rf.fit(x_train, y_train)

In [61]:
grid_rf.best_params_

{'max_depth': 24, 'max_features': 'sqrt', 'n_estimators': 150}

In [62]:
grid_rf.best_score_

0.9748677192908488

In [63]:
best_model = grid_rf.best_estimator_

In [64]:
y_pred = best_model.predict(x_test)

In [65]:
accuracy_score(y_test,y_pred)

0.8620689655172413

In [66]:
cv_results = cross_val_score(best_model, x, y, cv = 5, scoring="accuracy")
cv_results.mean()

0.9768989379184282