In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline

In [3]:
# Load the Telco customer-churn CSV from the working directory into `df`.
df=pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Target balance: number of rows for each Churn value.
sns.countplot(x='Churn', data=df)

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

## Data Visualization

In [None]:
# Count of each gender within each Churn group.
df.groupby('Churn')['gender'].value_counts()

In [None]:
# Churn counts split by gender.
sns.countplot(
    data=df,
    x='Churn',
    hue='gender',
    palette="coolwarm_r",
)

The graph above shows that gender is not a factor in customer churn.

In [None]:
# Churn counts split by type of internet service.
sns.countplot(
    data=df,
    x='Churn',
    hue='InternetService',
)

From the above graph, we can see that customers with fiber-optic internet service have churned the most.

In [None]:
# Senior-citizen flag on the x axis, bars split by Churn.
sns.countplot(
    data=df,
    x='SeniorCitizen',
    hue='Churn',
)

In [None]:
# Partner status on the x axis, bars split by Churn.
sns.countplot(
    data=df,
    x='Partner',
    hue='Churn',
)

From the above graph, we can see that those who don't have partners have churned more than those who do.

In [None]:
# Online-backup status on the x axis, bars split by Churn.
sns.countplot(
    data=df,
    x='OnlineBackup',
    hue='Churn',
    palette='magma',
)

We see that those who didn't have online backup churned more than those that did.

In [None]:
# Tech-support status on the x axis, bars split by Churn.
sns.countplot(
    data=df,
    x='TechSupport',
    hue='Churn',
    palette='viridis',
)

From the above graph, we can see that those who don't have tech support have churned more than those who do.

In [None]:
# Re-check dtypes before looking at the numeric columns; `df` has not been
# modified since the previous df.info() call above.
df.info()

In [None]:
# Summary statistics for the tenure column.
df['tenure'].describe()

In [None]:
# Tenure distribution with side-by-side (dodged) bars for each Churn value.
ax = sns.histplot(
    data=df,
    x='tenure',
    hue='Churn',
    multiple='dodge',
)
ax.set(xlabel="Tenure in Months", ylabel="Count")

In [None]:
# Monthly-charge distribution with side-by-side (dodged) bars for each Churn value.
ax = sns.histplot(
    data=df,
    x='MonthlyCharges',
    hue='Churn',
    multiple='dodge',
)
ax.set(xlabel="Monthly Charges in Dollars", ylabel="Count")

In [None]:
# Preview only: the converted Series is displayed but NOT assigned back, so
# `df['TotalCharges']` is unchanged. With errors='coerce', values that cannot
# be parsed as numbers show up as NaN.
pd.to_numeric(df['TotalCharges'],errors='coerce')

In [None]:
# Rows whose TotalCharges is exactly a single-space string.
df.loc[df['TotalCharges'].eq(' ')]

In [None]:
# Replace blank TotalCharges entries with a placeholder so the column can be
# converted to a numeric dtype afterwards.
# NOTE(review): 20.2 is the value the original analysis used; confirm that it
# is the intended imputation rather than e.g. 0 or a column statistic.
TOTAL_CHARGES_FILL = 20.2

# Treat any empty or whitespace-only string as blank, not just a single space.
# astype(str) keeps the check safe to re-run once the column is already numeric.
blank_total = df['TotalCharges'].astype(str).str.strip().eq('')
df['TotalCharges'] = df['TotalCharges'].mask(blank_total, TOTAL_CHARGES_FILL)

In [None]:
# Inspect dtypes after filling the blank TotalCharges entries.
df.info()

In [None]:
# Convert TotalCharges to a numeric dtype in place. The default errors='raise'
# applies, so any value that still cannot be parsed makes this cell fail.
df['TotalCharges']=pd.to_numeric(df['TotalCharges'])

In [None]:
# Inspect dtypes after the numeric conversion of TotalCharges.
df.info()

In [None]:
# Working frame for modelling: every column of `df` except the first one.
# NOTE(review): the first column is presumably the customer identifier; confirm.
# .copy() makes df_copy an independent frame, so the column assignments done
# later neither raise SettingWithCopyWarning nor risk writing through to `df`.
df_copy = df.iloc[:, 1:].copy()

In [None]:
# Lookup table that turns Yes/No answers into the strings '1'/'0'.
conversion = dict(Yes='1', No='0')

In [None]:
# Encode the Yes/No columns using the `conversion` lookup table.
# Series.replace is used instead of Series.map so that:
#   * re-running the cell is harmless (values that were already converted are
#     left untouched instead of becoming NaN), and
#   * an unexpected value stays visible rather than silently turning into NaN.
binary_cols = ['Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    df_copy[col] = df_copy[col].replace(conversion)

In [None]:
# Cast each encoded Yes/No column to a numeric dtype.
for col in ('Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling'):
    df_copy[col] = pd.to_numeric(df_copy[col])

In [None]:
# Preview the working frame after the binary columns were encoded.
df_copy.head()

In [None]:
# Check the dtypes of the working frame after the numeric conversion.
df_copy.info()

In [None]:
# Level counts for every remaining categorical column, shown as ONE result.
# A notebook cell only displays its last expression, so listing eleven separate
# value_counts() calls would show the final one only. Concatenating them yields
# a single Series indexed by (column, level).
categorical_cols = [
    'Contract',
    'PaymentMethod',
    'InternetService',
    'MultipleLines',
    'OnlineBackup',
    'OnlineSecurity',
    'TechSupport',
    'StreamingMovies',
    'StreamingTV',
    'DeviceProtection',
    'gender',
]
pd.concat({col: df_copy[col].value_counts() for col in categorical_cols})

In [None]:
# Disabled: per-column dummy encoding. It is not executed; the notebook instead
# encodes the whole working frame with a single pd.get_dummies(df_copy, ...) call.
#gender=pd.get_dummies(df['gender'],prefix='sex',drop_first=True)
#contract=pd.get_dummies(df['Contract'],prefix='contract',drop_first=True)
#payment=pd.get_dummies(df['PaymentMethod'],prefix='payment',drop_first=True)
#internet=pd.get_dummies(df['InternetService'],prefix='internet',drop_first=True)
#mul_lines=pd.get_dummies(df['MultipleLines'],prefix='mul_lines',drop_first=True)
#onlineb=pd.get_dummies(df['OnlineBackup'],prefix='bkup',drop_first=True)
#onlines=pd.get_dummies(df['OnlineSecurity'],prefix='security',drop_first=True)
#techs=pd.get_dummies(df['TechSupport'],prefix='tech',drop_first=True)
#movies=pd.get_dummies(df['StreamingMovies'],prefix='movies',drop_first=True)
#tv=pd.get_dummies(df['StreamingTV'],prefix='tv',drop_first=True)
#protecc=pd.get_dummies(df['DeviceProtection'],prefix='protection',drop_first=True)

In [None]:
# Disabled: would have appended the manually built dummy frames to `df`.
#df=pd.concat([df,gender,contract,payment,internet,mul_lines,onlineb,onlines,techs,movies,tv,protecc],axis=1)

In [None]:
# Preview `df`; the encoding steps above were applied to df_copy, not to df.
df.head()

In [None]:
# One-hot encode the working frame; drop_first=True drops the first level of
# each encoded column.
df_copy = pd.get_dummies(data=df_copy, drop_first=True)
df_copy.head(n=5)

In [None]:
# Column names of the encoded frame, as a NumPy array.
df_copy.columns.values

In [None]:
# Correlation of every encoded column with Churn, strongest positive first.
churn_corr = df_copy.corr()['Churn'].sort_values(ascending=False)

# Draw the correlations on the 12x6 figure. Creating a figure without plotting
# anything on it would only render an empty canvas.
fig, ax = plt.subplots(figsize=(12, 6))
churn_corr.plot(kind='bar', ax=ax)
ax.set(title="Correlation with Churn", xlabel="Feature", ylabel="Correlation");

In [None]:
# Display the raw frame (pandas truncates long frames when rendering).
df

In [None]:
# Display the encoded working frame (pandas truncates long frames when rendering).
df_copy

In [None]:
# Number of rows in the raw frame.
len(df)

In [None]:
# Percentage of rows for each InternetService level, drawn as a bar chart.
internet_share = df['InternetService'].value_counts().mul(100).div(len(df))
internet_share.plot(kind='bar')

In [None]:
# Check the dtypes of the encoded frame before building the model inputs.
df_copy.info()

In [None]:
# Target vector and feature matrix: Churn is the label, all other columns are features.
y = df_copy['Churn']
X = df_copy.drop(columns='Churn')

In [None]:
# Display the feature matrix (pandas truncates long frames when rendering).
X

In [None]:
# Feature names, as a NumPy array.
X.columns.values

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range and rebuild a DataFrame that keeps
# the original column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling parameters; consider fitting on the
# training rows only.
features = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
X = pd.DataFrame(scaler.fit_transform(X), columns=features)

In [None]:
# Preview the scaled feature matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=41,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with scikit-learn's default settings.
logreg = LogisticRegression()

In [None]:
# Train the logistic-regression baseline on the training split.
logreg.fit(X_train,y_train)

In [None]:
# Score the baseline on the held-out rows.
prediction_logreg = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, prediction_logreg)
print('Accuracy Score LogReg:', logreg_accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the randomized hyper-parameter search.
# random_state pins the forest's own randomness (bootstrap samples and feature
# sub-sampling) so that repeated runs of the notebook give the same scores.
rf_c = RandomForestClassifier(random_state=42)

In [None]:
# Search space for the random-forest hyper-parameter search.
# 'auto' is not accepted for max_features by current scikit-learn releases and,
# for classifiers, only ever meant 'sqrt'; 'log2' is offered as a genuinely
# different second option instead.
param_grid = {
    'n_estimators': list(range(200, 1201, 100)),   # 200, 300, ..., 1200
    'max_features': ['sqrt', 'log2'],
    'max_depth': list(range(10, 101, 9)),          # 10, 19, ..., 100
    'min_samples_leaf': [1, 2, 3, 5],
    'min_samples_split': [2, 5, 10, 15],
}

In [None]:
# Randomized search over `param_grid` with 3-fold cross-validation; the seed
# fixes which parameter combinations are sampled.
random_cv = RandomizedSearchCV(
    estimator=rf_c,
    param_distributions=param_grid,
    cv=3,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search on the training split.
random_cv.fit(X_train,y_train)

In [None]:
# Take the best forest found by the search and score it on the held-out rows.
best_random = random_cv.best_estimator_
prediction_cv = best_random.predict(X_test)

print(best_random)
print(confusion_matrix(y_test, prediction_cv))
print('Accuracy Score RF:', accuracy_score(y_test, prediction_cv))

In [None]:
# Labelled confusion matrix for the tuned random forest on the test split.
rf_confusion = confusion_matrix(y_test, prediction_cv)
print('Confusion Matrix:', rf_confusion, sep='\n')

In [None]:
# Ten largest feature importances of the tuned forest, as a horizontal bar chart
# (ascending sort, so the largest bar is drawn last).
importances = best_random.feature_importances_
weights = pd.Series(data=importances, index=X.columns.values)
weights.sort_values().tail(10).plot.barh()

In [None]:
# Gradient-boosted classifier from xgboost, created with its default settings.
from xgboost import XGBClassifier
xgb_model = XGBClassifier()

In [None]:
# Train the XGBoost model on the training split.
xgb_model.fit(X_train, y_train)

In [None]:
# Score the XGBoost model on the held-out rows.
prediction_xgb = xgb_model.predict(X_test)
#print(confusion_matrix(y_test,prediction_xgb))
xgb_accuracy = accuracy_score(y_test, prediction_xgb)
print('Accuracy Score XGB:', xgb_accuracy)