In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Load the Telco customer-churn dataset from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Report the dimensions and show a short preview rather than printing the whole frame.
print(df.shape)
df.head()

**Removing columns that are not needed for our prediction**

In [None]:
# DataFrame.drop returns a new frame, so the result has to be assigned back to df.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
df=df.drop(columns=["customerID"],errors="ignore")
df.head()

**Checking for missing values**

In [None]:
# Count the null entries in each column and print the per-column totals.
missing_per_column=df.isna().sum()
print(missing_per_column)

In [None]:
# Print the column names, non-null counts and dtypes of df.
df.info()

**We can clearly see that TotalCharges has the object dtype, so we need to convert it to float.**

In [None]:
# Entries matching the single-space pattern are swapped for a 0 placeholder first,
# so that the whole column can then be cast to float.
blank_as_zero=df["TotalCharges"].replace(r' ',0,regex=True)
df["TotalCharges"]=blank_as_zero.astype(float)

**The blank values in TotalCharges have now been converted to 0.
Next we replace those zeros with the mean of the column to keep it consistent.**

In [None]:
# Frequency of each distinct TotalCharges value (the 0 placeholder shows up here).
total_charges_counts=df["TotalCharges"].value_counts()
total_charges_counts

In [None]:
# Replace the 0 placeholders in TotalCharges with the column mean.
# The mean is taken over the non-zero entries only, so the placeholders
# themselves do not pull the imputed value down.
total_charges=df["TotalCharges"]
is_placeholder=total_charges==0
observed_mean=total_charges[~is_placeholder].mean()
df["TotalCharges"]=total_charges.mask(is_placeholder,observed_mean)
print(df["TotalCharges"])

**As there are no missing values left,
 let's quickly jump into visualization.**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

In [None]:
# Number of customers for every (tenure, Churn) combination.
tenure_churn_counts=(
    df.groupby(["tenure","Churn"])
    .size()
    .reset_index(name="count")
)

# Histogram of customer counts over tenure, coloured by churn outcome.
# The title describes what is plotted (tenure vs. churn).
fig=px.histogram(
    tenure_churn_counts,
    x="tenure",
    y="count",
    color="Churn",
    marginal="rug",
    color_discrete_map={"Yes":"#E45756","No":"#1CBE4F"},
    title="Customer count by tenure and churn",
)
fig.show()

In [None]:
# Customer counts for every Churn / PhoneService / InternetService combination.
service_counts=(
    df.groupby(["Churn","PhoneService","InternetService"])
    .size()
    .reset_index(name="count")
)

# Sunburst whose hierarchy follows the order of the `path` columns.
fig=px.sunburst(
    service_counts,
    path=["Churn","PhoneService","InternetService"],
    values="count",
    title="are we having issue with phone",
)

fig.show()

*How many customers prefer paperless billing*

In [None]:

# Count of customers per PaperlessBilling value, split by churn outcome,
# drawn on the axes created just above.
fig,ax=plt.subplots(figsize=(12,8))
ax=sns.countplot(data=df,x="PaperlessBilling",hue="Churn",ax=ax)

**Which internet service customers are most likely to cancel our subscription?**

In [None]:
# Count of customers per InternetService value, split by churn outcome,
# drawn on the axes created just above.
fig,ax=plt.subplots(figsize=(12,8))
ax=sns.countplot(data=df,x="InternetService",hue="Churn",ax=ax)

In [None]:
import seaborn as sns

# The bundled seaborn style sheet was renamed to "seaborn-v0_8" in newer matplotlib
# releases; pick whichever name this installation provides.
seaborn_style="seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Bar chart of how many customers use each payment method.
color=plt.cm.ocean(np.linspace(0,2,20))
df["PaymentMethod"].value_counts().plot.bar(color=color,figsize=(12,8))
plt.title("Payments preferred",fontsize=20)
plt.xticks(rotation=0)
plt.show()

In [None]:
import seaborn as sns

# The bundled seaborn style sheet was renamed to "seaborn-v0_8" in newer matplotlib
# releases; pick whichever name this installation provides.
seaborn_style="seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Bar chart of how many customers hold each contract type.
color=plt.cm.ocean(np.linspace(0,2,20))
df["Contract"].value_counts().plot.bar(color=color,figsize=(12,8))
plt.title("Type of contract most of the customer's preffered",fontsize=20)
plt.xticks(rotation=0)
plt.show()

**Based on a chi-squared test, we keep only the columns we want as features.**

In [None]:
# Feature columns kept for modelling.
feature_columns=["tenure","Contract","OnlineSecurity","TechSupport","OnlineBackup","DeviceProtection","SeniorCitizen","Dependents","PaperlessBilling","Partner","TotalCharges","PaymentMethod","MonthlyCharges"]

# Take an explicit copy: X is modified in later cells, and assigning into a
# slice of df would raise SettingWithCopyWarning.
X=df[feature_columns].copy()
X

In [None]:
# Target column: the churn label of every customer.
Y=df.loc[:,"Churn"]
Y

**Now our X and Y are ready, but we still need to encode and scale the features so that our algorithms can work with them.**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; later cells call le.fit_transform on the categorical
# feature columns and on the Churn column, refitting it each time.
le=LabelEncoder()

**Our independent features are a mixture of continuous and categorical values.
 First we need to encode the categorical columns, and then scale the continuous ones.**

In [None]:
# Categorical feature columns to be converted to integer codes.
categorical_columns=["Contract","OnlineSecurity","TechSupport","OnlineBackup","DeviceProtection","Dependents","PaperlessBilling","Partner","PaymentMethod"]

# Work on an independent copy so the assignment below never writes into a slice
# of another frame (avoids SettingWithCopyWarning).
X=X.copy()

# apply() hands each column to le.fit_transform in turn, so the encoder is refit per column.
X[categorical_columns]=X[categorical_columns].apply(le.fit_transform)


In [None]:
# Display X to inspect the feature columns after label encoding.
X

**The categorical columns are now encoded. Now, let's scale the continuous values.**

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used in the next cell on the tenure, TotalCharges and MonthlyCharges columns.
SC=StandardScaler()

In [None]:
# Standardise the three continuous columns in place.
# NOTE(review): the scaler is fit on every row here, before the train/test split
# done in a later cell, so test rows contribute to the fitted statistics —
# consider fitting on the training rows only.
numeric_columns=["tenure","TotalCharges","MonthlyCharges"]
scaled_values=SC.fit_transform(X[numeric_columns])
X[numeric_columns]=scaled_values

In [None]:
# Convert the Churn labels to integer codes; `le` is refit on the Churn values here.
df["Churn"]=le.fit_transform(df["Churn"])
Y=df["Churn"]
Y

In [None]:
# Convert features and target to NumPy arrays for scikit-learn.
# np.asarray returns an existing array unchanged, so this cell can be re-run after
# X has already been converted (an ndarray has no .iloc).
X=np.asarray(X)
# Select the target by name instead of relying on it being the last column of df.
Y=df["Churn"].to_numpy()

**Now we are done with all our preprocessing steps. Let's split the data for training and testing.**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

**Now our training and testing sets are ready. Let's get into our model building.**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix,accuracy_score

# AdaBoost with 50 estimators; the fixed seed makes the fitted ensemble reproducible.
ada=AdaBoostClassifier(n_estimators=50,random_state=100)
ada.fit(X_train,y_train)
pred=ada.predict(X_test)

# Confusion matrix on the held-out test set.
con_mat=confusion_matrix(y_test,pred)
print("confusion-matrix \n",con_mat)

# Report the hold-out accuracy alongside the confusion matrix.
acc_score=accuracy_score(y_test,pred)
print("accuracy:",acc_score)

**Now let's apply cross-validation to get the average accuracy over 10 folds**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the AdaBoost model; report the mean fold score.
score=cross_val_score(ada,X,Y,cv=10)
mean=score.mean()
print("cross-validation score:",mean)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fit on the training split.
log=LogisticRegression()
log.fit(X_train,y_train)
pred=log.predict(X_test)

# Confusion matrix on the held-out test set.
con_mat=confusion_matrix(y_test,pred)
print("confusion-matrix \n",con_mat)

# Report the hold-out accuracy alongside the confusion matrix.
acc_score=accuracy_score(y_test,pred)
print("accuracy:",acc_score)


In [None]:
# 10-fold cross-validation of the logistic regression model; report the mean fold score.
score=cross_val_score(log,X,Y,cv=10)
mean=np.mean(score)
print("cross-validation score:",mean)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings; the fixed seed makes the result
# reproducible from run to run.
RFC=RandomForestClassifier(random_state=100)
RFC.fit(X_train,y_train)
pred=RFC.predict(X_test)

# Confusion matrix on the held-out test set.
con_mat=confusion_matrix(y_test,pred)
print("confusion-matrix \n",con_mat)

# Report the hold-out accuracy alongside the confusion matrix.
acc_score=accuracy_score(y_test,pred)
print("accuracy:",acc_score)


In [None]:
# 10-fold cross-validation of the random forest model; report the mean fold score.
score=cross_val_score(RFC,X,Y,cv=10)
mean=score.mean()
print("cross-validation score:",mean)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings; the fixed seed makes the result
# reproducible from run to run.
gb = GradientBoostingClassifier(random_state=100)
gb.fit(X_train,y_train)
pred=gb.predict(X_test)

# Confusion matrix on the held-out test set.
con_mat=confusion_matrix(y_test,pred)
print("confusion-matrix \n",con_mat)

# Report the hold-out accuracy alongside the confusion matrix.
acc_score=accuracy_score(y_test,pred)
print("accuracy:",acc_score)

In [None]:
# 10-fold cross-validation of the gradient boosting model; report the mean fold score.
score=cross_val_score(gb,X,Y,cv=10)
mean=np.mean(score)
print("cross-validation score:",mean)

In [None]:
from sklearn.ensemble import VotingClassifier

# Base estimators for the ensemble; the two boosted models get a fixed seed so the
# combined result is reproducible from run to run.
clf1 = GradientBoostingClassifier(random_state=100)
clf2 = LogisticRegression()
clf3 = AdaBoostClassifier(random_state=100)

# Soft-voting ensemble over the three base estimators.
eclf1 = VotingClassifier(estimators=[('Gradient', clf1), ('Logistic', clf2), ('AdaBoost', clf3)], voting='soft')
eclf1.fit(X_train, y_train)
pred = eclf1.predict(X_test)

# Confusion matrix on the held-out test set.
con_mat=confusion_matrix(y_test,pred)
print("confusion-matrix \n",con_mat)

# Report the hold-out accuracy alongside the confusion matrix.
acc_score=accuracy_score(y_test,pred)
print("accuracy:",acc_score)

In [None]:
# 10-fold cross-validation of the voting ensemble; report the mean fold score.
score=cross_val_score(eclf1,X,Y,cv=10)
mean=score.mean()
print("cross-validation score:",mean)

**Models             Accuracy**

**AdaBoostClassifier:  80.13%**

**logisticregression:  80.02%**

**RandomForest      :  79.48%**

**GradientBoosting  :  80.37%**

**VotingClassifier  :  80.70%**