In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Telco customer-churn CSV; the relative path assumes the Kaggle "input" directory layout.
data=pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Preview the first 10 rows.
data.head(10)

In [None]:
print("Total rows and Columns in dataSet:", data.shape)

In [None]:
data.nunique()

In [None]:
data.isnull().sum()

In [None]:
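# isnull() reports no missing values in any column. Note that blank strings are not counted as null, so TotalCharges is re-checked when it is converted to numeric later on.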

In [None]:
data.dtypes

In [None]:
# Only SeniorCitizen, tenure and MonthlyCharges are numeric. TotalCharges is stored as object (string) and has to be converted before it can be analysed as a number.

In [None]:
data.describe()

In [None]:
# The count equals the number of rows, so none of these numeric columns has missing values.
# There are few outliers. The spread (variance) is high for tenure and MonthlyCharges.

In [None]:
#Univariate Analysis

In [None]:
# Separate categorical and continuous variables, leaving out customerID because
# all of its values are unique.
# A column counts as categorical when it has fewer than 10 distinct values;
# every other column (except customerID) is treated as continuous. The earlier
# `<10` / `>10` pair left a gap: a column with exactly 10 distinct values ended
# up in neither list.
cat=[]
cont=[]
for i in data.columns:
    n_unique=data[i].nunique()  # computed once per column instead of twice
    if n_unique<10:
        cat.append(i)
    elif i!="customerID":
        cont.append(i)

In [None]:
cat

In [None]:
cont

In [None]:
# Categorical variables are subjected to go through Univariate analysis

In [None]:
def plotvariable(var):
    """Draw a donut chart of the value shares of column ``var`` of ``data``.

    The chart is a pie of ``value_counts()`` annotated with one-decimal
    percentages; a white disc of radius 0.7 is drawn over the centre to
    turn the pie into a ring.
    """
    fig,ax=plt.subplots(figsize=(15,8))
    shares=data[var].value_counts()
    shares.plot.pie(autopct='%1.1f%%',ax=ax)
    hole=plt.Circle((0,0),0.7,fc='white')
    ax.add_artist(hole)

In [None]:
plotvariable('Churn')
# So according to data set 73.5 % of customers do not churn.

In [None]:
plotvariable('gender')
# Gender is balanced

In [None]:
plotvariable('Dependents') 

In [None]:
plotvariable('PhoneService') # Very Less customers are without Phone service 

In [None]:
plotvariable('Partner') # Customers with partners and without partners are almost same

In [None]:
plotvariable('MultipleLines') # Among 90.3 % ,42.2% use MultipleLines

In [None]:
plotvariable('InternetService') 
#Most customers with internet service use Fiber optic rather than DSL, while 21.7% do not use any internet service

In [None]:
plotvariable('OnlineSecurity') # Max customers dont use Online Security.

In [None]:
plotvariable("OnlineBackup") #Max customers dont use Online backup.

In [None]:
plotvariable("DeviceProtection") 
# majority of the customers who uses internet do not have Device Protection

In [None]:
plotvariable('TechSupport')
#majority of the customers who uses internet do not have TechSupport

In [None]:
plotvariable('StreamingTV')  # there is balance in the users who Watch Streaming Tv and who dont

In [None]:
plotvariable('StreamingMovies')
#there is balance in the users who Watch Streaming Movies and who do not.

In [None]:
plotvariable('Contract')
#Month-to-month contracts are the most common, followed by two-year contracts; one-year contracts are used by the fewest customers

In [None]:
plotvariable('PaperlessBilling')
#Paperless billing is preferred by the customers

In [None]:
plotvariable('PaymentMethod')
#Most customers use electronic check for making payments, 
#while rest of the methods are used by almost equal no. of customers.

In [None]:
#For continuous variables a histogram is a good visualisation

In [None]:
def plotcontvariable(var):
    """Plot a blue histogram of column ``var`` of ``data`` and print its range.

    The minimum and maximum of the column are printed on two separate lines,
    labelled ``Minimum`` and ``Maximum``.
    """
    column=data[var]
    _,ax=plt.subplots(figsize=(8,6))
    sns.histplot(column,color='b',ax=ax)
    for label,value in (('Minimum',column.min()),('Maximum',column.max())):
        print(label,value)

In [None]:
plotcontvariable('tenure') 
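# Tenure is concentrated at the two extremes: most customers sit at roughly 0-2 or at around 70 (tenure is presumably measured in months rather than years - verify against the dataset description)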

In [None]:
plotcontvariable('MonthlyCharges')  
#Monthly charges almost follow a normal distribution, with the largest group of customers paying monthly charges of 19 to 25.

In [None]:
# TotalCharges is read as object dtype, so it has to be converted to float.
# pd.to_numeric(errors='coerce') converts the whole column at once and turns any
# value that is not a number (such as the blank ' ' entries the previous lambda
# special-cased) into NaN. The lambda also referenced np.NaN, an alias that was
# removed in NumPy 2.0, so it failed as soon as a blank entry was hit.
data['TotalCharges']=pd.to_numeric(data['TotalCharges'],errors='coerce')

In [None]:
plotcontvariable('TotalCharges')  # majority of customers lies between 0 to 500 range 

In [None]:
# Bivariate Analysis 

In [None]:
def plotboxenplot(var):
    """Draw a boxen plot of column ``var`` of ``data``, split by the ``Churn`` column."""
    _,ax=plt.subplots(figsize=(8,6))
    sns.boxenplot(data=data,x='Churn',y=var,ax=ax)

In [None]:
plotboxenplot('TotalCharges') # Higher paying customers do not churn 

In [None]:
plotboxenplot('MonthlyCharges') # Customers with more monthly charges seems churn away

In [None]:
plotboxenplot('tenure') # Longer tenure customer doesnot seem churn Away

In [None]:
def plotcountplot(var):
    """Draw a count plot of column ``var`` of ``data``, with bars split by ``Churn``.

    Columns are passed by name through ``x=``/``hue=``/``data=``. Current seaborn
    releases only accept ``data`` as a positional argument, so passing the
    plotted Series positionally (as before) no longer draws the intended chart.
    """
    plt.figure(figsize=(8,6))
    sns.countplot(x=var,hue='Churn',data=data)

In [None]:
plotcountplot('gender') # same no. of female and male churn 

In [None]:
plotcountplot('SeniorCitizen') # Senior citizens churn more easily

In [None]:
 plotcountplot('Partner') # Customers who do not have partners have higher chances of churning. 

In [None]:
 plotcountplot('Dependents') 
#Customers with dependents have a lower rate of churning than the customers who do not have dependents.

In [None]:
  plotcountplot('PhoneService') #people using Phone service are less churning.

In [None]:
plotcountplot('MultipleLines') #People using multiple lines have lower ratio of churning 

In [None]:
plotcountplot('InternetService')  
# Customers using fiber optic internet service are the highest no. of people churning away.

In [None]:
plotcountplot('OnlineSecurity') #Customers who have not subscribed to online security churn the most.

In [None]:
plotcountplot('OnlineBackup') #Customers who have not subscribed to online backup churn the most (the note previously said "online security", presumably a copy-paste slip - verify against the plot).

In [None]:
plotcountplot('DeviceProtection') 
#Customers who do not opt for device protection have highest no. of churning away

In [None]:
plotcountplot('TechSupport')  #Customers who do not opt for tech support have highest no. of churning away.

In [None]:
plotcountplot('StreamingTV') 
#Customers who have subscribed for streaming tv have slightly lower ratio of churning 

In [None]:
plotcountplot('StreamingMovies')  
#Customers who have subscribed for streaming Movies have slightly lower ratio of churning

In [None]:
plotcountplot('Contract')
#People having shorter contract churn higher than the people who have contract for a longer time

In [None]:
plotcountplot('PaymentMethod')
#People paying bills through electronic check have the highest ratio of churning

In [None]:
# Phone service vs Other Features
# Each panel counts the values of one service column, with bars split by PhoneService.
# Columns are passed by name (x=/hue=/data=) because current seaborn releases only
# accept `data` positionally. A 2x2 grid is used since exactly four panels are drawn
# (the previous 3x2 layout left the bottom row empty).
fig,axes=plt.subplots(2,2,figsize=(10,10))
phone_features=['MultipleLines','OnlineBackup','OnlineSecurity','TechSupport']
for ax,col in zip(axes.ravel(),phone_features):
    sns.countplot(x=col,hue='PhoneService',data=data,ax=ax)
fig.tight_layout()  # keep axis labels of neighbouring panels from overlapping


In [None]:
# Phone service doesn't seem like an important feature. People who don't have internet service are the ones who use phone services.

In [None]:
#Internet Service vs other features.
# Each panel counts the values of one add-on service column, with bars split by
# InternetService. Columns are passed by name (x=/hue=/data=) because current
# seaborn releases only accept `data` positionally.
fig,axes=plt.subplots(3,2,figsize=(10,10))
internet_features=['OnlineSecurity','OnlineBackup','DeviceProtection',
                   'TechSupport','StreamingMovies','StreamingTV']
for ax,col in zip(axes.ravel(),internet_features):
    sns.countplot(x=col,hue='InternetService',data=data,ax=ax)
fig.tight_layout()  # keep axis labels of neighbouring panels from overlapping

#Customers with no internet service do not use any of these add-on services.

In [None]:
# Pairwise scatter plots of tenure, MonthlyCharges and TotalCharges, coloured by
# Churn and placed in the first three cells of a 2x2 grid.
fig=plt.figure(figsize=(10,10))
panels=[
    ('tenure','TotalCharges'),          #TotalCharges vs tenure
    ('tenure','MonthlyCharges'),        #tenure vs MonthlyCharges
    ('MonthlyCharges','TotalCharges'),  #MonthlyCharges vs TotalCharges
]
for position,(xcol,ycol) in enumerate(panels,start=1):
    ax=fig.add_subplot(2,2,position)
    sns.scatterplot(data=data,x=xcol,y=ycol,hue='Churn',marker='D',ax=ax)

In [None]:
#People having lower total charges and higher monthly charge have a higher ratio of churning than the rest of the population.
#People having higher tenure and higher monthly charges seem to churn faster than the rest.

In [None]:
#Continuous variable correlation
# The correlation is computed on the numeric columns only. pandas 2.0+ no longer
# drops object columns silently in corr() and raises instead, so the numeric
# subset is selected explicitly (this also works on older pandas).
plt.figure(figsize=(10,8))
sns.heatmap(data.select_dtypes(include='number').corr(),annot=True,cmap='Greys')

In [None]:
#There is a high correlation between TotalCharges and tenure.
# There is also a high correlation between MonthlyCharges and TotalCharges.

In [None]:
# Drop the identifier column. Reassigning the result (instead of inplace=True)
# together with errors="ignore" keeps this cell safe to re-run: a second run
# finds no customerID column and simply does nothing instead of raising KeyError.
data=data.drop(columns="customerID",errors="ignore")

In [None]:
# Work on an independent copy. A plain `df=data` only creates a second name for
# the same object, so every later in-place change to df (filling, encoding)
# would also overwrite the frame used by the exploratory cells.
df=data.copy()

In [None]:
# Replace missing TotalCharges values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated in pandas 2.x and has no effect on df once
# copy-on-write is enabled.
df['TotalCharges']=df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [None]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
o=OrdinalEncoder()  # encoder for the object-typed feature columns
l=LabelEncoder()    # encoder for the Churn target column

In [None]:
# Ordinal-encode every object-typed feature column, one column at a time
# (the encoder is refitted for each column, so it only remembers the last one).
object_features=[c for c in df.columns if df[c].dtypes=='O' and c!='Churn']
for c in object_features:
    column_2d=df[c].to_numpy().reshape(-1,1)  # the encoder expects a 2-D input
    df[c]=o.fit_transform(column_2d)

# The label column gets its own LabelEncoder.
df['Churn']=l.fit_transform(df['Churn'])

In [None]:
df.head(5)

In [None]:
# Features are every column except the target; df itself is left untouched.
y=df['Churn']
x=df.drop(columns='Churn')

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE draws random neighbours when it synthesises minority-class rows; fixing
# random_state makes the oversampled data (and every score computed from it)
# identical across kernel restarts.
over=SMOTE(random_state=42)

In [None]:
# Oversample so that x and y are rebound to the resampled data.
# NOTE(review): this runs on the full data set, before train_test_split is called
# further down, so synthetic rows are built from rows that later land in the test
# split. Consider resampling the training split only.
x,y=over.fit_resample(x,y)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with a min-max scaler fitted on x, then wrap the
# resulting array back into a DataFrame that keeps the original column names.
scaler=MinMaxScaler()
xd=scaler.fit(x).transform(x)
x=pd.DataFrame(data=xd,columns=x.columns)

In [None]:
x.head() #after Scaling the DataFrame 

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

In [None]:
def randomstate(x,y,seeds=range(1,201),test_size=0.25):
    """Return the split seed that gives the highest logistic-regression test accuracy.

    For every seed, x and y are split with ``train_test_split``, a
    ``LogisticRegression`` is fitted on the training part and scored on the
    test part. The first seed that reaches the best accuracy is returned.

    Parameters
    ----------
    x, y : features and target, passed straight to ``train_test_split``.
    seeds : iterable of int, default ``range(1, 201)``
        Candidate ``random_state`` values (the default matches the previous
        hard-coded range).
    test_size : float, default 0.25
        Test fraction used for every candidate split.

    Raises
    ------
    ValueError
        If ``seeds`` is empty.

    NOTE(review): choosing the split by its test accuracy makes the accuracy
    later reported on that same split optimistic.
    """
    model=LogisticRegression()
    best_seed=None
    # Start below any possible accuracy so a seed is always selected; the old
    # starting value of 0 left the result unbound if no split scored above 0.
    best_accuracy=-1.0
    for seed in seeds:
        xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=test_size,random_state=seed)
        model.fit(xtrain,ytrain)
        accuracy=accuracy_score(ytest,model.predict(xtest))  # (y_true, y_pred) order
        if accuracy>best_accuracy:
            best_accuracy=accuracy
            best_seed=seed
    if best_seed is None:
        raise ValueError("seeds must contain at least one value")
    return best_seed

In [None]:
# Final 75/25 split, using the seed picked by randomstate().
xtrain,xtest,ytrain,ytest=train_test_split(
    x,y,
    random_state=randomstate(x,y),
    test_size=0.25,
)

In [None]:
def performance(p,ytest,m,xtest,s):
    """Print a set of evaluation metrics for a fitted classifier.

    Parameters
    ----------
    p : predicted labels for ``xtest``.
    ytest : true labels for ``xtest``.
    m : fitted model; must provide ``predict_proba``.
    xtest : test features, used to obtain the probabilities for the AUC.
    s : array of cross-validation scores; only its mean is printed.

    scikit-learn metrics take ``(y_true, y_pred)`` in that order. The confusion
    matrix and classification report were previously called with the arguments
    swapped, which transposed the matrix and exchanged precision with recall.
    """
    print('Accuracy',np.round(accuracy_score(ytest,p),4))
    print('Mean of Cross Validation Score',np.round(s.mean(),4))
    # Column 1 of predict_proba is the probability of the positive class.
    print('AUC_ROC Score',np.round(roc_auc_score(ytest,m.predict_proba(xtest)[:,1]),4))
    print('Confusion Matrix')
    print(confusion_matrix(ytest,p))
    print('Classification Report')
    print(classification_report(ytest,p))


In [None]:
# Fit a random forest on the training split and report its metrics.
# random_state is fixed so the forest (and therefore every printed score) is
# the same on each run; an unseeded forest gives different numbers every time.
model= RandomForestClassifier(random_state=42)
model.fit(xtrain,ytrain)
p=model.predict(xtest)
# 10-fold cross-validation over the full x, y.
score=cross_val_score(model,x,y,cv=10)
performance(p,ytest,model,xtest,score)

In [None]:
# Wrap the predicted labels in a one-column DataFrame and preview the first 10.
pred=pd.DataFrame(p)
pred.head(10)