In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Path of the Telco customer-churn CSV, relative to the notebook's working directory.
DATA_PATH = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the raw table and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the freshly loaded frame: column names, dtypes and non-null counts.
df.info()  # basic information

In [None]:
# List every column that is not stored as int64/float64 together with its
# distinct values, to see which columns are categorical.
numeric_dtypes = ['int64', 'float64']

for column_name, column_dtype in df.dtypes.items():
    if column_dtype in numeric_dtypes:
        continue  # already numeric, nothing to inspect
    distinct_values = df[column_name].unique()
    print(f"{column_name}:{distinct_values}")

In [None]:
# TotalCharges is read as text although it holds numbers: convert it, turning
# anything unparsable into NaN (errors='coerce'), then drop every row that
# contains a null and remove customerID, which is not used for modelling.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = (
    df.assign(TotalCharges=total_charges_numeric)
      .dropna()
      .drop(columns=['customerID'])
)

df.info()

In [None]:
# Replace every object-typed column with integer codes so that all columns
# are numeric. A fresh LabelEncoder is fitted per column.
from sklearn.preprocessing import LabelEncoder

object_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in object_columns:
    df[col] = LabelEncoder().fit_transform(df[col].values)

In [None]:
# Preview the first rows after the encoding cell has replaced the object columns.
df.head()  # check that the encoding was applied

In [None]:
# Re-check column dtypes and non-null counts now that the frame has been encoded.
df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Annotated heatmap of the pairwise column correlations, to see which
# features are correlated with each other.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Draw one histogram per column of df.
df.hist(figsize=(20,16))
plt.show()  # render the per-column charts
# The bar heights also show how many rows take each value in every column.

The data is highly imbalanced: the Churn column has fewer than 2,000 rows with value 1 and more than 5,000 rows with value 0. PhoneService and OnlineBackup show a similar imbalance.

In [None]:
# Rebalance the target column: keep every Churn == 0 row and add 5000 rows
# drawn with replacement from the Churn == 1 rows.
# NOTE: this cell rebinds `df`, so running it more than once resamples the
# already resampled frame. Run it once per fresh kernel.
from sklearn.utils import resample, shuffle

zero = df[df['Churn'] == 0]   # rows whose Churn value is 0
one = df[df['Churn'] == 1]    # rows whose Churn value is 1

# A fixed seed keeps the draw (and therefore everything computed afterwards)
# repeatable, matching the random_state=42 used for the split and the model.
df_minority_upsampled = resample(one, replace=True, n_samples=5000, random_state=42)

# concatenate
df = pd.concat([zero, df_minority_upsampled])

# shuffle so that the rows are in no particular sequence
df = shuffle(df, random_state=42)

The Churn column is now handled; let's do the same for PhoneService.

In [None]:
# Rebalance the PhoneService feature: keep every PhoneService == 1 row and add
# 6000 rows drawn with replacement from the PhoneService == 0 rows.
# NOTE(review): PhoneService is an input feature, not the target; resampling
# on it also changes the Churn balance set up earlier. Confirm this is intended.
# NOTE: this cell rebinds `df`, so running it more than once resamples again.
from sklearn.utils import resample, shuffle

zero = df[df['PhoneService'] == 0]   # rows whose PhoneService value is 0
one = df[df['PhoneService'] == 1]    # rows whose PhoneService value is 1

# A fixed seed keeps the draw repeatable, matching the random_state=42 used
# for the train/test split and the model.
df_minority_upsampled = resample(zero, replace=True, n_samples=6000, random_state=42)

# concatenate
df = pd.concat([one, df_minority_upsampled])

# shuffle so that the rows are in no particular sequence
df = shuffle(df, random_state=42)

In [None]:
# Re-draw the per-column histograms to inspect the distributions after the
# resampling cells have rebuilt df.
df.hist(figsize=(20,16))
plt.show()

In [None]:
# Rank the columns by the absolute value of their correlation with the
# target column, strongest first.
churn_correlations = df.corr()['Churn']
churn_correlations.abs().sort_values(ascending=False)

In [None]:
# Feature matrix (only the columns judged relevant) and target vector.
feature_columns = [
    'Contract', 'tenure', 'TechSupport', 'OnlineSecurity', 'TotalCharges', 'PaperlessBilling',
    'DeviceProtection', 'Dependents', 'OnlineBackup', 'SeniorCitizen', 'MonthlyCharges',
    'PaymentMethod', 'Partner', 'PhoneService',
]
X = df[feature_columns]
y = df['Churn']

from sklearn.model_selection import GroupShuffleSplit, train_test_split

# df was upsampled with replacement before this point, so one original row
# can occur many times. A plain random split would put copies of the same row
# in both train and test, and the test score would then reward memorisation.
# The resampling and concatenation steps keep each row's original index label,
# so grouping on the index sends all copies of a row to the same side.
# Note that test_size is the share of groups (original rows), so the share of
# resampled rows in the test set is only approximately 20%.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(group_splitter.split(X, y, groups=X.index.to_numpy()))

X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

# Base estimator for the search. random_state is fixed so that repeated runs
# of the search fit identical models.
xgb = XGBClassifier(eval_metric='auc', use_label_encoder=False,
                    objective='binary:logistic', random_state=42)

# XGBoost hyper-parameter grid: 3 x 3 x 3 = 27 combinations.
params_xgb = {'n_estimators': [100, 400, 800], 'learning_rate': [0.3, 0.5, 1],
              'max_depth': [6, 8, 15]}

# eval_metric on the estimator does not decide which combination wins; without
# `scoring`, GridSearchCV ranks candidates by the estimator's default score
# (accuracy). scoring='roc_auc' makes the search select on AUC, which is the
# metric this notebook intends to optimise.
gs_xgb = GridSearchCV(xgb, param_grid=params_xgb, scoring='roc_auc', cv=5)
gs_xgb.fit(X_train, y_train)
print("Best parameters for XGBoost:", gs_xgb.best_params_)


In [None]:
# Refit a model with the best parameters found by the grid search above.
# They are read from gs_xgb.best_params_ instead of being typed in by hand, so
# the model cannot drift from the search result. The previous hard-coded
# max_depth of 25 was not one of the searched values [6, 8, 15].
xgb = XGBClassifier(**gs_xgb.best_params_,
                    eval_metric='auc',
                    use_label_encoder=False,
                    objective='binary:logistic', random_state=42)
xgb.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict the held-out rows and print the per-class precision/recall/F1
# summary for those predictions.
y_pred = xgb.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

# XGBoost: about 94% accuracy on the test split

## Please upvote or fork this notebook if you like it; it keeps us motivated to do more :)