In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Telco Churn Prediction

Telecom customer churn prediction based on XGBoost

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [None]:
# Read the Telco customer churn CSV from the Kaggle input directory.
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
telco_cust = pd.read_csv(csv_path)

## Exploring the dataset

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
telco_cust.info()

In [None]:
# Preview the first rows of the loaded data.
telco_cust.head()

In [None]:
# Count the number of distinct values in each column.
telco_cust.nunique()

## Handle missing values

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce'), so they show up in the per-column null counts below.
telco_cust['TotalCharges'] = pd.to_numeric(telco_cust['TotalCharges'], errors='coerce')
telco_cust.isna().sum()

`TotalCharges` has 11 missing values: these are the entries that could not be parsed as numbers and were coerced to `NaN` in the conversion above.

In [None]:
# Discard every row that contains at least one missing value.
telco_cust = telco_cust.dropna()

In [None]:
# Confirm that no column has missing values left.
telco_cust.isna().sum()

In [None]:
# customerID is not used as a feature, so leave it out of the working frame.
dataset = telco_cust.drop(columns=['customerID'])

In [None]:
# Preview the working frame after dropping customerID.
dataset.head()

## Handle Categorical Variables

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
#
# The result is assigned back to the column instead of calling
# `dataset['Churn'].replace(..., inplace=True)`. That form is chained
# assignment: it warns on recent pandas and, with Copy-on-Write enabled,
# only modifies a temporary object and leaves `dataset` unchanged. Churn would
# then be one-hot encoded below and `dummies['Churn']` would no longer exist.
#
# The dtype guard makes the cell safe to re-run after Churn is already numeric.
if not pd.api.types.is_numeric_dtype(dataset['Churn']):
    # astype(int) fails loudly if a label other than 'Yes'/'No' is present.
    dataset['Churn'] = dataset['Churn'].map({'Yes': 1, 'No': 0}).astype(int)

# One-hot encode the remaining non-numeric columns; numeric columns
# (including the encoded Churn) are passed through unchanged.
dummies = pd.get_dummies(dataset)
dummies.head()

## Visualising the data

- Explore the distribution of each variable to draw conclusions that inform the model.

In [None]:
# Pairwise correlation between the numeric columns.
# `dataset` still contains non-numeric columns here, and DataFrame.corr()
# raises on those in pandas >= 2.0 (numeric_only defaults to False), so
# restrict the frame to numeric dtypes before computing the correlation.
plt.figure(figsize=(9,5))
sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Row counts per `gender` category
sns.countplot(data=dataset, x='gender')

In [None]:
# Row counts per `SeniorCitizen` value
sns.countplot(data=dataset, x='SeniorCitizen')

In [None]:
# Row counts per `Partner` category
sns.countplot(data=dataset, x='Partner')

In [None]:
# Row counts per `Dependents` category
sns.countplot(data=dataset, x='Dependents')

In [None]:
# Histogram of the `tenure` column
sns.histplot(x="tenure", data=dataset)

In [None]:
# Row counts per `PhoneService` category
sns.countplot(data=dataset, x='PhoneService')

In [None]:
# Row counts per `MultipleLines` category
sns.countplot(data=dataset, x='MultipleLines')

In [None]:
# Row counts per `InternetService` category
sns.countplot(data=dataset, x='InternetService')

In [None]:
# Row counts per `OnlineSecurity` category
sns.countplot(data=dataset, x='OnlineSecurity')

In [None]:
# Row counts per `OnlineBackup` category
sns.countplot(data=dataset, x='OnlineBackup')

In [None]:
# Row counts per `DeviceProtection` category
sns.countplot(data=dataset, x='DeviceProtection')

In [None]:
# Row counts per `TechSupport` category
sns.countplot(data=dataset, x='TechSupport')

In [None]:
# Row counts per `StreamingTV` category
sns.countplot(data=dataset, x='StreamingTV')

In [None]:
# Row counts per `StreamingMovies` category
sns.countplot(data=dataset, x='StreamingMovies')

In [None]:
# Row counts per `Contract` category
sns.countplot(data=dataset, x='Contract')

In [None]:
# Row counts per `PaperlessBilling` category
sns.countplot(data=dataset, x='PaperlessBilling')

In [None]:
# Row counts per `PaymentMethod` category, drawn on a wider 10x5 figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=dataset, x='PaymentMethod', ax=ax)

In [None]:
# Histogram of the `MonthlyCharges` column
sns.histplot(x="MonthlyCharges", data=dataset)

In [None]:
# Histogram of the `TotalCharges` column
sns.histplot(x="TotalCharges", data=dataset)

In [None]:
# Row counts per `Churn` class (the prediction target)
sns.countplot(data=dataset, x='Churn')

## Splitting the dataset into the Training set and Test set

In [None]:
# Features: every encoded column except the target.
X = dummies.drop('Churn', axis=1)
# Target: the Churn column as a plain NumPy array.
y = dummies['Churn'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for testing; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.5, random_state=42)
X_train, X_test, y_train, y_test = split

## Training XGBoost on the Training set

In [None]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with default hyperparameters on the training split.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out split, print the confusion matrix, and display
# the overall accuracy as the cell output.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split; report mean and spread
# of the per-fold scores as percentages.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print(f"Accuracy: {mean_pct:.2f} %")
print(f"Standard Deviation: {std_pct:.2f} %")