In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Details of the problem statement and data
**Content:**

Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

**The data set includes information about:**

    Customers who left within the last month – the column is called Churn
    Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
    Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
    Demographic info about customers – gender, age range, and if they have partners and dependents
    Inspiration
    To explore this type of models and learn more about the subject.

Let's divide the entire process of data analysis into steps

    Step 1: Reading and understanding data
    Step 2: Checking for null values and datatypes
    Step 3: Dealing with outliers
    Step 4: Univariate Analysis
    Step 5: Bivariate Analysis
            a) Categorical Unordered 
            b) Categorical ordered
    Step 6: Multivariate analysis

## Step 1: Reading and understanding data

In [None]:
# Silence every Python warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# pandas/seaborn calls further down are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Libraries used throughout the notebook: pandas/numpy for data handling,
# matplotlib/seaborn for plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme; color_codes=True remaps matplotlib's shorthand
# colour codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes = True)

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory into a DataFrame.
TELCO_CSV_PATH = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
telco = pd.read_csv(TELCO_CSV_PATH)


In [None]:
# Preview the first rows to get a feel for the columns and the values they hold
telco.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
telco.shape

In [None]:
# Column names, non-null counts and dtypes in a single summary
telco.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
telco.describe()

In [None]:
# Data type of every column, to spot columns that were read in with an unexpected type
telco.dtypes

Here we need to convert the data type of TotalCharges to float. However, converting the entire column at once raises an error, so we check the value counts for more information.

In [None]:
# Frequency of each distinct TotalCharges value, to see what the column holds besides numbers
telco.TotalCharges.value_counts()

There are 11 blank strings in the TotalCharges column

In [None]:
# Show SeniorCitizen as 'Yes'/'No' instead of 1/0 so plot legends and tick labels read naturally.
# .replace() is used rather than .map(): values other than 1 and 0 are left untouched,
# so re-running this cell keeps the 'Yes'/'No' labels instead of turning them into NaN.
telco['SeniorCitizen'] = telco['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

In [None]:
# Parse TotalCharges into numbers.
# errors='coerce' turns anything that cannot be parsed (such as blank strings) into NaN
# instead of raising.
total_charges_text = telco['TotalCharges']
telco['TotalCharges'] = pd.to_numeric(total_charges_text, errors='coerce')

In [None]:
# Re-check the dtypes after the conversions above
telco.dtypes

## Step 2: Checking null values

In [None]:
# Number of missing values per column
telco.isnull().sum()

In [None]:
# Impute the missing TotalCharges values with the column median.
total_charges_median = telco['TotalCharges'].median()
telco['TotalCharges'] = telco['TotalCharges'].fillna(total_charges_median)

In [None]:
# Missing values per column again, to confirm the imputation left none behind
telco.isnull().sum()

There are no null values in our dataset, proceeding further

Now, we are dividing the entire data into categorical and numerical columns

In [None]:
# Keep only the numeric columns in a separate frame for the numeric plots below.
num_cols = telco.select_dtypes(include='number').columns
telco_numeric = telco.loc[:, num_cols]
telco_numeric.head()

In [None]:
# Categorical variables: every column stored with the object dtype
telco_cat = telco.select_dtypes(include = 'object')
telco_cat.head()

### Binning tenure variable for better visualisation

In [None]:
telco['tenure_yrs'] = pd.cut(telco.tenure,bins = [0,15,30,45,60,75], 
                    labels = ['0-15 years', '15-30 years', '30-45 years', '45-60 years', '60 & above'])

## Step 3: Dealing with outliers

In [None]:
# Box plots to check tenure, TotalCharges and MonthlyCharges for outliers.
# Each series is passed explicitly as x= (horizontal box) because the meaning of a bare
# positional vector differs between seaborn versions; explicit axes replace the
# plt.subplot state machine.
fig, axes = plt.subplots(1, 3, figsize=(20, 3))
for ax, column in zip(axes, ['tenure', 'TotalCharges', 'MonthlyCharges']):
    sns.boxplot(x=telco_numeric[column], ax=ax)
plt.show()

As we can see, there are no outliers in the numerical data. Proceeding further to univariate analysis.

## Step 4: Univariate analysis

In [None]:
# Distribution plots for tenure, MonthlyCharges and TotalCharges, one panel each.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is known.
fig, axes = plt.subplots(1, 3, figsize=(20, 8))
for ax, column in zip(axes, ['tenure', 'MonthlyCharges', 'TotalCharges']):
    sns.distplot(telco_numeric[column], ax=ax)
plt.show()

In [None]:
# Pie chart of the Churn class balance.
# The labels come from the value_counts index so each slice is always paired with
# its own category, whatever order the counts come back in.
churn_counts = telco.Churn.value_counts()
plt.figure(figsize=[6, 6])
plt.pie(churn_counts, explode=[0.2, 0], labels=churn_counts.index)
plt.title("Pie chart for Churn")
plt.show()

In [None]:
# Pie chart of how many customers fall into each tenure bin.
# value_counts() orders by frequency, so the counts are re-sorted into bin order and the
# labels are read from the same index. Hard-coding the labels in bin order against
# frequency-ordered counts would attach the wrong label to the slices.
tenure_counts = telco.tenure_yrs.value_counts().sort_index()

plt.figure(figsize=[6, 6])
# The last entry of explode pulls out the final bin ('60 & above').
plt.pie(tenure_counts, explode=[0, 0, 0, 0, 0.2], labels=tenure_counts.index)
plt.title("Pie chart for Tenure year counts")
plt.show()

## Step 5 : Bivariate analysis

In [None]:
# Pair plot: pairwise scatter plots of the numeric columns, with each column's
# own distribution on the diagonal
sns.pairplot(data = telco_numeric)
plt.show()

In [None]:
# Joint plot for tenure vs TotalCharges (scatter plus marginal distributions).
# x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.jointplot(x='tenure', y='TotalCharges', data=telco)
plt.show()

###  a) Categorical Unordered analysis

In [None]:
# Bar plots of TotalCharges for each category of nine unordered categorical columns,
# laid out on a 3x3 grid in the order listed below.
category_columns = [
    'gender', 'PaymentMethod', 'OnlineBackup',
    'PhoneService', 'InternetService', 'OnlineSecurity',
    'DeviceProtection', 'StreamingTV', 'MultipleLines',
]
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
for ax, column in zip(axes.flat, category_columns):
    if column == 'PaymentMethod':
        # This panel is coloured by category and identified through the legend,
        # so its x tick labels are removed.
        sns.barplot(x=telco_cat[column], y=telco_numeric.TotalCharges,
                    hue=telco_cat[column], ax=ax)
        ax.set_xticks([])
    else:
        sns.barplot(x=telco_cat[column], y=telco_numeric.TotalCharges, ax=ax)
plt.show()

As we can see, people who opted for all the services have higher charges

In [None]:
# Count plots for nine unordered categorical columns on a 3x3 grid.
# Each column is passed as x= with data=telco: newer seaborn releases do not accept a
# bare positional data vector the way older ones did.
count_columns = [
    'Churn', 'TechSupport', 'OnlineBackup',
    'PhoneService', 'InternetService', 'OnlineSecurity',
    'DeviceProtection', 'StreamingTV', 'MultipleLines',
]
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
for ax, column in zip(axes.flat, count_columns):
    sns.countplot(x=column, data=telco, ax=ax)
plt.show()

Most of the customers opted for phone Services and Fiber optic Services

In [None]:
# Categorical vs categorical: TechSupport, gender and Dependents, each split by SeniorCitizen.
# Columns are passed by keyword (x=, hue=) so the call works across seaborn versions.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, column in zip(axes, ['TechSupport', 'gender', 'Dependents']):
    sns.countplot(x=column, hue='SeniorCitizen', data=telco, ax=ax)
plt.show()

SeniorCitizens haven't opted for techsupport mostly and do not have any dependents

### b) Categorical ordered Analysis

In [None]:
# Number of customers in each tenure bin, split by Churn.
# x is passed by keyword so the call works across seaborn versions.
plt.figure(figsize=[12, 8])
sns.countplot(x='tenure_yrs', hue='Churn', data=telco)
plt.show()

In [None]:
# Number of customers per contract type, split by Churn.
# x is passed by keyword so the call works across seaborn versions.
plt.figure(figsize=[12, 8])
sns.countplot(x='Contract', hue='Churn', data=telco)
plt.show()

From the above graph, we can clearly see that month-to-month contract people are more likely to churn rather than one year and two year contract type

In [None]:
# Categorical vs numerical: tenure, TotalCharges and MonthlyCharges for churned
# vs retained customers.
# x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, column in zip(axes, ['tenure', 'TotalCharges', 'MonthlyCharges']):
    sns.barplot(x='Churn', y=column, data=telco, ax=ax)
plt.show()

Churn is high due to monthly charges

In [None]:
# Churn counts split by SeniorCitizen.
# x is passed by keyword so the call works across seaborn versions.
sns.countplot(x='Churn', hue='SeniorCitizen', data=telco)
plt.show()

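The plot shows fewer senior citizens than non-senior customers among those who churned. Note that a count plot compares group sizes rather than churn rates, so on its own it does not show that SeniorCitizens are less likely to churn; for that we would need to compare the proportion of churners within each group.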

In [None]:
# Bar plots of MonthlyCharges for each category of nine categorical columns,
# split by Churn, laid out on a 3x3 grid in the order listed below.
churn_split_columns = [
    'gender', 'Dependents', 'InternetService',
    'PhoneService', 'OnlineBackup', 'OnlineSecurity',
    'DeviceProtection', 'StreamingTV', 'MultipleLines',
]
fig, axes = plt.subplots(3, 3, figsize=(20, 15))
for ax, column in zip(axes.flat, churn_split_columns):
    sns.barplot(x=telco_cat[column], y=telco_numeric.MonthlyCharges,
                hue=telco.Churn, ax=ax)
plt.show()

We can see that people who opted for phoneservice, onlinebackup, onlinesecurity, Device protection, multiplelines, streamingtv are more likely to churn if the monthly charge is high

## Step 6: Multivariate analysis

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns are dropped before calling corr(): recent pandas versions raise
# on object/categorical columns instead of silently skipping them.
sns.heatmap(telco.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Scatter of MonthlyCharges against tenure, coloured by tenure bin (fit_reg=False
# turns off the regression line).
# Rows without a tenure bin are dropped first; this selects the same rows as filtering
# on membership in the full list of bin labels.
sns.lmplot(x='tenure', y='MonthlyCharges', hue='tenure_yrs',
           data=telco.dropna(subset=['tenure_yrs']),
           fit_reg=False)
plt.show()


From the above plot, we can see that monthly charges increase with tenure, which suggests that longer-tenured customers are opting for more services

In [None]:
# Scatter of tenure (y) against MonthlyCharges (x), coloured by Churn
sns.relplot(x="MonthlyCharges", y="tenure",hue='Churn', data=telco)
plt.show()

If the monthly charges are above 90 and if the customer is <5 years tenure then he/she is more likely to churn

In [None]:
# Scatter of MonthlyCharges (y) against TotalCharges (x), coloured by Churn
sns.relplot(x="TotalCharges", y="MonthlyCharges",hue='Churn',kind='scatter', data=telco)
plt.show()

We can clearly see that customers are more likely to churn based on the MonthlyCharges and not on TotalCharges