In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **1. Explore the dataset**

1.1 Read in csv file

In [None]:
# Load the Telco customer churn CSV from the Kaggle input directory into a DataFrame.
telco_data=pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

1.2 Take a look at the data 

In [None]:
# Preview the first rows of the raw table.
telco_data.head()

In [None]:
# Print the column names, dtypes and non-null counts.
telco_data.info()

In [None]:
# Summary statistics for the columns pandas currently treats as numeric.
telco_data.describe()

In [None]:
# Count NaN values per column.
# Blank strings are not NaN, so whitespace-only entries (handled further down
# for TotalCharges) are not counted here.
telco_data.isnull().sum()

No missing values are reported, so the data looks pretty clean.

In [None]:
# Show the dtype pandas inferred for each column.
telco_data.dtypes

The type of "TotalCharges" should be float instead of object. 

In [None]:
# Inspect the rows whose TotalCharges is a single blank space instead of a number.
blank_total_charges = telco_data['TotalCharges'] == ' '
telco_data[blank_total_charges].info()

In [None]:
# Clean the table in one chained pipeline:
#   1. turn empty / whitespace-only strings into NaN,
#   2. drop every row that contains at least one NaN,
#   3. convert TotalCharges from text to a numeric dtype.
telco_data = (
    telco_data
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    .dropna(axis=0, how='any')
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges']))
)

In [None]:
# Summary statistics for every column (numeric and non-numeric) after cleaning.
telco_data.describe(include = 'all')

 # 2. Exploratory Data Analysis

In [None]:
# Horizontal count plot of the Churn column to show the overall class balance.
ax = sb.catplot(y="Churn", kind="count", data=telco_data, height=3, aspect=2.5)

Question: What are the possible reasons for customer churn? 
* Hypothesis 1(Aspect of services): The services can’t satisfy the customers’ needs.
* Hypothesis 2(Aspect of customers):The customers’ needs have changed. 

2.1 Analysis of Services

In [None]:
# Churn counts split by phone-service subscription.
# The category order is pinned explicitly so the custom tick labels below always
# line up with the right bars, instead of depending on which value happens to
# appear first in the data.
ax=sb.countplot(x='PhoneService',hue='Churn',data=telco_data,order=['No','Yes'])
ax.set_xticklabels(['Not receiving phone service','Receiving phone service'])

The first picture shows the attrition of those who received telephone service and those who did not. It can be seen that the churn rate is not much different.

In [None]:
# Churn counts per internet-service type; bar order is fixed so it matches the
# human-readable tick labels assigned afterwards.
internet_order = ['DSL', 'Fiber optic', 'No']
internet_labels = ['DSL', 'Fiber optic', 'Not receiving internet service']
ax = sb.countplot(data=telco_data, x='InternetService', hue='Churn', order=internet_order)
ax.set_xticklabels(internet_labels)

In the second picture, the churn rate of customers who receive internet service is significantly higher than that of customers who do not, especially for customers on fiber optic, whose churn rate is around 40%. This suggests that the fiber optic service has problems and urgently needs to be improved. Whether a customer uses internet service is closely related to whether they churn.
For network services, we can further explore whether having other additional services will affect the loss of users. The same is the way of counting graphs:

In [None]:
# 2x2 grid: one churn count plot per add-on service, all with the same bar order.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
addon_services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']
answer_order = ['Yes', 'No', 'No internet service']
# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for service, panel in zip(addon_services, axes.flat):
    sb.countplot(x=service, hue='Churn', data=telco_data, order=answer_order, ax=panel)

In [None]:
# Add-on service columns; this list is reused by the next cell.
cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport"]

# Keep only customers with internet service, then reshape to long format
# (one row per customer/service pair) so all services share one count plot.
has_internet = telco_data["InternetService"] != "No"
df = (
    telco_data.loc[has_internet, cols]
    .melt()
    .rename(columns={'value': 'Has service'})
)

plt.figure(figsize=(10, 4.5))
ax = sb.countplot(data=df, x='variable', hue='Has service')
ax.set(xlabel='Additional service', ylabel='Num of customers')
plt.show()

In [None]:
# Same long-format view as the previous cell, restricted to internet customers
# who churned. Relies on `cols` defined in the previous cell.
is_online = telco_data['InternetService'] != "No"
has_churned = telco_data['Churn'] == "Yes"
df = (
    telco_data.loc[is_online & has_churned, cols]
    .melt()
    .rename(columns={'value': 'Has service'})
)

plt.figure(figsize=(10, 4.5))
ax = sb.countplot(data=df, x='variable', hue='Has service', hue_order=['No', 'Yes'])
ax.set(xlabel='Additional service', ylabel='Num of churns')
plt.show()

We can see that internet customers who also pay for additional services such as online security, online backup, device protection, and technical support are noticeably less likely to churn. 
Product dimension summary: Whether a customer uses internet service, and, for those who do, whether they also use additional services such as online security, online backup, device protection, and technical support, is strongly associated with customer churn. 

2.2 Analysis of personal features 

Personal attributes include gender, whether the customer is a senior citizen, and whether the customer has a family. Based on my personal experience, senior citizens might be the main source of customer loss. I think the elderly churn because, as they age, their demand for calls or network services changes.

Hypothesis: Older people are more prone to loss than non-elderly people.

In [None]:
# Churn counts for non-senior (0) vs. senior (1) customers, with readable tick labels.
senior_labels = ['Not senior', 'Senior']
ax = sb.countplot(data=telco_data, x='SeniorCitizen', hue='Churn', order=[0, 1])
ax.set_xticklabels(senior_labels)

The above figure confirms our hypothesis that the attrition rate of the elderly group is significantly higher than that of the general group. Therefore, we can conclude that whether a customer is a senior citizen is related to the likelihood of churn.

2.3 Analysis of User Behavior

From the perspective of user behavior, the indicators that might be related to customer loss are: contract period, tenure (service life), payment method, monthly consumption, and total consumption. Among them, the contract period, service life, and total consumption best reflect a customer's loyalty to the telco service. The user's payment method and monthly consumption show the user's spending habits, which will also have some influence on whether the customer churns.

In [None]:
# Pairwise correlation between the three numeric usage columns.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
corr = telco_data.loc[:, numeric_cols].corr()
corr

In [None]:
# Churn counts per contract type, ordered from shortest to longest commitment.
contract_order = ['Month-to-month', 'One year', 'Two year']
sb.countplot(data=telco_data, x='Contract', hue='Churn', order=contract_order)

 The longer the signed contract period, the lower the loss rate.

In [None]:
# Distribution of tenure for retained vs. churned customers (violin + box plot).
# The x-axis is the Churn column, so the tick labels describe churn status
# (the previous 'Not tenure'/'Tenure' labels named the y-variable instead).
# The category order is pinned so the labels always match the plotted groups.
churn_order = ['No', 'Yes']
churn_labels = ['Not churned', 'Churned']
fig,axes = plt.subplots(1,2,figsize=(10,6))
ax1=sb.violinplot(x='Churn', y='tenure', data=telco_data,order=churn_order,ax=axes[0])
ax1.set_xticklabels(churn_labels)
ax2=sb.boxplot(x='Churn', y='tenure', data=telco_data,order=churn_order,ax=axes[1])
ax2.set_xticklabels(churn_labels)

If a customer's service life is less than 15 months, the company is more likely to lose them. If the service life is more than 30 months, the company is more likely to keep them. In general, the longer a customer's service life, the less likely the company is to lose that customer. 

In [None]:
# Distribution of monthly charges for retained vs. churned customers (violin + box plot).
# The x-axis is the Churn column, so the tick labels describe churn status
# (the previous 'Not Monthlycharging'/'Monthlycharging' labels were misleading).
# The category order is pinned so the labels always match the plotted groups.
churn_order = ['No', 'Yes']
churn_labels = ['Not churned', 'Churned']
fig,axes = plt.subplots(1,2,figsize=(10,6))
ax1=sb.violinplot(x='Churn', y='MonthlyCharges', data=telco_data,order=churn_order,ax=axes[0])
ax1.set_xticklabels(churn_labels)
ax2=sb.boxplot(x='Churn', y='MonthlyCharges', data=telco_data,order=churn_order,ax=axes[1])
ax2.set_xticklabels(churn_labels)

From the aspect of monthly consumption, the overall level of monthly consumption of churned users is higher than that of non-churned users.

In [None]:
# Churn counts per payment method, with a fixed left-to-right bar order.
payment_order = [
    'Electronic check',
    'Mailed check',
    'Bank transfer (automatic)',
    'Credit card (automatic)',
]
plt.figure(figsize=(10, 6))
sb.countplot(data=telco_data, x='PaymentMethod', hue='Churn', order=payment_order)

In [None]:
# Box plots of monthly charges per payment method, split by churn status.
sb.catplot(x="PaymentMethod", y="MonthlyCharges", hue="Churn", kind="box", data=telco_data,height=4, aspect=3)

From the aspect of payment methods, the churn rate of customers using electronic checks is significantly higher than that of customers using other payment methods. This is an unexpected conclusion.

# 3. Feature Engineering

In [None]:
# SeniorCitizen is taken as-is (it was plotted earlier with the values 0 and 1).
SeniorCitizen = list(telco_data['SeniorCitizen'])

# Encode the target: 1 when the customer churned ('Yes'), 0 for anything else.
Churn = [1 if answer == 'Yes' else 0 for answer in telco_data['Churn']]

In [None]:
# One-hot encode the contract type (one indicator column per distinct value).
Contract=telco_data['Contract']
Contract_dummies=pd.get_dummies(Contract)

# One-hot encode the internet service type.
InternetService=telco_data['InternetService']
InternetService_dummies=pd.get_dummies(InternetService)

# Split tenure into 6 quantile-based buckets, then one-hot encode the buckets.
# NOTE(review): the column labels given to these buckets later in the notebook are
# hard-coded, so they only stay accurate while the quantile edges do not change.
tenure=list(telco_data['tenure'])
tenure_cats=pd.qcut(tenure,6)
tenure_dummies=pd.get_dummies(tenure_cats)

# Split monthly charges into 5 quantile-based buckets, then one-hot encode them.
MonthlyCharges=list(telco_data['MonthlyCharges'])
MonthlyCharges_cats=pd.qcut(MonthlyCharges,5)
MonthlyCharges_dummies=pd.get_dummies(MonthlyCharges_cats)

In [None]:
def _binary_flags(column, positive='Yes'):
    """Return a list holding 1 where `telco_data[column]` equals `positive`, else 0."""
    return [1 if value == positive else 0 for value in telco_data[column]]


# Payment method is reduced to a single flag: paid by electronic check or not.
PaymentMethod = _binary_flags('PaymentMethod', positive='Electronic check')

# Add-on services: 1 only for 'Yes'; every other answer maps to 0.
OnlineSecurity = _binary_flags('OnlineSecurity')
OnlineBackup = _binary_flags('OnlineBackup')
DeviceProtection = _binary_flags('DeviceProtection')
TechSupport = _binary_flags('TechSupport')

In [None]:
# Turn each flat list into an (n_rows, 1) column array so the features can be
# stacked side by side with the one-hot blocks.
column_shape = (-1, 1)
Churn_y = np.reshape(Churn, column_shape)
SeniorCitizen_x = np.reshape(SeniorCitizen, column_shape)
PaymentMethod_x = np.reshape(PaymentMethod, column_shape)
OnlineSecurity_x = np.reshape(OnlineSecurity, column_shape)
OnlineBackup_x = np.reshape(OnlineBackup, column_shape)
DeviceProtection_x = np.reshape(DeviceProtection, column_shape)
TechSupport_x = np.reshape(TechSupport, column_shape)

In [None]:
# Extract the raw NumPy arrays behind each one-hot encoded frame.
Contract_x = Contract_dummies.to_numpy()
InternetService_x = InternetService_dummies.to_numpy()
tenure_x = tenure_dummies.to_numpy()
MonthlyCharges_x = MonthlyCharges_dummies.to_numpy()

In [None]:
# Stack all feature blocks side by side into one design matrix.
# The block order here must match the column-name list defined in the next cell.
feature_blocks = [
    SeniorCitizen_x,
    Contract_x,
    InternetService_x,
    PaymentMethod_x,
    OnlineSecurity_x,
    OnlineBackup_x,
    DeviceProtection_x,
    TechSupport_x,
    tenure_x,
    MonthlyCharges_x,
]
X = np.hstack(feature_blocks)

In [None]:
# Human-readable names for the columns of X, in the same order in which the
# feature blocks were concatenated. Fixes the misspelled "IternetService_No"
# label, which otherwise appears in the feature-importance plot and tree export.
col1 = np.array([
    "SeniorCitizen",
    "Contract_Month_to_Month", "Contract_One_Year", "Contract_Two_Year",
    "InternetService_DSL", "InternetService_Fiber_Optic", "InternetService_No",
    "PaymentMethod",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "tenure_0,4", "tenure_4,14", "tenure_14,29", "tenure_29,47", "tenure_47,64", "tenure_64,72",
    "MonthlyCharges_1", "MonthlyCharges_2", "MonthlyCharges_3", "MonthlyCharges_4", "MonthlyCharges_5",
])

In [None]:
# Wrap the design matrix in a DataFrame so every feature column carries its name.
data_frame = pd.DataFrame(X, columns=col1)

In [None]:
# Display the engineered feature table.
data_frame

# 4. Decision Tree Model and Results

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): data_frame has no index/target column. Column 0 is SeniorCitizen,
# so slicing from column 1 leaves that feature out of the model. The plotting and
# export cells below use columns[1:] to match. Confirm whether this is intended.
X = data_frame.iloc[:, 1:].to_numpy()
y = Churn

# Stratified split (default test size) so both sets keep the same churn ratio.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Depth-limited decision tree with a fixed seed for reproducibility.
tree = DecisionTreeClassifier(max_depth=6, random_state=0)
tree.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Score the fitted tree on the held-out test set.
y_predict = tree.predict(x_test)
acc = accuracy_score(y_test, y_predict)
# F1 and precision are averaged over both classes, weighted by class support.
f1 = f1_score(y_test, y_predict, average='weighted')
p = precision_score(y_test, y_predict, average='weighted')

for label, value in (
    ('accuracy score is ', acc),
    ('f1 score is ', f1),
    ('precision score is ', p),
):
    print(label + str(value))

In [None]:
import matplotlib.pyplot as plt
def plot_feature_importance(model):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    The number of bars is taken from ``model.feature_importances_`` itself and
    the bar labels are the trailing columns of the global ``data_frame``. This
    keeps the chart consistent with the model instead of relying on a
    hard-coded "all columns minus one" offset.

    Parameters
    ----------
    model : fitted estimator exposing a ``feature_importances_`` array.
    """
    importances = model.feature_importances_
    n_features = len(importances)
    # X is built from data_frame by dropping leading column(s), so the features
    # the model saw are the last `n_features` columns of the frame.
    feature_names = data_frame.columns[-n_features:]
    positions = range(n_features)
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Features importance')
    plt.ylabel('feature')


plt.figure(figsize = (10,10))
plot_feature_importance(tree)
plt.show()

The indicators that have a greater impact on the model's decisions include: whether the contract period is 'month-to-month', whether the customer uses the fiber optic network, whether the customer has used the services for less than 4 months, etc.

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Feature names are passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
tree_feature_names = list(data_frame.columns[1:])
export_graphviz(
    tree,
    out_file='te_tree.dot',
    class_names=['Churn_no','Churn_yes'],
    feature_names=tree_feature_names,
    impurity=False,
    filled=True,
)

# Read the dot file back with an explicit encoding so the result does not
# depend on the platform's default locale.
with open("te_tree.dot", encoding="utf-8") as f:
    dot_graph=f.read()
graph=graphviz.Source(dot_graph)
# Renders the graph to disk under the base name "tree".
graph.render("tree")

Based on the results of telco customers churn, I can give the following suggestions:

1. The company needs to increase its efforts to promote one-year or two-year contracts, for example by offering regular discounts to customers with long-term contracts.

2. The company needs to pay attention to the current problems of the fiber optic network service and make improvements in time.

3. For customers with a service life of less than 15 months, the company needs to keep investing in its retention strategy.

4. The company should pay attention to the needs of elderly customers and adjust its operations to those needs in a timely manner.