In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 背景：关于用户留存有这样一个观点：如果将用户流失率降低5%，公司利润将提升25%-85%。如今居高不下的获客成本让电信运营商遭遇“天花板”，甚至陷入获客难的窘境。随着市场饱和度上升，电信运营商亟待解决增加用户黏性、延长用户生命周期的问题。因此，电信用户流失分析与预测至关重要。

## 提出问题：
## 1. 用户特征与流失之间的关系。
## 2. 从整体来看，流失的用户普遍具有哪些特征？
## * 3. 尝试找到合适的模型预测流失用户。
## 4. 针对性给出增加用户黏性、预防流失的建议。


## 数据集总计7043位客户的数据（行），每位客户的数据由21个特征描述（列）。其中，最后一列表征该客户是否流失（Churn），Yes表征流失。其余特征由客户个人信息、账户信息与注册的服务组成。

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load Data

## 导入数据集

In [None]:
# Load the Telco customer churn CSV from the Kaggle input directory; header=0 takes the column names from the first row.
CustomerDF = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv', header=0)

# 2. Overview

In [None]:
CustomerDF.head(5)

In [None]:
CustomerDF.shape

In [None]:
CustomerDF.info()

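### info() reports no NaN values, but TotalCharges still contains blank strings that must be cleaned before it can be converted to a number (see section 3).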

In [None]:
CustomerDF.describe(include='all').T

# 3. Data Manipulation

TotalCharges 列含有空白值，无法直接转换

In [None]:
# CustomerDF['TotalCharges'] = CustomerDF['TotalCharges'].astype('float')

In [None]:
# Turn the single-space placeholders in TotalCharges into NaN so they are treated as missing values.
total_charges_raw = CustomerDF['TotalCharges']
CustomerDF['TotalCharges'] = total_charges_raw.replace(' ', np.nan)

In [None]:
# Count the rows whose TotalCharges is missing, then show those rows.
# A bare expression is only rendered when it is the last line of a cell,
# so the count is printed explicitly instead of being silently discarded.
missing_total_mask = CustomerDF['TotalCharges'].isna()
print('Rows with missing TotalCharges:', missing_total_mask.sum())
CustomerDF[missing_total_mask]

### 经过观察，发现这11个用户‘tenure’（入网时长）为0个月，推测是当月新入网用户。根据一般经验，用户即使在注册的当月流失，也需缴纳当月费用。因此将这11个用户入网时长改为1，将总消费额填充为月消费额，符合实际情况。

In [None]:
# For every customer without a TotalCharges value, set tenure to 1, then show the affected rows.
no_total_yet = CustomerDF['TotalCharges'].isna()
CustomerDF.loc[no_total_yet, 'tenure'] = 1
CustomerDF[no_total_yet]

In [None]:
# Fill each missing TotalCharges with the MonthlyCharges of the same row (fillna aligns on the index).
# The result is assigned back to the column: calling an in-place method on a column selection
# is chained assignment, which pandas warns about and which leaves the frame unchanged
# when copy-on-write is active.
CustomerDF['TotalCharges'] = CustomerDF['TotalCharges'].fillna(CustomerDF['MonthlyCharges'])

In [None]:
CustomerDF[CustomerDF['TotalCharges'].isna()]

### 再将TotalCharges转换为float

In [None]:
# Cast TotalCharges to float and print the frame summary to confirm the new dtype.
CustomerDF = CustomerDF.astype({'TotalCharges': 'float'})
CustomerDF.info()

### 根据一般经验，将用户特征划分为用户属性、服务属性、合同属性，并从这三个维度进行可视化分析。

In [None]:
CustomerDF.describe().T

# 4. Exploratory Data Analysis

In [None]:
# Pie chart of the Churn value counts, with the first slice pulled out of the pie.
churn_counts = CustomerDF['Churn'].value_counts()
fig = plt.figure(figsize=(8, 8))
wedges, slice_labels, pct_labels = plt.pie(
    churn_counts,
    labels=churn_counts.index,
    autopct='%1.2f%%',
    explode=(0.1, 0),
)
# Enlarge both the category labels and the percentage annotations.
for text_obj in [*slice_labels, *pct_labels]:
    text_obj.set_size(18)
plt.title('Churn(Yes OR Not)', fontsize=20)
plt.show()

### 上月流失用户占比达26.54%。

In [None]:
# Churn counts as a one-column frame, largest count first.
# The explicit rename keeps the value column called 'Churn' on every pandas version
# (pandas >= 2.0 names the value_counts() result 'count'), and rename_axis(None) clears
# the index name so that 'Churn' unambiguously refers to the column when sorting.
ChurnDF = (
    CustomerDF['Churn']
    .value_counts()
    .rename_axis(None)
    .rename('Churn')
    .to_frame()
    .sort_values(by='Churn', ascending=False)
)


In [None]:
ChurnDF.values

In [None]:
# Bar chart of the churn counts held in ChurnDF.
fig = plt.figure(figsize=(8, 8))
churn_bar_ax = sns.barplot(data=ChurnDF, x=ChurnDF.index, y=ChurnDF['Churn'])
churn_bar_ax.set_title('Churn(Yes OR Not)', fontsize=16)

## 4.1 用户分析

In [None]:
# Exploratory step: number of customers in each (SeniorCitizen, Churn) combination divided by
# the total number of rows, converted to a one-column frame.
g = (CustomerDF.groupby("SeniorCitizen")["Churn"].value_counts() / len(CustomerDF)).to_frame()
# g.rename(columns={"Churn": "percentage of customers"}, inplace=True)
# g.reset_index(inplace=True)
# Inspect the index produced by groupby + value_counts.
g.index

In [None]:
def barplot_percentages(feature, orient='v', axis_name="percentage of customers"):
    """Plot, per value of ``feature``, the share of all customers that churned or stayed.

    Each bar is the number of customers in one (feature value, Churn) combination
    divided by the total number of rows in ``CustomerDF``; the bars are therefore
    shares of the whole customer base, not within-group churn rates.

    Parameters
    ----------
    feature : str
        Column of ``CustomerDF`` to group by.
    orient : str, default 'v'
        'v' draws vertical bars (feature on the x axis); any other value draws
        horizontal bars (feature on the y axis) and is passed through to seaborn.
    axis_name : str, default "percentage of customers"
        Name of the value column, which seaborn also uses as the axis label.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Local import so the function is self-contained; matplotlib is already a dependency.
    from matplotlib.ticker import PercentFormatter

    # reset_index(name=...) names the value column explicitly, so the result does not
    # depend on what the installed pandas version calls the value_counts() output
    # ('Churn' before pandas 2.0, 'count' afterwards).
    shares = (
        CustomerDF.groupby(feature)["Churn"].value_counts() / len(CustomerDF)
    ).reset_index(name=axis_name)

    if orient == 'v':
        ax = sns.barplot(x=feature, y=axis_name, hue='Churn', data=shares, orient=orient)
        value_axis = ax.yaxis
        # NOTE: this changes the global default font size for all later figures too.
        plt.rcParams.update({'font.size': 13})
    else:
        ax = sns.barplot(x=axis_name, y=feature, hue='Churn', data=shares, orient=orient)
        value_axis = ax.xaxis

    # Render the value axis as whole percentages (0.05 -> "5%") with a formatter rather than
    # overwriting the tick labels, which would detach the labels from the tick positions.
    value_axis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
    plt.legend(fontsize=12)
    plt.title('Churn(Yes/No) Ratio as {0}'.format(feature))
    plt.show()

In [None]:
barplot_percentages("SeniorCitizen")
barplot_percentages("gender")

In [None]:
# Numeric churn flag (No -> 0, Yes -> 1); averaging it over a group gives that group's churn rate.
# map() with an explicit dictionary produces a numeric column directly instead of relying on
# replace() to downcast an object column, which newer pandas versions warn about.
CustomerDF['churn_rate'] = CustomerDF['Churn'].map({'No': 0, 'Yes': 1})
CustomerDF['churn_rate'].head(5)

In [None]:
# Rebuild the numeric churn flag (No -> 0, Yes -> 1) so this cell does not depend on the previous one.
# map() avoids the object-dtype downcasting that chained replace() calls rely on.
CustomerDF['churn_rate'] = CustomerDF['Churn'].map({'No': 0, 'Yes': 1})
# One panel per SeniorCitizen value, each showing churn_rate by gender as a bar plot.
g = sns.FacetGrid(CustomerDF, col="SeniorCitizen", height=4, aspect=.9)
ax = g.map(sns.barplot, "gender", "churn_rate", palette="Blues_d", order=['Female', 'Male'])
# NOTE: this changes the global default font size for all later figures too.
plt.rcParams.update({'font.size': 13})
plt.show()

### 小结:用户流失与性别基本无关； 年老用户流失百分比显著高于年轻用户。

In [None]:
# Share of all customers in each (Partner, Churn) combination, as a flat frame.
# reset_index(name=...) names the value column explicitly, so the result is the same whether
# pandas calls the value_counts() output 'Churn' (before 2.0) or 'count' (2.0 and later).
gp_partner = (
    CustomerDF.groupby('Partner')["Churn"].value_counts() / len(CustomerDF)
).reset_index(name="percentage of customers")
gp_partner.head(5)

In [None]:
def _share_by_churn(feature, value_name):
    """Return one row per (feature value, Churn) pair with its share of all customers.

    The value column is named explicitly through reset_index(name=...), so the result does
    not depend on how the installed pandas version names the value_counts() output.
    """
    shares = CustomerDF.groupby(feature)["Churn"].value_counts() / len(CustomerDF)
    return shares.reset_index(name=value_name)


fig, axis = plt.subplots(1, 2, figsize=(16, 8))
axis[0].set_title("Has Partner")
axis[1].set_title("Has Dependents")
axis_y = "percentage of customers"

# Left panel: Partner column.
gp_partner = _share_by_churn('Partner', axis_y)
ax1 = sns.barplot(x='Partner', y=axis_y, hue='Churn', data=gp_partner, ax=axis[0])
ax1.legend(fontsize=16)
# The x label stays in English because no Chinese font is available in this environment.

# Right panel: Dependents column.
gp_dep = _share_by_churn('Dependents', axis_y)
ax2 = sns.barplot(x='Dependents', y=axis_y, hue='Churn', data=gp_dep, ax=axis[1])
ax2.legend(fontsize=16)

# Set the font size. NOTE: this changes the global default for all later figures too.
plt.rcParams.update({'font.size': 20})

plt.show()

In [None]:
# Kernel density estimation
def kdeplot(feature, xlabel):
    """Draw overlaid kernel density estimates of ``feature`` for the two Churn groups.

    One filled curve is drawn for customers with Churn == 'No' (navy) and one for
    customers with Churn == 'Yes' (orange); missing values of ``feature`` are dropped.

    Parameters
    ----------
    feature : str
        Numeric column of ``CustomerDF`` whose distribution is plotted.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {0}".format(feature))
    for churn_value, colour in (('No', 'navy'), ('Yes', 'orange')):
        values = CustomerDF.loc[CustomerDF['Churn'] == churn_value, feature].dropna()
        # fill=True replaces the deprecated `shade` argument, which was previously given
        # the string 'True' rather than a boolean.
        sns.kdeplot(values, color=colour, label='Churn: {0}'.format(churn_value), fill=True)
    plt.xlabel(xlabel)
    # Set the font size. NOTE: this changes the global default for all later figures too.
    plt.rcParams.update({'font.size': 20})
    plt.legend(fontsize=16)
    plt.show()

kdeplot('tenure','tenure')

### 小结:
#### 1. 有伴侣的用户流失占比低于无伴侣用户；
#### 2. 有家属的用户流失占比低于无家属用户;
#### 3. 在网时长越久，流失率越低，符合一般经验；
#### 4. 在网时间达到三个月，流失率小于在网率，证明用户心理稳定期一般是三个月。

## 4.2 服务属性分析

In [None]:
plt.figure(figsize=(10, 6))
barplot_percentages("MultipleLines", orient='h')

In [None]:
plt.figure(figsize=(10, 6))
barplot_percentages("InternetService", orient="h")

In [None]:
cols = ["PhoneService","MultipleLines","OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

In [None]:
# Long-format view (one row per customer/service pair) of the service columns in `cols`,
# restricted to customers whose InternetService is not "No".
internet_customers = CustomerDF.loc[CustomerDF["InternetService"] != "No", cols]
df1 = internet_customers.melt()
df1

In [None]:
# Among churned customers that have an internet service, count the 'No' / 'Yes' answers
# for each service column listed in `cols`.
churned_with_internet = CustomerDF['Churn'].eq("Yes") & CustomerDF['InternetService'].ne("No")
df1 = (
    CustomerDF.loc[churned_with_internet, cols]
    .melt()
    .rename(columns={'value': 'Has Service'})
)

plt.figure(figsize=(30, 10))
ax = sns.countplot(data=df1, x='variable', hue='Has Service', hue_order=['No', 'Yes'])
ax.set(xlabel='Internet Additional service', ylabel='Churn Num')
plt.rcParams.update({'font.size':20})
# Legend entries follow hue_order: 'No' first, then 'Yes'.
plt.legend(labels=['No Service', 'Has Service'], fontsize=15)
plt.title('Num of Churn Customers as Internet Additional Service')
plt.show()

### 小结:
#### 1. 电话服务整体对用户流失影响较小。
#### 2. 单光纤用户的流失占比较高；
#### 3. 光纤用户绑定了安全、备份、保护、技术支持服务的流失率较低；
#### 4. 光纤用户附加流媒体电视、电影服务的流失率占比较高。

## 4.3 合同属性分析

In [None]:
plt.figure(figsize=(10, 6))
barplot_percentages("PaymentMethod", orient='h')

In [None]:
g = sns.FacetGrid(CustomerDF, col="PaperlessBilling", height=6, aspect=.9)
ax = g.map(sns.barplot, "Contract", "churn_rate", palette = "Blues_d", order= ['Month-to-month', 'One year', 'Two year'])
plt.rcParams.update({'font.size':12})
plt.show()

In [None]:
kdeplot('MonthlyCharges','MonthlyCharges')
kdeplot('TotalCharges','TotalCharges')
plt.show()

### 小结:
#### 1. 采用电子支票支付的用户流失率最高，推测该方式的使用体验较为一般；
#### 2. 签订合同方式对客户流失率影响为：按月签订 > 按一年签订 > 按两年签订，证明长期合同最能保留客户；
#### 3. 月消费额大约在70-110之间的用户流失率较高；
#### 4. 长期来看，用户总消费越高，流失率越低，符合一般经验。

# 5. Churn Prediction

#### 对数据集进一步清洗和提取特征，通过特征选取对数据进行降维，采用机器学习模型应用于测试数据集，然后对构建的分类模型准确性进行分析

## 5.1 数据清洗

In [None]:
# Keep the customer IDs aside and remove the column from the frame: customerID is not needed as a feature.
# pop() returns the column and drops it in one step; the guard makes the cell safe to re-run
# (an unconditional drop raises KeyError once the column is already gone).
if 'customerID' in CustomerDF.columns:
    CustomerID = CustomerDF.pop('customerID')

#### 观察数据类型，发现除了“tenure”、“MonthlyCharges”、“TotalCharges”是连续特征外，其它大多是离散特征。对于连续特征，采用标准化方式处理（注意：下文流水线中的 StandardScaler 步骤目前被注释掉，并未实际执行标准化）。对于离散特征，若特征取值之间没有大小关系，采用one-hot编码；若取值之间有大小关联，则采用数值映射。

### a. 获取离散特征

In [None]:
# Categorical features: every object-dtype column, plus SeniorCitizen.
cateCols = []
for column_name, column_dtype in CustomerDF.dtypes.items():
    if column_dtype == 'object' or column_name == 'SeniorCitizen':
        cateCols.append(column_name)

# Work on a copy so the encoding below leaves CustomerDF untouched.
dfCate = CustomerDF.loc[:, cateCols].copy()
dfCate.head(3)

### b. 进行特征编码

In [None]:
for col in cateCols:
    if dfCate[col].nunique() == 2:
        # Two distinct values: encode as 0/1.
        # sort=True assigns the codes in sorted label order (e.g. 'No' -> 0, 'Yes' -> 1) instead
        # of order of first appearance, so the encoding no longer depends on which value
        # happens to occur in the first row.
        dfCate[col] = pd.factorize(dfCate[col], sort=True)[0]
    else:
        # More than two values: one-hot encode. dtype=int keeps the dummies numeric, so that
        # DataFrame.values later yields a numeric array rather than an object array.
        dfCate = pd.get_dummies(dfCate, columns=[col], dtype=int)

# Append the continuous columns unchanged (no encoding or scaling is applied here).
for numeric_col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    dfCate[numeric_col] = CustomerDF[numeric_col]

In [None]:
# Scratch experiment on a toy frame, unrelated to the churn data: checks the type of a
# double-bracket selection and what corr() returns.
# NOTE(review): only the last expression (data.corr()) is rendered; the type(...) result is discarded.
data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
type(data[['A']])
data.corr()

### c. 查看关联关系

In [None]:
# Correlation of every encoded column with Churn, strongest positive first, as a bar chart.
churn_corr = dfCate.corr()['Churn'].sort_values(ascending=False)
plt.figure(figsize=(16, 8))
churn_corr.plot.bar()
plt.show()

## 5.2 特征选取

In [None]:
# Feature selection: columns removed before modelling.
dropFea = ['gender','PhoneService',
           'OnlineSecurity_No internet service', 'OnlineBackup_No internet service',
           'DeviceProtection_No internet service', 'TechSupport_No internet service',
           'StreamingTV_No internet service', 'StreamingMovies_No internet service',
           # Further candidates, currently kept in the feature set:
           #'OnlineSecurity_No', 'OnlineBackup_No',
           #'DeviceProtection_No','TechSupport_No',
           #'StreamingTV_No', 'StreamingMovies_No',
           ]
# Assign the result instead of dropping in place, and ignore columns that are already gone,
# so the cell can be re-run without a KeyError.
dfCate = dfCate.drop(columns=dropFea, errors='ignore')
# Label vector: the encoded Churn column.
target = dfCate['Churn'].values
# All remaining column names: the features plus the 'Churn' label.
columns = dfCate.columns.tolist()

### a. 构造训练数据集和测试数据集

In [None]:
# Feature names: every column of dfCate except the label.
# The list is rebuilt from dfCate rather than removing 'Churn' from the existing list in place,
# so re-running the cell does not raise ValueError.
columns = [name for name in dfCate.columns if name != 'Churn']
# Feature matrix as a NumPy array, columns in the order given by `columns`.
features = dfCate[columns].values

In [None]:
columns

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set and train on the rest.
# random_state=1 makes the split identical on every run.
# stratify=target keeps the class proportions of the label the same in both splits.
split_options = dict(test_size=0.30, stratify=target, random_state=1)
train_x, test_x, train_y, test_y = train_test_split(features, target, **split_options)

## 5.3 构建模型

### 构造多个分类器

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
# One (pipeline step name, estimator, parameter grid) triple per candidate model.
# GridSearchCV addresses a pipeline step's parameter as "<step name>__<parameter name>",
# so every grid key is prefixed with the step name of its model.
_candidate_models = [
    ('svc',
     SVC(random_state = 1, kernel = 'rbf'),
     {'svc__C':[0.1], 'svc__gamma':[0.01]}),
    ('decisiontreeclassifier',
     DecisionTreeClassifier(random_state = 1, criterion = 'gini'),
     {'decisiontreeclassifier__max_depth':[6,9,11]}),
    ('randomforestclassifier',
     RandomForestClassifier(random_state = 1, criterion = 'gini'),
     {'randomforestclassifier__n_estimators':range(1,11)}),
    ('kneighborsclassifier',
     KNeighborsClassifier(metric = 'minkowski'),
     {'kneighborsclassifier__n_neighbors':[4,6,8]}),
    ('adaboostclassifier',
     AdaBoostClassifier(random_state = 1),
     {'adaboostclassifier__n_estimators':[70,80,90]}),
]

# Parallel lists consumed together (via zip) by the tuning loop.
classifiers = [estimator for _name, estimator, _grid in _candidate_models]
classifier_names = [name for name, _estimator, _grid in _candidate_models]
classifier_param_grid = [grid for _name, _estimator, grid in _candidate_models]

## 5.4 模型参数调优和评估

### 对分类器进行参数调优和评估，最后得到使用AdaBoostClassifier(n_estimators=70)效果最好

In [None]:
# Tune one classifier pipeline with GridSearchCV and evaluate it on the test split.
def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, score = 'accuracy'):
    """Grid-search ``pipeline`` on the training data and score it on the test data.

    Runs a 3-fold cross-validated grid search over ``param_grid``, prints the best
    parameters, the best cross-validation score and the test accuracy, and returns
    the test predictions together with the scores.

    Parameters
    ----------
    pipeline : estimator
        Estimator (here a scikit-learn Pipeline) to tune.
    train_x, train_y : array-like
        Training features and labels used for the grid search.
    test_x, test_y : array-like
        Held-out features and labels used for the final accuracy.
    param_grid : dict
        Parameter grid passed to GridSearchCV.
    score : str, default 'accuracy'
        Value passed as GridSearchCV's ``scoring`` argument. The previous default,
        'accuracy_score', is not a scorer name scikit-learn recognises, so calling
        the function without ``score`` failed.

    Returns
    -------
    dict
        'predict_y' (predictions for ``test_x``), 'accuracy_score' (test accuracy),
        'best_params' and 'best_score' (from the grid search).
    """
    gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, cv=3, scoring = score)
    # Find the best parameter combination and its cross-validation score.
    search = gridsearch.fit(train_x, train_y)
    print("GridSearch 最优参数：", search.best_params_)
    print("GridSearch 最优分数： %0.4lf" %search.best_score_)
    # Predict the test labels with the fitted search object.
    predict_y = search.predict(test_x)
    # Compute the test accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(test_y, predict_y)
    print(" 准确率 %0.4lf" %test_accuracy)
    return {
        'predict_y': predict_y,
        'accuracy_score': test_accuracy,
        'best_params': search.best_params_,
        'best_score': search.best_score_,
    }

In [None]:
# Tune and evaluate every candidate model, keeping each result keyed by model name
# (previously `result` was overwritten on every iteration, so only the last model survived).
results = {}
for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    # Header line so the scores printed by GridSearchCV_work can be attributed to a model.
    print('--- {0} ---'.format(model_name))
    # The pipeline contains only the classifier. The scaling and PCA steps below are
    # disabled, so each model is fit on the features exactly as prepared above.
    pipeline = Pipeline([
            #('scaler', StandardScaler()),
            #('pca',PCA),
            (model_name, model)
    ])
    result = GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, model_param_grid , score = 'accuracy')
    results[model_name] = result

# 6 预测客户是否流失

#### 现假定有一客户，其特征为非老年客户、有伴侣、无子女、无电话服务、DSL互联网服务、仅使用互联网备份服务、合同为月付费、无纸化账单、电子支票支付、入网时长1个月、月消费29.85、总消费29.85，尝试通过模型预测该客户是否会流失？

#### 即对应字段：
#### 'SeniorCitizen'=0, 'Partner'=1, 'Dependents'=0, 'PaperlessBilling'=1, 'MultipleLines_No'=0, 'MultipleLines_No phone service'=1, 'MultipleLines_Yes'=0,
#### 'InternetService_DSL'=1, 'InternetService_Fiber optic'=0, 'InternetService_No'=0, 'OnlineSecurity_No'=1, 'OnlineSecurity_Yes'=0, 'OnlineBackup_No'=0,
#### 'OnlineBackup_Yes'=1, 'DeviceProtection_No'=1, 'DeviceProtection_Yes'=0, 'TechSupport_No'=1, 'TechSupport_Yes'=0, 'StreamingTV_No'=1, 'StreamingTV_Yes'=0,
#### 'StreamingMovies_No'=1, 'StreamingMovies_Yes'=0, 'Contract_Month-to-month'=1, 'Contract_One year'=0, 'Contract_Two year'=0,
#### 'PaymentMethod_Bank transfer (automatic)'=0, 'PaymentMethod_Credit card (automatic)'=0, 'PaymentMethod_Electronic check'=1, 'PaymentMethod_Mailed check'=0,
#### 'tenure'=1, 'MonthlyCharges'=29.85, 'TotalCharges'=29.85

In [None]:
# Fit an AdaBoost classifier with 70 estimators on the training split;
# the fitted estimator is shown as the cell output.
AdaBC = AdaBoostClassifier(n_estimators=70, random_state=1)
AdaBC = AdaBC.fit(train_x, train_y)
AdaBC

In [None]:
# Take one customer's feature vector and ask the fitted model for its prediction
# (the cell previously only displayed the row and never called the model).
# NOTE(review): train_x[0:1, :] is the first row of the shuffled training split; confirm that it
# is the customer described above, or build the vector explicitly in the order given by `columns`.
sample_x = train_x[0:1, :]
print('Predicted Churn label:', AdaBC.predict(sample_x)[0])
print('Predicted class probabilities:', AdaBC.predict_proba(sample_x)[0])
# Show the feature values next to their column names.
pd.Series(sample_x[0], index=columns)

# 结论和建议
#### 根据以上分析，得到高流失率用户的特征：
#### 用户属性：老年用户，未婚用户，无亲属用户更容易流失； 服务属性：在网时长小于半年，有电话服务，光纤用户/光纤用户附加流媒体电视、电影服务，无互联网增值服务； 合同属性：签订的合同期较短，采用电子支票支付，是电子账单，月租费约70-110元的客户容易流失； 其它属性对用户流失影响较小，以上特征保持独立。 针对上述结论，从业务角度给出相应建议：

#### 根据预测模型，构建一个高流失率的用户列表。通过用户调研推出一个最小可行化产品功能，并邀请种子用户进行试用。 用户方面：针对老年用户、无亲属、无伴侣用户的特征推出定制服务如亲属套餐、温暖套餐等，一方面加强与其它用户关联度，另一方面对特定用户提供个性化服务。 服务方面：针对新注册用户，推送半年优惠如赠送消费券，以度过用户流失高峰期。针对光纤用户和附加流媒体电视、电影服务用户，重点在于提升网络体验、增值服务体验，一方面推动技术部门提升网络指标，另一方面对用户承诺免费网络升级和赠送电视、电影等包月服务以提升用户黏性。针对在线安全、在线备份、设备保护、技术支持等增值服务，应重点对用户进行推广介绍，如首月/半年免费体验。 合同方面：针对单月合同用户，建议推出年合同付费折扣活动，将月合同用户转化为年合同用户，提高用户在网时长，以达到更高的用户留存。 针对采用电子支票支付用户，建议定向推送其它支付方式的优惠券，引导用户改变支付方式。