# Chapter 8 - Ex3: Customer Churn Analysis
- Cho dữ liệu WA_Fn-UseC_-Telco-Customer-Churn.csv chứa thông tin khách hàng. Bộ dữ liệu này được dùng để xây dựng mô hình dự đoán một khách hàng kết thúc mối quan hệ hay hủy/không gia hạn (churn) với doanh nghiệp hay không?
- Bộ dữ liệu gồm 7043 mẫu và 21 thuộc tính 
### Yêu cầu:
- Đọc dữ liệu WA_Fn-UseC_-Telco-Customer-Churn.csv, tiền xử lý dữ liệu. 
- Chia dữ liệu thành 2 bộ là train và test theo tỷ lệ 80-20.
- Xem xét tính cân bằng giữa hai loại mẫu ở train. Trực quan hóa. Nhận xét. 
- Nếu 2 loại mẫu ở train này không cân bằng, hãy chọn một phương pháp cân bằng dữ liệu và thực hiện. Trực quan hóa kết quả.

- Tham khảo: [link](https://www.analyticsvidhya.com/blog/2017/03/imbalanced-classification-problem/)
- Và [link](https://www.kaggle.com/blastchar/telco-customer-churn)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("input_data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# Đếm theo loại: hiếm, phổ biến
occ = data.Churn.value_counts()
occ

No     5174
Yes    1869
Name: Churn, dtype: int64

In [6]:
# Print the ratio of fraud cases
print(occ / len(data.index))

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64


In [7]:
X = data.drop(["customerID", "Churn"], axis=1)
y = data.Churn

In [8]:
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [9]:
X.MonthlyCharges = X.MonthlyCharges.astype('float')

In [10]:
X.TotalCharges = pd.to_numeric(X.TotalCharges, errors='coerce')

In [11]:
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [12]:
# Chuẩn hóa y
# No: 0, Yes: 1
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_new = label_encoder.fit_transform(y)
y_new[:5]

array([0, 0, 1, 0, 1])

In [13]:
# Chuẩn hóa X; các cột là Category
# Categorical boolean mask
categorical_feature_mask = X.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()
categorical_cols

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [14]:
X_new = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)

In [15]:
X_new.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


In [16]:
# Chia dữ liệu thành 2 bộ theo tỷ lệ 80:20
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, 
                                                    test_size = 0.2)

In [18]:
#X_train.info()

In [19]:
from collections import Counter
sorted(Counter(y_train).items())

[(0, 4159), (1, 1475)]

In [20]:
# Vì lượng dữ liệu của mỗi lớp đều khá nhiều và theo tỷ lệ 3:1
# Có thể cho undersampling hoặc oversampling

In [21]:
# undersampling
from imblearn.under_sampling import RandomUnderSampler

In [22]:
X_train = X_train.fillna(X_train.mean())
X_train.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3332,0,1,46.3,46.3,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
5388,0,1,19.9,19.9,1,1,1,1,0,0,...,1,0,1,0,0,0,0,0,0,1
5743,1,38,20.2,735.9,0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,0,1
2528,0,8,80.1,679.3,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
453,0,28,81.05,2227.1,0,1,1,1,0,0,...,0,0,0,1,0,0,1,0,1,0


In [23]:
rs = RandomUnderSampler()
X_train_resampled, y_train_resampled = rs.fit_resample(X_train, y_train)

In [24]:
sorted(Counter(y_train_resampled).items())

[(0, 1475), (1, 1475)]

In [25]:
# Oversampling
from imblearn.over_sampling import SMOTE

In [26]:
X_train_S, y_train_S = SMOTE().fit_resample(X_train, y_train)

In [27]:
sorted(Counter(y_train_S).items())

[(0, 4159), (1, 4159)]