In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing relevant libraries

In [None]:
!pip3 install pycaret[full]
!pip3 install scikit-learn==0.23.2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from pycaret.classification import * 

In [None]:
# Load the two pre-split CSV files of the dataset: the "20" file becomes the
# test frame and the "80" file becomes the frame used for exploration/training.
test_df = pd.read_csv("/kaggle/input/telecom-churn-datasets/churn-bigml-20.csv")
customer_df = pd.read_csv("/kaggle/input/telecom-churn-datasets/churn-bigml-80.csv")

In [None]:
# Peek at the first rows of the training data.
customer_df.head()

# 2. Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts of the training data.
customer_df.info()

In [None]:
# Summary statistics of the training data.
customer_df.describe()

## 2.1 Separating categorical and numerical variables

In [None]:
# (rows, columns) of the training data.
customer_df.shape

In [None]:
# Column labels, used to sort the attributes into groups by hand.
customer_df.columns

In [None]:
# Show the first rows again as a reference for the manual column split.
customer_df.head()

Some bit of manual sorting has to be done here as certain categories are actually represented by numbers

In [None]:
# Manual assignment of every attribute to a group.  "Area code" is stored as a
# number but is treated as a category.
numeric_columns = [
    "Account length", "Number vmail messages",
    "Total day minutes", "Total day calls", "Total day charge",
    "Total eve minutes", "Total eve calls", "Total eve charge",
    "Total night minutes", "Total night calls", "Total night charge",
    "Total intl minutes", "Total intl calls", "Total intl charge",
    "Customer service calls",
]
# "Churn" is deliberately part of the categorical group as well as `target`:
# the encoded categorical frame is what later carries the label column.
categorical_columns = ["State", "Area code", "International plan", "Voice mail plan", "Churn"]
target = ["Churn"]

# Take explicit copies: these frames are modified in later cells, and modifying
# a slice of `customer_df` / `test_df` triggers SettingWithCopyWarning.
numeric_df = customer_df[numeric_columns].copy()
test_numeric_df = test_df[numeric_columns].copy()

categorical_df = customer_df[categorical_columns].copy()
test_categorical_df = test_df[categorical_columns].copy()

target_df = customer_df[target].copy()
test_target_df = test_df[target].copy()

## 2.2 Distribution of target variable

In [None]:
# Number of rows per class of the target variable.
customer_df["Churn"].value_counts()

We see quite a large disparity between the classes of our target variable. This can be an issue later on. Common ways to mitigate this issue are to look at different evaluation metrics, or to perform some form of oversampling/undersampling of the minority and majority class respectively.

In [None]:
# Sanity check: the manual split must cover exactly the columns of the raw frame.
# "Churn" appears in both `categorical_columns` and `target`, so the groups are
# combined as a set; summing the list lengths would count it twice.
assigned_columns = set(numeric_columns) | set(categorical_columns) | set(target)
assert assigned_columns == set(customer_df.columns), (
    "Column split does not match the raw frame: "
    f"{sorted(assigned_columns.symmetric_difference(customer_df.columns))}"
)
len(assigned_columns)

## 2.3 Correlation of numeric variables

In [None]:
# Annotated correlation heatmap of the numeric attributes; the colour scale is
# pinned to the full [-1, 1] range.
plt.figure(figsize=(16, 6))
correlation_matrix = numeric_df.corr()
heatmap = sns.heatmap(correlation_matrix, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation of numeric attributes', pad=12, fontdict={'fontsize': 12});

We can see that certain pairs of attributes have a correlation of 1: total minutes and total charge are perfectly correlated, regardless of whether they refer to day, evening, night, or international calls.<br> We can immediately drop one attribute from each of these pairs.

## 2.4 Distribution of numeric variables

In [None]:
# Print a caption, then draw one histogram per numeric column.
print ("Histogram for Numerical Variables")
plots = numeric_df.hist(figsize=(15,15))

In [None]:
# Pairwise scatter plots of all numeric attributes.
sns.pairplot(numeric_df)
plt.show()

## 2.5 Distribution of categorical variables

In [None]:
# (rows, columns) of the categorical frame.
categorical_df.shape

In [None]:
# First rows of the categorical frame.
categorical_df.head()

In [None]:
# Number of distinct values in the "State" column.
len(categorical_df["State"].value_counts())

In [None]:
# Count plots for the three low-cardinality categorical attributes, stacked
# vertically and drawn on a common x-range so bar lengths are comparable.
temp_df = categorical_df[["Area code", "International plan", "Voice mail plan"]].copy()
temp_df.columns = temp_df.columns.str.replace(' ', '')
sns.set(style="darkgrid")

# Size the figure that actually holds the axes.  A separate `plt.figure(...)`
# call would create a second, empty figure and leave this one at default size.
fig, ax = plt.subplots(3, 1, figsize=(8, 6))

# Largest single-category count across the three attributes -> shared x-limit.
max_count = max(temp_df[column].value_counts().max() for column in temp_df.columns)

# NOTE(review): `.iloc[:2]` plots only the two most frequent levels of each
# attribute, so any further level (e.g. a third area code) is not shown.
Area_code = sns.countplot(
    y=temp_df['Areacode'], ax=ax[0],
    order=temp_df.Areacode.value_counts().iloc[:2].index,
)
International_plan = sns.countplot(
    y=temp_df['Internationalplan'], ax=ax[1],
    order=temp_df.Internationalplan.value_counts().iloc[:2].index,
)
Voice_mail_plan = sns.countplot(
    y=temp_df['Voicemailplan'], ax=ax[2],
    order=temp_df.Voicemailplan.value_counts().iloc[:2].index,
)

for axis in ax:
    axis.set_xlim(0, max_count)

# Only the bottom panel keeps its x tick labels; the axis is shared visually.
Area_code.set(xticklabels=[])
International_plan.set(xticklabels=[]);

# 3. Preprocessing

## 3.1 Removing multicollinearity

In [None]:
# The "minutes" columns are dropped; the matching "charge" columns are kept.
redundant_minute_columns = [
    "Total day minutes", "Total eve minutes", "Total night minutes", "Total intl minutes",
]

# Rebind instead of dropping in place: an in-place drop on a frame sliced from
# another frame raises SettingWithCopyWarning, and `errors="ignore"` lets this
# cell be re-run without a KeyError once the columns are already gone.
numeric_df = numeric_df.drop(columns=redundant_minute_columns, errors="ignore")
test_numeric_df = test_numeric_df.drop(columns=redundant_minute_columns, errors="ignore")

In [None]:
# Redraw the annotated correlation heatmap for the current numeric frame,
# again with the colour scale fixed to [-1, 1].
plt.figure(figsize=(16, 6))
remaining_correlations = numeric_df.corr()
heatmap = sns.heatmap(remaining_correlations, annot=True, vmax=1, vmin=-1)
heatmap.set_title('Correlation of numeric attributes', pad=12, fontdict={'fontsize': 12});

In [None]:
# First rows of the training numeric frame after the column drop.
numeric_df.head()

## 3.2 Scaling numeric attributes

In [None]:
# Standardise the numeric attributes.  The scaler is fitted on the training
# frame only; the test frame is transformed with those same fitted statistics.
scaler = StandardScaler()

train_scaled = scaler.fit_transform(numeric_df)
numeric_df = pd.DataFrame(train_scaled, columns=numeric_df.columns)

test_scaled = scaler.transform(test_numeric_df)
test_numeric_df = pd.DataFrame(test_scaled, columns=test_numeric_df.columns)

In [None]:
# First rows of the training numeric frame after scaling.
numeric_df.head()

## 3.3 One-hot encoding nominal attributes

In [None]:
# First rows of the (still unencoded) categorical frame.
categorical_df.head()

In [None]:
# Names of the categorical columns as a plain list.
list(categorical_df.columns)

In [None]:
# Work on deep copies so the encoding steps never modify the original
# categorical frames.
new_category_df = categorical_df.copy(deep=True)
new_test_category_df = test_categorical_df.copy(deep=True)

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview is placed at the end.
new_category_df.head()

In [None]:
def _one_hot_nominal(frame, categories):
    """Return a copy of ``frame`` with each listed column replaced by dummies.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains every column named in ``categories``.
    categories : list of str
        Nominal columns to encode.  Values are cast to ``str`` first, so the
        resulting dummy columns are named after the category values themselves.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index in which every encoded column has been
        removed and its dummy columns appended on the right, in the order given
        by ``categories``.
    """
    encoded = frame.reset_index(drop=True)
    for category in categories:
        dummy_columns = pd.get_dummies(encoded[category].astype(str))
        encoded = pd.concat([encoded.drop(columns=[category]), dummy_columns], axis=1)
    return encoded


nominal_categories = ['Area code', 'State']

# Encode from the untouched source frames, so re-running this cell does not
# fail on columns that a previous run already removed.
new_category_df = _one_hot_nominal(categorical_df, nominal_categories)
new_test_category_df = _one_hot_nominal(test_categorical_df, nominal_categories)

# Train and test are encoded independently, so a category value that is absent
# from one of them would give the two frames different columns.  Align the test
# frame to the training layout: missing dummies are added as all-zero columns
# and dummies unknown to the training data are discarded.
new_test_category_df = new_test_category_df.reindex(
    columns=new_category_df.columns, fill_value=0
)

In [None]:
# First rows of the one-hot encoded training categorical frame.
new_category_df.head()

In [None]:
# First rows of the one-hot encoded test categorical frame.
new_test_category_df.head()

## 3.4 Label encoding binary values

In [None]:
# Per-column value mappings that turn the two plan flags and the churn label
# into 0/1 integers.
intl_plan = {"International plan": {"No": 0, "Yes": 1}}
vm_plan = {"Voice mail plan": {"No": 0, "Yes": 1}}
churn_cleanup = {"Churn": {False: 0, True: 1}}

# Apply every mapping to the training frame and to the test frame alike.
for column_mapping in (intl_plan, vm_plan, churn_cleanup):
    new_category_df = new_category_df.replace(column_mapping)
    new_test_category_df = new_test_category_df.replace(column_mapping)

In [None]:
# Display the fully encoded training categorical frame.
new_category_df

In [None]:
# Display the fully encoded test categorical frame.
new_test_category_df

## 3.5 Combining numeric and categorical columns

In [None]:
# Give both encoded frames a fresh 0..n-1 index before they are concatenated
# column-wise with the numeric frames (concat aligns rows on the index).
new_category_df = new_category_df.reset_index(drop=True)
new_test_category_df = new_test_category_df.reset_index(drop=True)

In [None]:
# Join numeric and encoded categorical columns side by side (axis=1), once for
# the training data and once for the test data.
combined_df = pd.concat([numeric_df, new_category_df], axis=1)
combined_test_df = pd.concat([test_numeric_df, new_test_category_df], axis=1)

In [None]:
# First rows of the combined training frame.
combined_df.head()

In [None]:
# First rows of the combined test frame.
combined_test_df.head()

# 4. Dimensionality reduction using PCA

## 4.1 Creating Pipeline

In [None]:
# Template pipeline: PCA followed by logistic regression, both with defaults.
# NOTE(review): `steps` is reassigned and `model` is rebound by the
# `for name, model in models.items()` loops in later cells before this
# instance is ever fitted, so it appears to be unused.
steps = [('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)

## 4.2 Benchmarking with Logistic Regression

In [None]:
# One PCA -> logistic-regression pipeline per candidate number of components
# (1 to 59), keyed by the component count as a string.
# `random_state` is fixed because PCA's default `svd_solver='auto'` may select
# the randomized SVD solver, which would make the scores change from run to run.
models = {}
for i in range(1, 60):
    steps = [
        ('pca', PCA(n_components=i, random_state=42)),
        ('lr', LogisticRegression()),
    ]
    models[str(i)] = Pipeline(steps=steps)

In [None]:
# Feature names: every column of the combined frame except the "Churn" label.
x_columns = combined_df.columns.tolist()
del x_columns[x_columns.index('Churn')]

In [None]:
# Fit every pipeline on the training data, score it on the test data and
# collect the F1 score per component count.
results, names = [], []
train_features, train_labels = combined_df[x_columns], combined_df["Churn"]
test_features, test_labels = combined_test_df[x_columns], combined_test_df["Churn"]

for name, model in models.items():
    model.fit(train_features, train_labels)
    y_pred = model.predict(test_features)
    f1 = f1_score(test_labels, y_pred)
    accuracy = accuracy_score(test_labels, y_pred)
    names.append(name)
    results.append(f1)
    print(f"{name} components: F1 score ->{f1}, accuracy score ->{accuracy}")

We see that the F1 score is really poor due to the class imbalance in our dataset.

# 5. SMOTE for class balancing

In [None]:
# Class counts of the training label before oversampling.
combined_df["Churn"].value_counts()

In [None]:
# Oversample the training data with SMOTE.  Only the training frame is
# resampled; the test frame is left untouched.
# `random_state` is fixed so the synthetic samples, and every score computed
# from them, are the same on each run.
oversample = SMOTE(random_state=42)
x, y = oversample.fit_resample(combined_df[x_columns], combined_df["Churn"])

In [None]:
# Display the resampled feature frame.
x

In [None]:
# Class counts of the label after oversampling.
y.value_counts()

# 6. Dimensionality reduction with PCA

## 6.1 Benchmarking with Logistic Regression

In [None]:
# Refit every pipeline on the resampled training data and score it on the
# untouched test data; the F1 score is stored per component count.
result_dict = {}
test_features = combined_test_df[x_columns]
test_labels = combined_test_df["Churn"]

for name, model in models.items():
    model.fit(x, y)
    y_pred = model.predict(test_features)
    f1 = f1_score(test_labels, y_pred)
    accuracy = accuracy_score(test_labels, y_pred)
    recall = recall_score(test_labels, y_pred)
    result_dict[name] = f1
    print(f"{name} components: F1 score ->{f1}, accuracy score ->{accuracy}, recall score-> {recall}")

In [None]:
# Component counts ordered from the best to the worst F1 score.
ranked_pairs = sorted(result_dict.items(), key=lambda name_and_f1: name_and_f1[1], reverse=True)
results_ranking = dict(ranked_pairs)

In [None]:
# Display the ranking (best F1 score first).
results_ranking

We see that the optimal number of components is 21.

In [None]:
# Project the resampled training features onto 21 principal components.
# `random_state` is fixed because PCA's default `svd_solver='auto'` may select
# the randomized SVD solver, whose output otherwise varies between runs.
pca = PCA(n_components=21, random_state=42)
x_train_new = pca.fit_transform(x[x_columns])
x_train_new

## 6.2 Explained variance

In [None]:
# Total share of variance explained by the retained components.
sum(pca.explained_variance_ratio_)

In [None]:
# Number of rows in the projected training data.
len(x_train_new)

With 21 components, we have 95% of explained variance. Cool!

In [None]:
# Project the test features with the PCA fitted on the training data
# (transform only; the PCA is not refitted here).
X_test_new = pca.transform(combined_test_df[x_columns])

In [None]:
# Display the projected test data.
X_test_new

In [None]:
# Number of rows in the projected test data.
len(X_test_new)

# 7. Comparing models with PyCaret

In [None]:
# Wrap the projected arrays in DataFrames for PyCaret; columns receive the
# default integer labels.
caret_train_df = pd.DataFrame(x_train_new)
caret_test_df = pd.DataFrame(X_test_new)

In [None]:
# Attach the label column: the resampled labels `y` for training and the
# original test labels for evaluation.
# NOTE(review): column assignment aligns on the index; this assumes `y` and
# `combined_test_df` carry the same default integer index as the new frames.
caret_train_df["Churn"] = y
caret_test_df["Churn"] = combined_test_df["Churn"]

In [None]:
# Display the training frame that will be handed to PyCaret.
caret_train_df

In [None]:
# Labels of the component columns, i.e. every column except "Churn".
all_caret_columns = caret_train_df.columns.tolist()
churn_position = all_caret_columns.index("Churn")
caret_x_columns = all_caret_columns[:churn_position] + all_caret_columns[churn_position + 1:]

In [None]:
# Give every component column a string name ("column<label>") in both frames.
component_names = {label: f"column{label}" for label in caret_x_columns}
caret_train_df.rename(columns=component_names, inplace=True)
caret_test_df.rename(columns=component_names, inplace=True)

In [None]:
# Renamed feature columns: everything in the training frame except "Churn".
new_caret_train_columns = caret_train_df.columns.tolist()
del new_caret_train_columns[new_caret_train_columns.index("Churn")]

In [None]:
# Cast all feature columns of both frames to float in one step each, then
# display the training frame.
caret_train_df[new_caret_train_columns] = caret_train_df[new_caret_train_columns].astype(float)
caret_test_df[new_caret_train_columns] = caret_test_df[new_caret_train_columns].astype(float)
caret_train_df

In [None]:
# Display the test frame prepared for PyCaret.
caret_test_df

In [None]:
# Initialise the PyCaret experiment on the PCA-projected, resampled training data.
# - preprocess=False: the data was already scaled, encoded and projected above.
# - silent=True: skip the interactive dtype confirmation prompt.
# - session_id: seeds the experiment so the internal train/hold-out split and
#   the model comparison are reproducible between runs.
classification_setup = setup(
    data=caret_train_df,
    target="Churn",
    test_data=None,
    preprocess=False,
    silent=True,
    session_id=42,
)

In [None]:
# Train and rank the candidate classifiers available in the PyCaret experiment.
compare_models()

# 8. LightGBM

## 8.1 Creating model

In [None]:
# Create a LightGBM classifier inside the PyCaret experiment.
lgbm_model = create_model('lightgbm')

## 8.2 Tuning model

In [None]:
# Hyper-parameter search starting from the base LightGBM model.
tuned_lgbm_model = tune_model(lgbm_model)

# `finalize_model` returns a new estimator refitted on the complete setup data
# (train + hold-out); it does not modify its argument.  Rebind the name so the
# prediction cell evaluates the finalized model instead of discarding it.
tuned_lgbm_model = finalize_model(tuned_lgbm_model)
tuned_lgbm_model

# 9. Evaluating on unseen data

## 9.1 Getting predictions

In [None]:
# Score the held-out test frame with `tuned_lgbm_model` and display the result.
predictions = predict_model(tuned_lgbm_model, data = caret_test_df)
predictions

In [None]:
# Split the prediction frame into predicted and true labels.
# NOTE(review): assumes `predict_model` stores its prediction in a "Label"
# column, which depends on the installed PyCaret version; confirm.
y_pred = predictions["Label"]
y_test = predictions["Churn"]

In [None]:
# Evaluation metrics of the final model on the test labels.
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

## 9.2 F1 score

In [None]:
# Report the F1 score on the test set.
print(f"F1 score: {f1}")

## 9.3 Accuracy

In [None]:
# Report the accuracy on the test set.
print(f"Accuracy score: {accuracy}")