# Intro:

**Final Project:**

    Telecom operator Interconnect would like to forecast customer churn. If a customer is predicted to churn, the company
    would like to offer promotional codes and special plan options in order to retain them.
    
    services provided:
    -Landline communication. The telephone can be connected to several lines simultaneously.
    -Internet. The network can be set up via a telephone line (DSL, digital subscriber line) or through a fiber optic cable.
       
       
    Some other services the company provides include:
        -Internet security: antivirus software (DeviceProtection) and a malicious website blocker (OnlineSecurity)
        -A dedicated technical support line (TechSupport)
        -Cloud file storage and data backup (OnlineBackup)
        -TV streaming (StreamingTV) and a movie directory (StreamingMovies)
    
    The clients can choose either a monthly payment or sign a 1- or 2-year contract.
    They can use various payment methods and receive an electronic invoice after a transaction.
    
 **Data Description:**
 
     4 files were obtained from different sources to provide customer data
         -contract.csv — contract information Valid as of February 1, 2020
         -personal.csv — the client's personal data
         -internet.csv — information about Internet services
         -phone.csv — information about telephone services
        In each file, the column customerID contains a unique code assigned to each client.

**Goal:** Develop a model to predict user churn

# Data Preprocessing

In [2]:
#import libraries that I may need for project
import pandas as pd
import numpy as np
from datetime import datetime
from statsmodels.tsa.seasonal import seasonal_decompose

#visualization libraries
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots

#sklearn
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, accuracy_score, ConfusionMatrixDisplay, auc, roc_curve
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.utils import shuffle, resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample


#gradient boosting
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#turn off warnings
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'pandas'

In [2]:
#read the four source tables (contract, personal, internet, phone) from the working directory
df_contract, df_personal, df_internet, df_phone = (
    pd.read_csv(csv_path)
    for csv_path in ("contract.csv", "personal.csv", "internet.csv", "phone.csv")
)

In [None]:
#number of contracts per start date, largest count first
df_contract['BeginDate'].value_counts().sort_values(ascending=False)

In [None]:
#column dtypes and non-null counts of the internet table
df_internet.info()

In [None]:
#preview the first rows of the internet table
df_internet.head()

In [None]:
#occurrences of each customerID in the personal table; a count above 1 would mean a repeated ID
df_personal['customerID'].value_counts()

In [None]:
#preview the first rows of the personal table
df_personal.head()

In [None]:
#column dtypes and non-null counts of the phone table
df_phone.info()

In [None]:
#preview the first rows of the phone table
df_phone.head()

In [None]:
#combine the four tables on customerID in a single chain:
#  contract + personal -> default (inner) join
#  internet, phone     -> outer joins, so a customer missing from either table
#                         gets NaN in that table's columns
df = (
    df_contract
    .merge(df_personal, on='customerID')
    .merge(df_internet, on='customerID', how='outer')
    .merge(df_phone, on='customerID', how='outer')
)

df.info()

After merging all of the dataframes together, the new dataframe has missing values in the service columns for customers who did not sign up for those services.
For the moment, I will fill the missing values with "Not signed up for other services".

In [None]:
#replace every NaN left by the merges (in any column) with one placeholder label
df = df.fillna(value='Not signed up for other services')

df.info()

In [None]:
#rename columns from the source CamelCase names to snake_case
column_names = {
    'customerID': 'customer_id',
    'BeginDate': 'begin_date',
    'EndDate': 'end_date',
    'Type': 'type',
    'PaperlessBilling': 'paperless_billing',
    'PaymentMethod': 'payment_method',
    'MonthlyCharges': 'monthly_charges',
    'TotalCharges': 'total_charges',
    'SeniorCitizen': 'senior_citizen',
    'Partner': 'partner',
    'Dependents': 'dependents',
    'InternetService': 'internet_service',
    'OnlineSecurity': 'online_security',
    'OnlineBackup': 'online_backup',
    'DeviceProtection': 'device_protection',
    'TechSupport': 'tech_support',
    'StreamingTV': 'streaming_tv',
    'StreamingMovies': 'streaming_movies',
    'MultipleLines': 'multiple_lines',
}
df = df.rename(columns=column_names)

df.info()

In [None]:
#change column data types

#convert 'begin_date' column to datetime
df['begin_date'] = pd.to_datetime(df['begin_date'])

#convert 'total_charges' column to float64
#some rows hold a single blank (' ') instead of a number; show how many (11 when this was written)
display((df['total_charges'] == ' ').sum())
#assign the result back rather than calling replace(..., inplace=True) on the selected column:
#the chained in-place form is deprecated and leaves df unchanged under pandas Copy-on-Write
df['total_charges'] = df['total_charges'].replace(" ", np.nan)
#drop the blank rows (far below 5% of the data); .copy() makes df an independent frame
#so the column assignment below does not write into a view of the old one
df = df.dropna(subset=['total_charges']).copy()
df['total_charges'] = df['total_charges'].astype('float64') #convert column to float64

df.info()

In [14]:
#the customer identifier carries no predictive signal, so it is removed before modelling
df = df.drop(columns='customer_id')

In [None]:
#preview the first 25 rows after cleaning
df.head(25)

# EDA

In [None]:
#build the target from 'end_date': rows whose end_date is the literal 'No' get 1, every other row gets 0
#NOTE: despite the column name, 1 means the customer has NOT churned and 0 means the customer churned
df['customer_churn'] = df['end_date'].eq('No').astype(int)

#'end_date' is now encoded in the target, so remove it
df = df.drop(columns=['end_date'])

df.head(20)

In [None]:
#visualize the churn of male-female customers and senior citizens
#NOTE: customer_churn is 1 for customers who have not churned, so with seaborn's default
#(mean) estimator each bar is the share of that group still active - a lower bar means more churn
g = sns.catplot(data=df, x="gender", y="customer_churn", hue='senior_citizen', kind="bar")

g.fig.suptitle('Gender and Age Comparison of Customers Churned')
plt.show()

From the plot, it’s evident that while the male-to-female ratio of customers who churned is roughly equal, senior citizens represent a larger proportion of those who churned. We'll explore this further to determine whether senior citizens form a significant portion of the overall customer base.

In [None]:
#visualize distribution of churned customers across internet services
#count of each customer_churn value within every internet_service category
#(stored under a descriptive name so the built-in int() is not shadowed)
internet_churn_counts = df.groupby('internet_service')['customer_churn'].value_counts()
display(internet_churn_counts)

sns.catplot(data=df, x='internet_service', hue='customer_churn', kind='count')
plt.title('Count of Customers Churned Across Internet Services')
plt.show()

Customers who subscribed to fiber optic internet service not only exhibited the highest percentage of churn within that category but also accounted for the largest number of churned customers across all three internet service types.

In [None]:

#checking for a class imbalance
classes = len(df[df['customer_churn'] == 1]) / len(df['customer_churn']) * 100
print('Percentage of customers in the dataframe that did not churn:', classes)

#value_counts() orders by frequency, so fix the order explicitly (1 = no churn, 0 = churn)
#to guarantee every wedge is paired with the right label whichever class is larger
churn_counts = df['customer_churn'].value_counts().reindex([1, 0], fill_value=0)
labels = ['No churn', 'Churn']
fig, ax = plt.subplots()
ax.pie(churn_counts, labels=labels, autopct='%1.1f%%')
plt.title('Percentage of Customers Churned')
plt.show()

The chart above highlights a class imbalance in the target column, customer_churn. Specifically, the number of loyal customers significantly exceeds those who churned (represented by 0). This imbalance poses a challenge, as any model trained on such data may struggle to accurately predict churn for the minority class.

Solution: Upsampling. This technique repeats the existing rows of the minority class—in this case, customers who have churned—in the training set to balance the classes and improve model performance.

In [None]:
#countplot of senior citizens
#the seaborn style set here stays active for the plots that follow
sns.set_style('whitegrid')
#one bar per distinct senior_citizen value
sns.countplot(x='senior_citizen', data=df)
plt.title('Count of Senior Citizens Amongst All Customers')
plt.show()

In [None]:
#share of senior citizens among all customers
#the labels are matched to wedges by position, i.e. they rely on value_counts()
#returning the non-senior group first (largest count first)
senior_counts = df['senior_citizen'].value_counts()
labels = ['Non Senior Citizen', 'Senior Citizen']

fig, ax = plt.subplots()
ax.pie(senior_counts, autopct='%1.1f%%', labels=labels)
plt.title('Percentage of Senior Citizens in Data')
plt.show()

The dataset shows that senior citizen customers number slightly over 1,100, comprising 16.2% of the total data.

# Feature Engineering

In [None]:
#column overview before deriving the date features
df.info()

In [23]:
#derive three calendar features from 'begin_date' (year, month, day of week),
#then discard the raw datetime column
df = df.assign(
    begin_year=df['begin_date'].dt.year,
    begin_month=df['begin_date'].dt.month,
    begin_day_of_week=df['begin_date'].dt.dayofweek,
).drop(columns=['begin_date'])

In [None]:
#confirm the three begin_* columns were added and begin_date was removed
df.info()

In [None]:
# Relationship between 'begin_year' and 'customer_churn':
# one row per (begin_year, customer_churn) pair holding the number of customers in it
year_counts = df.groupby('begin_year')['customer_churn'].value_counts()
# naming the counts 'count' keeps them from clashing with the 'customer_churn' index level
year_df = year_counts.rename('count').reset_index()

# Show the table and its structure
display(year_df)
display(year_df.info())

# One line per customer_churn value across contract start years
sns.relplot(kind='line', data=year_df, x='begin_year', y='count', hue='customer_churn')
plt.title('Customer Churn over the Years')
plt.show()

Starting in 2013, there is a noticeable increase in the total number of customers leaving Interconnect based on their start date. This trend may be linked to a potential service change in 2018 or 2019, as most current customers are longer-term subscribers. The start dates of 2018 and 2019 recorded the highest number of churned customers, with a significant portion of total churn occurring among customers who began their contracts in 2019.

The end date was not a key factor in this analysis, as the dataset included only four end dates for customers who churned, each occurring with nearly equal frequency.

In [None]:
# Relationship between 'begin_month' and 'customer_churn':
# one row per (begin_month, customer_churn) pair holding the number of customers in it
month_counts = df.groupby('begin_month')['customer_churn'].value_counts()
# naming the counts 'count' keeps them from clashing with the 'customer_churn' index level
month_df = month_counts.rename('count').reset_index()

# Show the table and its structure
display(month_df)
display(month_df.info())

# One line per customer_churn value across contract start months
sns.relplot(kind='line', data=month_df, x='begin_month', y='count', hue='customer_churn')
plt.title('Customer Churn over the months')
plt.show()


A significant number of customers who churned had started their contracts in the last four months of the year, from September through December.

In [None]:
# Relationship between 'begin_day_of_week' and 'customer_churn':
# one row per (begin_day_of_week, customer_churn) pair holding the number of customers in it
day_counts = df.groupby('begin_day_of_week')['customer_churn'].value_counts()
# naming the counts 'count' keeps them from clashing with the 'customer_churn' index level
day_df = day_counts.rename('count').reset_index()

# Show the table and its structure
display(day_df)
display(day_df.info())

# One line per customer_churn value across contract start weekdays
sns.relplot(kind='line', data=day_df, x='begin_day_of_week', y='count', hue='customer_churn')
plt.title('Customer Churn by Days of Week')
plt.show()


The majority of customers who are no longer under contract with Interconnect began their service on Thursdays and Saturdays, during the months of September to December, and in the years 2018 and 2019.

In [None]:
#Analyzing additional features on a 2x3 grid of panels
fig = plt.figure(figsize=(15, 10))

#top row: number of customers in each category of three add-on services
ax1 = fig.add_subplot(2, 3, 1)
sns.countplot(data=df, x='online_security', ax=ax1)
ax1.set_xlabel('Customers Registered for Online Security')

ax2 = fig.add_subplot(2, 3, 2)
ax2.set_title('Additional Features')  #the title sits on the middle panel of the top row
sns.countplot(data=df, x='online_backup', ax=ax2)
ax2.set_xlabel('Customers Registered for Online Backup')

ax3 = fig.add_subplot(2, 3, 3)
sns.countplot(data=df, x='device_protection', ax=ax3)
ax3.set_xlabel('Customers Registered for Device Protection')

#bottom row: charge distributions by contract start month / year
ax4 = fig.add_subplot(2, 3, 4)
sns.boxplot(data=df, x='begin_month', y='total_charges', ax=ax4)
ax4.set_xlabel('Total Charges by Beginning Month')

ax5 = fig.add_subplot(2, 3, 5)
sns.boxplot(data=df, x='begin_month', y='monthly_charges', ax=ax5)
ax5.set_xlabel('Monthly Charges by Beginning Month')

ax6 = fig.add_subplot(2, 3, 6)
sns.boxplot(data=df, x='begin_year', y='monthly_charges', ax=ax6)
ax6.set_xlabel('Monthly Charges by Beginning Year')

plt.show()

While total charges decreased for customers signing up later in the calendar year, median monthly charges remained relatively consistent across all 12 months, except for a noticeable dip in January sign-ups. This could be attributed to discounts offered by the company at the start of the year. Additionally, median monthly charges by contract start year showed little variation, apart from a significant price drop in 2020. This decline might be due to the decrease in customers following the high churn rates in 2018 and 2019 or a strategic decision by the company to lower prices in 2020 to attract new customers in response to the churn.

# Encoding

In [None]:
#one hot encode the 'payment_method', 'paperless_billing', and 'type' columns.
encoded_prefixes = ('payment_method_', 'paperless_billing_', 'type_')
df = pd.get_dummies(df, columns=['payment_method', 'paperless_billing', 'type'])

#cast the generated indicator columns (any column whose name contains one of the prefixes) to bool
dummy_columns = [
    col for col in df.columns
    if any(prefix in col for prefix in encoded_prefixes)
]
df[dummy_columns] = df[dummy_columns].astype(bool)

#check the updated info
df.info()

In [None]:
#label encoding: replace each category of the listed columns with an integer code
le = LabelEncoder()
categ = [
    'gender', 'senior_citizen', 'partner', 'dependents',
    'internet_service', 'online_security', 'online_backup', 'device_protection',
    'tech_support', 'streaming_tv', 'streaming_movies', 'multiple_lines',
]

#the single encoder is re-fitted for every column, so it ends up fitted on the last one only
for column in categ:
    df[column] = le.fit_transform(df[column])

df.sample(30)

# Data Split

In [31]:
#separate the target column from the model inputs
target = df['customer_churn']
features = df.drop(columns=['customer_churn'])

In [None]:
#splitting data into training, validation, and test sets at a ratio of 60:20:20

#step 1: hold out 20% of all rows as the test set (stratified on the target)
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=12345,
    stratify=target,
)

#step 2: take 25% of the remaining 80% (= 20% of all rows) as the validation set
features_train, features_valid, target_train, target_valid = train_test_split(
    features_train,
    target_train,
    test_size=0.25,
    random_state=12345,
    stratify=target_train,
)

for part in (features_train, target_train, features_valid, features_test):
    print(part.shape)

## Feature scale

In [None]:
#columns rescaled by scaling()
numeric = ['monthly_charges', 'total_charges', 'begin_year','begin_month', 'begin_day_of_week']

def scaling(features_train, features_valid, features_test):
    """Min-max scale the `numeric` columns of the three feature sets.

    The scaler is fitted on the training features only and then applied to
    all three frames. The frames are modified in place and also returned.

    Returns:
        tuple: (features_train, features_valid, features_test)
    """
    scaler = MinMaxScaler().fit(features_train[numeric])
    for frame in (features_train, features_valid, features_test):
        frame[numeric] = scaler.transform(frame[numeric])
    return features_train, features_valid, features_test

#scaling() overwrites the numeric columns of the three frames in place,
#so the returned tuple is not reassigned here
scaling(features_train, features_valid, features_test)

## upsampling the Data

Address the class imbalance by upsampling the minority class in the training data.

In [34]:
#function to upsample the data
def upsample(features, target, repeat):
    """Return shuffled features/target in which class-0 rows appear `repeat` times.

    Rows whose target is 1 are kept once; rows whose target is 0 are
    concatenated `repeat` times. Both results are shuffled together with a
    fixed random_state of 12345.

    Args:
        features: feature rows, indexable with a boolean mask built from `target`.
        target: label series compared against 0 and 1.
        repeat: how many copies of the class-0 rows to include.

    Returns:
        tuple: (features_upsampled, target_upsampled)
    """
    is_zero = target == 0
    is_one = target == 1

    feature_parts = [features[is_one]] + [features[is_zero]] * repeat
    target_parts = [target[is_one]] + [target[is_zero]] * repeat

    features_upsampled, target_upsampled = shuffle(
        pd.concat(feature_parts),
        pd.concat(target_parts),
        random_state=12345,
    )

    return features_upsampled, target_upsampled

In [35]:
#apply upsample function to training set
#only the training split is resampled; its class-0 rows (churned customers) are repeated 3 times
upsample_features_train, upsample_target_train = upsample(features_train, target_train, 3)

In [None]:
#re-examine class balances after upsampling (shares of each class, summing to 1)
class_balance_upsampled = upsample_target_train.value_counts(normalize=True)
print('Balance of Classes:')
print(class_balance_upsampled)

bar_colors = ['tab:red', 'tab:blue']

#draw the shares as a bar chart on a fresh figure
fig, ax = plt.subplots()
class_balance_upsampled.plot(
    kind='bar',
    ax=ax,
    ylabel='Percentage of Class',
    title='Class Imbalances of the Upsampled Target',
    color=bar_colors,
)

plt.show()

After upsampling, the classes are much closer to balanced (roughly 52% to 47%), enabling us to move forward with model training.

# Model Training

In [37]:
#calculate auc-roc score
def auc_roc(model, features, target):
    """Print the ROC-AUC and accuracy of `model` on the given data and plot its ROC curve.

    Args:
        model: fitted estimator exposing `predict_proba` and `predict`.
        features: feature rows to evaluate on (any split: validation, test, ...).
        target: true labels aligned with `features`.

    Returns:
        None. Results are printed and the ROC curve is shown with matplotlib.
    """
    #column 1 of predict_proba is used as the score for the ROC computations
    positive_proba = model.predict_proba(features)[:, 1]
    score = roc_auc_score(target, positive_proba)
    print('AUC - ROC Score:', score)

    fpr, tpr, _ = roc_curve(target, positive_proba)

    #accuracy on whatever split was passed in; the label no longer claims it is the
    #training set, because this helper is called on validation and test data
    accuracy = accuracy_score(target, model.predict(features))
    print("Accuracy Score:", accuracy)

    #plot auc-roc curve against the diagonal of a random classifier
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.show()

In [None]:
#initialize dummy model
#strategy is uniform because data is now balanced; random_state makes the random
#predictions (and therefore the reported accuracy) reproducible, like the other models
dummy_clf = DummyClassifier(strategy='uniform', random_state=12345)
dummy_clf.fit(upsample_features_train, upsample_target_train) #fit the dummy classifier to upsampled training set
dummy_pred = dummy_clf.predict(features_valid)

print('Dummy Classifier:')
print('')

#evaluate the dummy model on the validation split
auc_roc(dummy_clf, features_valid, target_valid)

With an AUC-ROC score of 0.5, the dummy classifier has no ability to separate the two classes.

**Logistic Regression**

In [None]:
%%time
# Initialize the logistic regression model
lr = LogisticRegression(random_state=12345, max_iter=500)

# Parameter grid, split by penalty so that only supported penalty/solver pairs are fitted:
# 'newton-cg', 'lbfgs' and 'sag' accept 'l2' but not 'l1'; 'liblinear' and 'saga' accept both.
# Crossing every penalty with every solver would schedule fits that can only fail.
# ('newton-cholesky' is left out of the search.)
lr_params = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'fit_intercept': [True, False],
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'fit_intercept': [True, False],
    },
]

# GridSearchCV (2-fold, scored by ROC-AUC, all cores)
lr_clf = GridSearchCV(lr, lr_params, scoring='roc_auc', n_jobs=-1, cv=2)
lr_clf.fit(upsample_features_train, upsample_target_train)


In [None]:

print("Tuned Hyperparameters :", lr_clf.best_params_)
#best_score_ is the mean cross-validated ROC-AUC (the search uses scoring='roc_auc'), not accuracy
print("Best CV ROC-AUC :", lr_clf.best_score_)


In [None]:
#ROC-AUC, accuracy and ROC curve of the logistic regression search on the validation split
auc_roc(lr_clf, features_valid, target_valid)

**KNN**

In [None]:
%%time
# Initialize KNN model
knn = KNeighborsClassifier()

# Candidate values explored by the search
knn_params = dict(
    n_neighbors=(5, 20, 100, 200, 500),
    p=np.arange(1, 3),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# 3-fold grid search scored by ROC-AUC
knn_clf = GridSearchCV(estimator=knn, param_grid=knn_params, scoring='roc_auc', cv=3)
knn_clf.fit(upsample_features_train, upsample_target_train)

In [None]:
print("Tuned Hyperparameters :", knn_clf.best_params_)
#best_score_ is the mean cross-validated ROC-AUC (the search uses scoring='roc_auc'), not accuracy
print("Best CV ROC-AUC :", knn_clf.best_score_)

In [None]:
#ROC-AUC, accuracy and ROC curve of the KNN search on the validation split
auc_roc(knn_clf, features_valid, target_valid)

**Random Forest**

In [None]:
%%time
#initialize rf model
rf = RandomForestClassifier(random_state=12345)

#candidate values explored by the search
rf_params = dict(
    n_estimators=[50, 100, 200],
    criterion=['gini', 'entropy'],
    max_depth=[1, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

#3-fold grid search scored by ROC-AUC
rf_clf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=3, scoring='roc_auc')
rf_clf.fit(upsample_features_train, upsample_target_train)

In [None]:
print("Tuned Hyperparameters :", rf_clf.best_params_)
#best_score_ is the mean cross-validated ROC-AUC (the search uses scoring='roc_auc'), not accuracy
print("Best CV ROC-AUC :", rf_clf.best_score_)

In [None]:
#ROC-AUC, accuracy and ROC curve of the random forest search on the validation split
auc_roc(rf_clf, features_valid, target_valid)

**Decision Tree**

In [None]:
%%time
#initialize model
dt = DecisionTreeClassifier(random_state=12345)

#candidate values explored by the search
dt_params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    criterion=["gini", "entropy", "log_loss"],
    splitter=['best', 'random'],
)

#4-fold grid search scored by ROC-AUC, run on all cores with progress output
dt_clf = GridSearchCV(
    estimator=dt,
    param_grid=dt_params,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
dt_clf.fit(upsample_features_train, upsample_target_train)

In [None]:
print("Tuned Hyperparameters :", dt_clf.best_params_)
#best_score_ is the mean cross-validated ROC-AUC (the search uses scoring='roc_auc'), not accuracy
print("Best CV ROC-AUC :", dt_clf.best_score_)

In [None]:
#ROC-AUC, accuracy and ROC curve of the decision tree search on the validation split
auc_roc(dt_clf, features_valid, target_valid)

---------------------------------------------------------------------------------------

**Boosting Models**

xgb

In [None]:
%%time
#initialize the xgb model
#bound to xgb_model rather than xgb so the `import xgboost as xgb` module alias is not overwritten
xgb_model = XGBClassifier(random_state=12345)

# Initialize StratifiedKFold
cv = StratifiedKFold(n_splits=3)

xgb_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [2, 5, 10],
    'subsample': [0.5, 0.7, 1]
}

#pass the splitter created above instead of a bare integer so it is actually used
xgb_clf = GridSearchCV(xgb_model, xgb_params, cv=cv, scoring='roc_auc')
xgb_clf.fit(upsample_features_train, upsample_target_train)


In [None]:
print("Tuned Hyperparameters :", xgb_clf.best_params_)
#best_score_ is the mean cross-validated ROC-AUC (the search uses scoring='roc_auc'), not accuracy
print("Best CV ROC-AUC :", xgb_clf.best_score_)

In [None]:
#ROC-AUC, accuracy and ROC curve of the XGBoost search on the validation split
auc_roc(xgb_clf, features_valid, target_valid)

In [None]:
#feature importance of the best XGBoost estimator, one horizontal bar per training column
feat_imp = xgb_clf.best_estimator_.feature_importances_
positions = range(len(feat_imp))

plt.barh(positions, feat_imp)
plt.yticks(positions, features_train.columns)
plt.ylabel('Feature')
plt.xlabel('Feature Importance')
plt.show()

light GBM

In [None]:
%%time
#initialize the lgbm model
lgbm = LGBMClassifier(random_state=12345)

#parameter grid
#num_leaves starts at 5: LightGBM requires num_leaves > 1, so a value of 1 can only produce failed fits
#NOTE(review): 'rf' boosting needs bagging or feature sub-sampling; with feature_fraction=1 and
#no bagging parameters set those candidates are expected to fail - confirm and prune if so
lgbm_params = {
    'boosting_type': ['gbdt', 'dart', 'rf'],
    'num_leaves': [5, 10, 20, 50],
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 1],
    'feature_fraction': [0.1, 0.2, 0.5, 0.75, 1],
    'max_depth': [1, 5, 10, 20, 50],
    'min_data_in_leaf': [10, 25, 50, 100],
}

#grid search (3-fold, scored by ROC-AUC)
lgbm_clf = GridSearchCV(lgbm, lgbm_params, scoring='roc_auc', cv=3, verbose=2)
#the fit and the result prints below are left disabled because the search is slow;
#the values recorded under this cell come from an earlier run
#lgbm_clf.fit(upsample_features_train, upsample_target_train) #took appx 1 hour to train

#print("Tuned Hyperparameters :", lgbm_clf.best_params_)
#print("Accuracy :",lgbm_clf.best_score_)

Light GBM:

Tuned Hyperparameters : {'boosting_type': 'gbdt', 'feature_fraction': 1, 'learning_rate': 0.5, 'max_depth': 50, 'min_data_in_leaf': 10, 'num_leaves': 50}

Accuracy : 0.989400526359771

In [5]:
#auc_roc(lgbm_clf, features_valid, target_valid)

Light GBM:

AUC - ROC Score: 0.9189008702134379 Accuracy Score for the Training Set: 0.8834399431414357

**Final Model: XGBoost**

In [None]:
#hyperparameters fixed for the final model
final_params = dict(
    learning_rate=0.2,
    max_depth=10,
    n_estimators=300,
    subsample=1,
)

#initialize final model with those hyperparameters
fin_model = XGBClassifier(random_state=12345, **final_params)

#train final model on the upsampled training set
fin_model.fit(upsample_features_train, upsample_target_train)

In [None]:
#final model on the test features: column-1 probabilities and predicted class labels
target_pred_proba = fin_model.predict_proba(features_test)[:, 1]
target_pred = fin_model.predict(features_test)

In [None]:
#confusion matrix of the test predictions, with rows/columns in the model's class order
class_labels = fin_model.classes_
cm = confusion_matrix(target_test, target_pred, labels=class_labels)

#plot confusion matrix
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot()
plt.show()

#auc roc score for final model
final_auc = roc_auc_score(target_test, target_pred_proba)
print('AUC-ROC Score for Final Model:', final_auc)
print('')

In [None]:
#ROC-AUC, accuracy and ROC curve of the final model on the held-out test split
auc_roc(fin_model, features_test, target_test)