In [1]:
import pandas as pd
pd.set_option("display.max_columns",200)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

import missingno as msno

In [2]:
# Load the train and test CSVs from the sibling data directory and print each frame's shape.
# NOTE(review): the raw-string paths use backslash separators, so they presumably only
# resolve on Windows — confirm before running on another platform.
train = pd.read_csv(r"..\data\train.csv")
print(f"The shape of train data: {train.shape}")
test = pd.read_csv(r"..\data\test.csv")
print(f"The shape of test data: {test.shape}")

The shape of train data: (69999, 172)
The shape of test data: (30000, 171)


In [3]:
# exploring object columns
# obj_cols holds the names of every object-dtype column in train.
obj_cols = train.select_dtypes(include=['object']).columns
# Show 5 randomly drawn rows; no random_state is passed, so the rows differ between runs.
train[obj_cols].sample(5)

Unnamed: 0,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,date_of_last_rech_6,date_of_last_rech_7,date_of_last_rech_8,date_of_last_rech_data_6,date_of_last_rech_data_7,date_of_last_rech_data_8
37094,6/30/2014,7/31/2014,8/31/2014,6/25/2014,7/24/2014,8/6/2014,6/23/2014,,8/2/2014
8446,6/30/2014,7/31/2014,8/31/2014,6/29/2014,7/25/2014,8/30/2014,,,
50628,6/30/2014,7/31/2014,8/31/2014,6/28/2014,7/28/2014,8/19/2014,,,
36712,6/30/2014,7/31/2014,8/31/2014,6/30/2014,7/29/2014,8/31/2014,,,8/26/2014
31223,6/30/2014,7/31/2014,8/31/2014,6/6/2014,7/30/2014,8/31/2014,,,


In [4]:
# convert datetime columns from object to datetime
# pd.to_datetime is applied column by column, with no explicit format argument.
train[obj_cols] = train[obj_cols].apply(pd.to_datetime)
train.info()

# convert datetime columns from object to datetime in test data
# obj_cols was derived from train, so this assumes test carries the same object columns — TODO confirm
test[obj_cols] = test[obj_cols].apply(pd.to_datetime)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69999 entries, 0 to 69998
Columns: 172 entries, id to churn_probability
dtypes: datetime64[ns](9), float64(135), int64(28)
memory usage: 91.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Columns: 171 entries, id to jun_vbc_3g
dtypes: datetime64[ns](9), float64(135), int64(27)
memory usage: 39.1 MB


In [5]:
# understanding the data missing in terms of proportions
# Percentage of missing values per column (a Series indexed by column name).
missing_data_percent = 100*train.isnull().sum()/len(train)

# Column counts per missing-percentage band in train, recorded from an earlier run:
# len(missing_data_percent[missing_data_percent.ge(73)].index) # 30
# len(missing_data_percent[(missing_data_percent.ge(5))&(missing_data_percent.le(5.3))].index) # 29
# len(missing_data_percent[(missing_data_percent.gt(0))&(missing_data_percent.lt(5))].index) # 66
# len(missing_data_percent[missing_data_percent.eq(0)].index) # 47

# missing data in test
missing_data_percent_test = 100*test.isnull().sum()/len(test)

# Column counts per missing-percentage band in test, recorded from an earlier run:
# len(missing_data_percent_test[missing_data_percent_test.ge(73)].index) # 30
# len(missing_data_percent_test[(missing_data_percent_test.ge(5))&(missing_data_percent_test.le(6))].index) # 29
# len(missing_data_percent_test[(missing_data_percent_test.gt(0))&(missing_data_percent_test.lt(5))].index) # 66
# len(missing_data_percent_test[missing_data_percent_test.eq(0)].index) # 46

# Sanity check: the heavily-missing columns must be the same, in the same order, in train and test.
# Comparing plain lists (rather than Index == Index) means a length mismatch shows up as a clear
# assertion failure instead of a "Lengths must match" ValueError.
HIGH_MISSING_PCT = 73  # columns at or above this missing percentage form the "mostly empty" group
train_sparse_cols = missing_data_percent[missing_data_percent.ge(HIGH_MISSING_PCT)].index
test_sparse_cols = missing_data_percent_test[missing_data_percent_test.ge(HIGH_MISSING_PCT)].index
assert list(test_sparse_cols) == list(train_sparse_cols), (
    "train and test disagree on which columns are mostly missing: "
    f"{sorted(set(train_sparse_cols) ^ set(test_sparse_cols))}"
)

In [6]:
# Keep only the columns whose missing share in train is below 6%.
# (The selection is driven by train alone; test is then reduced to the same columns.)
cols_to_include = missing_data_percent[missing_data_percent.lt(6)].index

# dropping from test
# Preserve train's column order: building the list through a set would give an arbitrary,
# session-dependent order, and later steps select features from test by column position.
test_cols_to_include = [col for col in cols_to_include if col != "churn_probability"]
test = test[test_cols_to_include].copy()

# dropping from train
# .copy() makes the reduced frame independent, so later column assignments write to it directly.
train = train[cols_to_include].copy()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69999 entries, 0 to 69998
Columns: 142 entries, id to churn_probability
dtypes: datetime64[ns](6), float64(108), int64(28)
memory usage: 75.8 MB


In [7]:
# identifying datetime and numeric columns
# datetime_cols: names of the datetime64[ns] columns still present in train.
# num_cols: names of every float64/int64 column in train.
# NOTE(review): num_cols presumably still contains the id and churn_probability columns —
# confirm that passing them through the imputation step is intended.
datetime_cols = train.select_dtypes(include=['datetime64[ns]']).columns
num_cols = train.select_dtypes(include=['float64', 'int64']).columns

In [8]:
# Pick the fill statistic per column: skewed or outlier-heavy columns get the median, the rest the mean.
def imputation_metric(num_cols, df):
    """Map each column to the imputation strategy ('median' or 'mean') suited to its distribution.

    A column is assigned 'median' when the absolute skewness of its non-missing values
    exceeds 1, or when more than 5% of the frame's rows fall outside the 1.5 * IQR fences.
    Every other column is assigned 'mean'.

    Parameters
    ----------
    num_cols : iterable of str
        Names of the columns to evaluate.
    df : pandas.DataFrame
        Frame containing those columns.

    Returns
    -------
    dict
        Column name -> 'median' or 'mean', in the iteration order of ``num_cols``.
    """
    # The outlier count is compared against 5% of all rows (missing values included in the row count).
    outlier_budget = 0.05 * len(df)

    def _choose(values):
        # Skewness is measured on the observed values only.
        asymmetry = skew(values.dropna())
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        fence_width = 1.5 * (upper_quartile - lower_quartile)
        beyond_fences = (
            values.lt(lower_quartile - fence_width) | values.gt(upper_quartile + fence_width)
        ).sum()
        if abs(asymmetry) > 1 or beyond_fences > outlier_budget:
            return 'median'
        return 'mean'

    return {name: _choose(df[name]) for name in num_cols}

# Build one strategy map (column name -> 'mean' or 'median') per frame.
# Each map is derived from that frame's own distribution: train uses num_cols, test uses
# its own float64/int64 columns.
imputation_strategy_train = imputation_metric(num_cols, train)
imputation_strategy_test = imputation_metric(test.select_dtypes(include=['float64', 'int64']).columns, test)

In [9]:
# filling the missing numeric columns based on outliers and skewness
# Each imputer is fitted on the train column only and then reused for the matching test column,
# so both frames are filled with the same strategy and the same train-derived value.
# Fitting a second imputer on test would fill the two frames inconsistently.
for col, strat in imputation_strategy_train.items():
    imputer = SimpleImputer(strategy=strat)
    train[[col]] = imputer.fit_transform(train[[col]])
    # Columns that exist only in train (e.g. the target) have nothing to transform in test.
    if col in test.columns:
        test[[col]] = imputer.transform(test[[col]])

In [10]:
def create_features(df):
    """Append aggregate recharge, usage, roaming and data features to ``df``.

    The frame is modified in place (new columns are written onto it) and the same
    object is returned. Columns are added in a fixed order: recharge total/average,
    month-6 2g/3g volume ratios, outgoing/incoming totals and averages, per-month
    roaming ratios (outgoing then incoming), and finally data total/average.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``*_6``, ``*_7`` and ``*_8`` variants of ``total_rech_amt``,
        ``vol_2g_mb``, ``vol_3g_mb``, ``total_og_mou``, ``total_ic_mou``,
        ``roam_og_mou`` and ``roam_ic_mou``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 16 extra columns.
    """
    months = ('6', '7', '8')
    rech_cols = [f'total_rech_amt_{m}' for m in months]
    og_cols = [f'total_og_mou_{m}' for m in months]
    ic_cols = [f'total_ic_mou_{m}' for m in months]
    data_cols = [f'vol_2g_mb_{m}' for m in months] + [f'vol_3g_mb_{m}' for m in months]

    def _running_total(columns):
        # Left-to-right addition with "+": a missing value in any column makes the total missing.
        total = df[columns[0]]
        for name in columns[1:]:
            total = total + df[name]
        return total

    # recharge amount over the 3 months: total, then monthly average
    df['total_rech_amt_sum'] = _running_total(rech_cols)
    df['avg_rech_amt_per_month'] = df[rech_cols].mean(axis=1)

    # month-6 data volume ratios; the +1 in the denominator avoids division by zero
    volume_2g = df['vol_2g_mb_6']
    volume_3g = df['vol_3g_mb_6']
    df['vol_2g_ratio'] = volume_2g / (volume_3g + 1)
    df['vol_3g_ratio'] = volume_3g / (volume_2g + 1)

    # minutes of usage over the 3 months: totals first, then monthly averages
    df['total_og_mou_sum'] = _running_total(og_cols)
    df['total_ic_mou_sum'] = _running_total(ic_cols)
    df['avg_og_mou_per_month'] = df[og_cols].mean(axis=1)
    df['avg_ic_mou_per_month'] = df[ic_cols].mean(axis=1)

    # share of roaming minutes per month: all outgoing ratios, then all incoming ratios
    for direction in ('og', 'ic'):
        for m in months:
            roaming = df[f'roam_{direction}_mou_{m}']
            overall = df[f'total_{direction}_mou_{m}']
            df[f'roam_{direction}_ratio_{m}'] = roaming / (overall + 1)

    # 2g + 3g data volume over the 3 months: total, then average across the six columns
    df['total_data_mb_sum'] = _running_total(data_cols)
    df['avg_data_mb_per_month'] = df[data_cols].mean(axis=1)

    return df

# Add the engineered columns to both frames. create_features writes the new columns onto the
# frame it receives and returns that same frame.
train = create_features(train)
test = create_features(test)

In [11]:
def drop_negative_outliers(df):
    """Drop rows whose arpu falls below the lower IQR fence, one month after another.

    For 'arpu_6', 'arpu_7' and 'arpu_8' in turn, the lower fence ``Q1 - 1.5 * IQR`` is
    computed on the rows that survived the previous months, and only rows at or above
    that fence are kept. Rows with a missing arpu value fail the comparison and are
    dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three arpu columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    remaining = df
    for month in (6, 7, 8):
        revenue = remaining[f'arpu_{month}']
        first_quartile = revenue.quantile(0.25)
        third_quartile = revenue.quantile(0.75)
        spread = third_quartile - first_quartile
        floor = first_quartile - 1.5 * spread
        # Only the low side is trimmed; high values are left alone.
        remaining = remaining[revenue >= floor]
    return remaining

# Remove the low-side arpu outlier rows from train only.
# NOTE(review): the test call is left disabled — presumably because every test row has to be
# kept for the submission file; confirm.
train = drop_negative_outliers(train)
# test = drop_negative_outliers(test)

In [12]:
# log1p-transformed copy of arpu for each of the three months
def log_transform(df):
    """Add 'arpu_6_log', 'arpu_7_log' and 'arpu_8_log' columns to ``df``.

    Each new column is ``log1p`` of the matching arpu column, after every value that
    is not strictly positive (zero, negative or missing) has been replaced by 1. Those
    rows therefore all end up at ``log1p(1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'arpu_6', 'arpu_7' and 'arpu_8'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three log columns appended.
    """
    for month in ('6', '7', '8'):
        revenue = df[f'arpu_{month}']
        # Keep strictly positive values; everything else becomes 1 before the log.
        floored = revenue.where(revenue > 0, 1)
        df[f'arpu_{month}_log'] = np.log1p(floored)
    return df

# Add the arpu_*_log columns to both frames; log_transform writes onto the frame it receives
# and returns that same frame.
train = log_transform(train)
test = log_transform(test)

In [13]:
# scaling the newly added totals, averages, ratios
scaler = StandardScaler()
added_columns = ['total_og_mou_sum', 'total_ic_mou_sum', 'avg_og_mou_per_month', 'avg_ic_mou_per_month',
                     'roam_og_ratio_6', 'roam_og_ratio_7', 'roam_og_ratio_8', 'roam_ic_ratio_6', 
                     'roam_ic_ratio_7', 'roam_ic_ratio_8', 'total_rech_amt_sum', 'avg_rech_amt_per_month',
                     'total_data_mb_sum', 'avg_data_mb_per_month', 'vol_2g_ratio', 'vol_3g_ratio']
# Learn the mean/variance from train only ...
train[added_columns] = scaler.fit_transform(train[added_columns])
# ... and apply those same parameters to test. Refitting on test would put the two frames on
# different scales, so the same raw value would reach the model as different numbers.
test[added_columns] = scaler.transform(test[added_columns])

In [14]:
# Model matrix: every train column except the target and the datetime columns.
# NOTE(review): the id column is presumably still part of X — confirm it should be offered
# to the model as a feature.
X = train.drop(columns=['churn_probability']).drop(columns=datetime_cols)
y = train['churn_probability']

# Hold out 20% of the labelled rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# A default random forest supplies the feature importances used for selection.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Keep the features whose importance is at least the mean importance.
sfm = SelectFromModel(model, threshold='mean', prefit=True)
selected_features = X_train.columns[sfm.get_support(indices=True)]

# The selector picks columns by position, so test must present exactly the same columns in
# exactly the same order as X_train. Selecting by X_train.columns enforces that (and leaves
# out the datetime columns, which are not part of X_train).
X_test = test[X_train.columns]

# Wrap the selected arrays back into frames, keeping each split's original row labels so the
# features stay aligned with y_train / y_valid and with test.
X_train_selected = pd.DataFrame(sfm.transform(X_train), columns=selected_features, index=X_train.index)
X_valid_selected = pd.DataFrame(sfm.transform(X_valid), columns=selected_features, index=X_valid.index)
X_test_selected = pd.DataFrame(sfm.transform(X_test), columns=selected_features, index=X_test.index)

In [32]:
# imbalanced class
# Re-join the train and validation parts so the class balance is measured on all labelled rows.
X_train_new = pd.concat([X_train_selected,X_valid_selected])
y_train_new = pd.concat([y_train,y_valid])

# Share of each class as a percentage, rounded to whole percent.
(100*y_train_new.value_counts(normalize=True)).round()

churn_probability
0.0    90.0
1.0    10.0
Name: proportion, dtype: float64

In [33]:
# Class-weighted random forest; random_state makes the fit (and the reported AUC) reproducible.
rfc = RandomForestClassifier(class_weight="balanced", max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42)

# validation
rfc.fit(X_train_selected, y_train)
valid_pred_proba = rfc.predict_proba(X_valid_selected)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_pred_proba)
print(f"Validation ROC AUC: {valid_auc}")

# fitting model on whole data
X_train_new = pd.concat([X_train_selected, X_valid_selected])
y_train_new = pd.concat([y_train,y_valid])
rfc.fit(X_train_new, y_train_new)

# predictions (probability of the positive class)
y_pred_test = rfc.predict_proba(X_test_selected)[:, 1]

# saving predictions
# The submission column holds integer 0/1 labels. Casting the raw probabilities straight to
# int64 truncates every value below 1.0 to 0, so the probabilities are thresholded first.
churn_threshold = 0.5
submission = test[['id']].copy()
submission['id'] = submission['id'].astype('int64')
submission['churn_probability'] = (y_pred_test >= churn_threshold).astype('int64')
submission.to_csv(r'..\data\churn_predictions_iteration_3_balancedrfc.csv', index=False)

Validation ROC AUC: 0.9350955806312387


In [36]:
# random forest classifier: randomized hyper-parameter search scored by ROC AUC
rf_param_grid = {
    'class_weight': ['balanced','balanced_subsample'],
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
# random_state fixes which parameter combinations are sampled, so a re-run repeats the same search.
rf_grid_search = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, cv=5, scoring='roc_auc', verbose=1, random_state=42)
rf_grid_search.fit(X_train_selected, y_train)

print("Random Forest - Best Parameters:", rf_grid_search.best_params_)
print("Random Forest - Best Score:", rf_grid_search.best_score_)

# running best estimator
# The search refits the best estimator on X_train_selected / y_train by default (refit=True),
# so it can be validated directly without fitting it again.
best_model = rf_grid_search.best_estimator_

# validation
valid_pred_proba = best_model.predict_proba(X_valid_selected)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_pred_proba)
print(f"Validation ROC AUC: {valid_auc}")

# fitting model on whole data
X_train_new = pd.concat([X_train_selected, X_valid_selected])
y_train_new = pd.concat([y_train,y_valid])
best_model.fit(X_train_new, y_train_new)

# predictions (probability of the positive class)
y_pred_test = best_model.predict_proba(X_test_selected)[:, 1]

# saving predictions
# The submission column holds integer 0/1 labels. Casting the raw probabilities straight to
# int64 truncates every value below 1.0 to 0, so the probabilities are thresholded first.
churn_threshold = 0.5
submission = test[['id']].copy()
submission['id'] = submission['id'].astype('int64')
submission['churn_probability'] = (y_pred_test >= churn_threshold).astype('int64')
submission.to_csv(r'..\data\churn_predictions_iteration_3_balancedrfc2.csv', index=False)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest - Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10, 'class_weight': 'balanced_subsample'}
Random Forest - Best Score: 0.9372398775719747
Validation ROC AUC: 0.9356266977411983


In [37]:
# random forest classifier: randomized hyper-parameter search scored by weighted F1
rf_param_grid = {
    'class_weight': ['balanced','balanced_subsample'],
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
# random_state fixes which parameter combinations are sampled, so a re-run repeats the same search.
rf_grid_search = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, cv=5, scoring='f1_weighted', verbose=1, random_state=42)
rf_grid_search.fit(X_train_selected, y_train)

print("Random Forest - Best Parameters:", rf_grid_search.best_params_)
print("Random Forest - Best Score:", rf_grid_search.best_score_)

# running best estimator
# The search refits the best estimator on X_train_selected / y_train by default (refit=True),
# so it can be validated directly without fitting it again.
best_model = rf_grid_search.best_estimator_

# validation
valid_pred_proba = best_model.predict_proba(X_valid_selected)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_pred_proba)
print(f"Validation ROC AUC: {valid_auc}")

# fitting model on whole data
X_train_new = pd.concat([X_train_selected, X_valid_selected])
y_train_new = pd.concat([y_train,y_valid])
best_model.fit(X_train_new, y_train_new)

# predictions (probability of the positive class)
y_pred_test = best_model.predict_proba(X_test_selected)[:, 1]

# saving predictions
# The submission column holds integer 0/1 labels. Casting the raw probabilities straight to
# int64 truncates every value below 1.0 to 0, so the probabilities are thresholded first.
churn_threshold = 0.5
submission = test[['id']].copy()
submission['id'] = submission['id'].astype('int64')
submission['churn_probability'] = (y_pred_test >= churn_threshold).astype('int64')
submission.to_csv(r'..\data\churn_predictions_iteration_3_balancedrfc3.csv', index=False)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest - Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20, 'class_weight': 'balanced_subsample'}
Random Forest - Best Score: 0.9381074405358264
Validation ROC AUC: 0.9303273050087235


In [38]:
# random forest classifier: randomized hyper-parameter search scored by balanced accuracy
rf_param_grid = {
    'class_weight': ['balanced','balanced_subsample'],
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)
# random_state fixes which parameter combinations are sampled, so a re-run repeats the same search.
rf_grid_search = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, cv=5, scoring='balanced_accuracy', verbose=1, random_state=42)
rf_grid_search.fit(X_train_selected, y_train)

print("Random Forest - Best Parameters:", rf_grid_search.best_params_)
print("Random Forest - Best Score:", rf_grid_search.best_score_)

# running best estimator
# The search refits the best estimator on X_train_selected / y_train by default (refit=True),
# so it can be validated directly without fitting it again.
best_model = rf_grid_search.best_estimator_

# validation
valid_pred_proba = best_model.predict_proba(X_valid_selected)[:, 1]
valid_auc = roc_auc_score(y_valid, valid_pred_proba)
print(f"Validation ROC AUC: {valid_auc}")

# fitting model on whole data
X_train_new = pd.concat([X_train_selected, X_valid_selected])
y_train_new = pd.concat([y_train,y_valid])
best_model.fit(X_train_new, y_train_new)

# predictions (probability of the positive class)
y_pred_test = best_model.predict_proba(X_test_selected)[:, 1]

# saving predictions
# The submission column holds integer 0/1 labels. Casting the raw probabilities straight to
# int64 truncates every value below 1.0 to 0, so the probabilities are thresholded first.
churn_threshold = 0.5
submission = test[['id']].copy()
submission['id'] = submission['id'].astype('int64')
submission['churn_probability'] = (y_pred_test >= churn_threshold).astype('int64')
submission.to_csv(r'..\data\churn_predictions_iteration_3_balancedrfc4.csv', index=False)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest - Best Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10, 'class_weight': 'balanced_subsample'}
Random Forest - Best Score: 0.8527238730413454
Validation ROC AUC: 0.9340511775504322
