# Predict the churn risk rate - HackerEarth ML

# Step 1: Reading and Understanding the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# global plotting configuration: seaborn grid background plus matplotlib style sheets
sns.set_style('whitegrid')
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so apply whichever spelling is installed
for _style in ('seaborn-v0_8-deep', 'seaborn-deep'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'sans-serif'
# NOTE(review): the default family above is 'sans-serif', so this 'font.serif'
# entry is not the list used for it; 'font.sans-serif' may have been intended - confirm
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.figsize'] = (12, 8)

pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
import warnings
warnings.filterwarnings('ignore')
import sklearn.base as skb
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.utils as sku
import sklearn.linear_model as sklm
import sklearn.neighbors as skn
import sklearn.ensemble as ske
import catboost as cb
import scipy.stats as sstats
import random
seed = 12
np.random.seed(seed)

from datetime import date

In [None]:
!pip install pandas-profiling --quiet
import pandas_profiling as pp

In [None]:
# important functions
def datasetShape(df):
    '''Print how many rows and columns ``df`` has.'''
    n_rows, n_cols = df.shape
    print(f"The dataframe has {n_rows} rows and {n_cols} columns.")
    
# select numerical and categorical features
def divideFeatures(df):
    '''
    Split ``df`` into its numerical and its object-dtype (categorical) columns.

    Returns a tuple ``(numerical_features, categorical_features)`` of two
    DataFrames produced by ``DataFrame.select_dtypes``.
    '''
    numerical_features = df.select_dtypes(include=[np.number])
    # ``np.object`` was only an alias of the builtin ``object``; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin directly
    categorical_features = df.select_dtypes(include=[object])
    return numerical_features, categorical_features

In [None]:
# directory that holds the competition CSV files
base = '/kaggle/input/churn-risk-rate-hackerearth-ml/'
# read the training file into `df` and preview its first rows
data_file = base + "train.csv"
df = pd.read_csv(data_file)
df.head()

In [None]:
# `data_file` is rebound to the test file, which is read into `df_test`
data_file = base + "test.csv"
df_test = pd.read_csv(data_file)
df_test.head()

In [None]:
# set target feature
targetFeature='churn_risk_score'

In [None]:
# check dataset shape
datasetShape(df)

In [None]:
# remove the ID column from the train data only (df_test is not touched here)
df.drop(columns=['customer_id'], inplace=True)

In [None]:
# check for duplicates: print the shape before and after dropping duplicated rows
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)

In [None]:
df.info()

In [None]:
df_test.info()

In [None]:
df.describe()

# Step 2: EDA

In [None]:
cont_features, cat_features = divideFeatures(df)
cat_features.head()

### Univariate Analysis

In [None]:
# check target feature distribution
df[targetFeature].hist()
plt.show()

In [None]:
# one boxplot per numerical feature, laid out on a 3x3 grid, for outlier detection
fig = plt.figure(figsize=(16, 16))
for position, column in enumerate(cont_features.columns, start=1):
    fig.add_subplot(3, 3, position)
    sns.boxplot(y=cont_features[column])
plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df)
plt.show()

In [None]:
# correlation heatmap for the numerical features
# select the numeric columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr() and raises instead
corr = df.select_dtypes(include=[np.number]).corr()
# hide the upper triangle (diagonal included); ``np.bool`` was removed in
# NumPy 1.24 and was only an alias of the builtin ``bool``
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, annot=True)
plt.show()

### Profiling for Whole Data

In [None]:
profile = pp.ProfileReport(df, title='Pandas Profiling Report', explorative=True)
profile.to_file("profile.html")

In [None]:
profile.to_notebook_iframe()

# Step 3: Data Preparation

### Skewness

In [None]:
# skewness of every numerical feature, largest value first
skewed_features = cont_features.apply(pd.Series.skew).sort_values(ascending=False)
skewed_features

### Handle Missing

In [None]:
# plot missing values

def calc_missing(df):
    '''
    Count the missing values of every column of ``df``.

    Returns two Series limited to the columns that contain at least one
    missing value and ordered by descending count: the absolute counts and
    the counts expressed as a percentage of the number of rows.
    '''
    null_counts = df.isna().sum().sort_values(ascending=False)
    missing = null_counts[null_counts > 0]
    missing_perc = missing / len(df) * 100
    return missing, missing_perc

# bar chart of the per-column missing counts, or a message when nothing is missing
if not df.isna().any().any():
    print("No Missing Values")
else:
    missing, missing_perc = calc_missing(df)
    missing.plot(kind='bar', figsize=(14, 5))
    plt.title('Missing Values')
    plt.show()

In [None]:
# remove all columns, and then all rows, that contain no values at all
df.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="all", inplace=True)
datasetShape(df)

In [None]:
def fillNan(df, col, value):
    '''
    Replace the missing entries of column ``col`` of ``df`` with ``value``.

    ``df`` is modified in place and nothing is returned.
    '''
    # assign the filled column back instead of calling fillna(inplace=True) on
    # ``df[col]``: the chained in-place form is deprecated in recent pandas and
    # does not update ``df`` when copy-on-write is active
    df[col] = df[col].fillna(value)

In [None]:
# fill missing region_category values in train and test with the most frequent training value
fillNan(df, 'region_category', df['region_category'].mode()[0])
fillNan(df_test, 'region_category', df['region_category'].mode()[0])
# check whether any missing value is left in the training column
df['region_category'].isna().any()

In [None]:
# fill missing points_in_wallet values in train and test with the training mean
fillNan(df, 'points_in_wallet', df['points_in_wallet'].mean())
fillNan(df_test, 'points_in_wallet', df['points_in_wallet'].mean())
# check whether any missing value is left in the training column
df['points_in_wallet'].isna().any()

In [None]:
# fill missing preferred_offer_types values in train and test with the most frequent training value
fillNan(df, 'preferred_offer_types', df['preferred_offer_types'].mode()[0])
fillNan(df_test, 'preferred_offer_types', df['preferred_offer_types'].mode()[0])
# check whether any missing value is left in the training column
df['preferred_offer_types'].isna().any()

In [None]:
# the '?' placeholder in joined_through_referral is replaced by 'No' in train and test
df['joined_through_referral'] = df['joined_through_referral'].mask(df['joined_through_referral'] == '?', 'No')
df_test['joined_through_referral'] = df_test['joined_through_referral'].mask(df_test['joined_through_referral'] == '?', 'No')
df['joined_through_referral'].unique()

In [None]:
# the '?' placeholder in medium_of_operation is replaced by 'Desktop' in train and test
df['medium_of_operation'] = df['medium_of_operation'].mask(df['medium_of_operation'] == '?', 'Desktop')
df_test['medium_of_operation'] = df_test['medium_of_operation'].mask(df_test['medium_of_operation'] == '?', 'Desktop')
df['medium_of_operation'].unique()

In [None]:
# repair the target: -1 is treated as a sign error and becomes 1, and class 5
# is trained as 0 (predictions are mapped back from 0 to 5 before submission)
df['churn_risk_score'] = df['churn_risk_score'].replace({-1: 1, 5: 0})
df['churn_risk_score'].unique()

In [None]:
# avg_frequency_login_days contains the text placeholder 'Error': replace it
# with 0 in train and test, then convert the column to a numeric dtype
df['avg_frequency_login_days'] = df['avg_frequency_login_days'].apply(lambda x:0 if x == 'Error' else x)
df_test['avg_frequency_login_days'] = df_test['avg_frequency_login_days'].apply(lambda x:0 if x == 'Error' else x)
df['avg_frequency_login_days'] = pd.to_numeric(df['avg_frequency_login_days'])
# the test column must be converted as well, otherwise it stays text while
# the training column is numeric
df_test['avg_frequency_login_days'] = pd.to_numeric(df_test['avg_frequency_login_days'])
df['avg_frequency_login_days'].describe()

In [None]:
# columns treated as non-useful are removed from both train and test
colsToRemove = ['Name', 'security_no', 'referral_id', 'last_visit_time']
df.drop(columns=colsToRemove, inplace=True)
df_test.drop(columns=colsToRemove, inplace=True)
df.head()

In [None]:
print("Train Missing:",df.isna().any().sum())
print("Test Missing:",df_test.isna().any().sum())

## Derive Features

In [None]:
df['joining_date'] = pd.to_datetime(df['joining_date'])
df_test['joining_date'] = pd.to_datetime(df_test['joining_date'])

In [None]:
# membership age in whole days; the reference timestamp is taken once and
# shared by both frames so train and test are measured against the same instant
# (it was previously re-evaluated for every row)
# NOTE: the feature still depends on the day the notebook is run
reference_time = pd.Timestamp('today')
df['days_since_joined'] = (reference_time - df['joining_date']).dt.days
df_test['days_since_joined'] = (reference_time - df_test['joining_date']).dt.days
df.head()

In [None]:
# the raw joining date is removed from both frames
df.drop(columns=['joining_date'], inplace=True)
df_test.drop(columns=['joining_date'], inplace=True)
df.head()

## Create Dummy Features

In [None]:
cont_features, cat_features = divideFeatures(df)
cat_features

In [None]:
# label encoding on categorical features
def mapFeature(data, f, data_test=None):
    '''
    Label-encode column ``f`` of ``data`` in place.

    Every distinct value of ``data[f]`` is replaced by its position in the
    order of first appearance (0, 1, 2, ...). When ``data_test`` is given,
    its column ``f`` is encoded with the same mapping; values that do not
    occur in ``data[f]`` are left unchanged. Nothing is returned.
    '''
    mapping = {value: code for code, value in enumerate(data[f].unique())}
    known_values = list(mapping)

    def _encode(column):
        # translate through the mapping, keeping values it does not cover
        encoded = column.map(mapping)
        known = column.isin(known_values)
        if known.all():
            return encoded
        return encoded.where(known, column)

    # assign the encoded column back instead of calling replace(inplace=True)
    # on ``data[f]``: the chained in-place form is deprecated in recent pandas
    # and does not update the frame when copy-on-write is active
    data[f] = _encode(data[f])
    if data_test is not None:
        data_test[f] = _encode(data_test[f])

In [None]:
for col in cat_features.columns:
    mapFeature(df, col, df_test)
df_test.head()

### One-Hot Encoding
One hot encoding didn't work well.

In [None]:
# # extract numerical and categorical for dummy and scaling later
# custom_feat = ['feedback', 'complaint_status']
# # custom_feat = ['complaint_status']
# for feat in cat_features.columns:
#     if len(df[feat].unique()) > 2 and feat in custom_feat:
#         dummyVars = pd.get_dummies(df[feat], drop_first=True, prefix=feat+"_")
#         df = pd.concat([df, dummyVars], axis=1)
#         df.drop(feat, axis=1, inplace=True)
# datasetShape(df)

# df.head()

In [None]:
# # extract numerical and categorical for dummy and scaling later
# for feat in cat_features.columns:
#     if len(df_test[feat].unique()) > 2 and feat in custom_feat:
#         dummyVars = pd.get_dummies(df_test[feat], drop_first=True, prefix=feat+"_")
#         df_test = pd.concat([df_test, dummyVars], axis=1)
#         df_test.drop(feat, axis=1, inplace=True)
# datasetShape(df_test)

# df_test.head()

# Step 4: Data Modelling

### Split Train-Test Data

In [None]:
# helper functions

def log1p(vec):
    '''Return ``np.log1p`` of the absolute value of ``vec``.'''
    magnitude = abs(vec)
    return np.log1p(magnitude)

def expm1(x):
    '''Thin wrapper that returns ``np.expm1(x)``.'''
    return np.expm1(x)

def clipExp(vec):
    '''Apply ``expm1`` to ``vec`` and clip the result from below at 0.'''
    restored = expm1(vec)
    return np.clip(restored, 0, None)

def printScore(y_train, y_train_pred):
    '''Print the macro-averaged F1 score of ``y_train_pred`` against ``y_train``.'''
    macro_f1 = skm.f1_score(y_train, y_train_pred, average="macro")
    print(macro_f1)

In [None]:
# shuffle the samples once, then separate the target from the predictors
df_shuffle = df.sample(frac=1, random_state=seed).reset_index(drop=True)
df_y = df_shuffle.pop(targetFeature)
df_X = df_shuffle

# hold out 20% of the rows and report the size of both parts
X_train, X_test, y_train, y_test = skms.train_test_split(df_X, df_y, train_size=0.8, random_state=seed)
total_rows = len(df_shuffle)
for split_name, split_X in (("Train", X_train), ("Test", X_test)):
    split_rows = split_X.shape[0]
    print(f"{split_name} set has {split_rows} records out of {total_rows} which is {round(split_rows/total_rows*100)}%")

### Feature Scaling

In [None]:
# the target must not be part of the columns that get scaled;
# errors='ignore' keeps this cell re-runnable once the column is already gone
cont_features = cont_features.drop(columns=targetFeature, errors='ignore')
cont_features.head()

In [None]:
# reset index for X_train and X_test
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
X_train.index[:5]

In [None]:
# scaler = skp.RobustScaler()
# scaler = skp.MinMaxScaler()
scaler = skp.StandardScaler()

num_cols = cont_features.columns

# fit the scaler on the training rows only; the scaled array is assigned
# directly so rows are matched by position and the result does not depend on
# the frame carrying a default 0..n-1 index
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

# scale test data with transform() so it reuses the training statistics
X_test[num_cols] = scaler.transform(X_test[num_cols])

# view sample data
X_train.describe()

## Model Building

In [None]:
# balanced class weights computed from the training labels
classes = np.unique(y_train)
# `classes` and `y` must be passed by keyword in current scikit-learn releases
class_weights = sku.class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
# key the weights by the actual class labels instead of by array position
class_weights = dict(zip(classes.tolist(), class_weights.tolist()))
class_weights

In [None]:
sample_weights = sku.class_weight.compute_sample_weight('balanced', y_train)
sample_weights

### CatBoost

In [None]:
import catboost as cb

# CatBoost classifier with 100 iterations, weighted by `class_weights`
cat_model = cb.CatBoostClassifier(verbose=0, iterations=100, 
#                                   eval_metric='F1', 
                                  class_weights=class_weights, 
#                                   use_best_model=True
                                 )
# the hold-out split is passed as eval_set, so best_score_ is reported on it
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
print(cat_model.best_score_)

# accuracy and macro F1 on the training and hold-out splits
y_train_pred = cat_model.predict(X_train)
y_test_pred = cat_model.predict(X_test)
print(skm.accuracy_score(y_train, y_train_pred))
print(skm.accuracy_score(y_test, y_test_pred))
printScore(y_train, y_train_pred)
printScore(y_test, y_test_pred)

### RandomForest

In [None]:
# random forest with 100 trees of depth <= 15 and 'balanced_subsample' class weighting
rf_model = ske.RandomForestClassifier(verbose=0, random_state=1, n_jobs=-1, class_weight='balanced_subsample',
                                 n_estimators=100,max_depth=15, 
                                 min_samples_split = 5, min_samples_leaf = 1
                                )
rf_model.fit(X_train, y_train)

# predict, then report accuracy and macro F1 on the training and hold-out splits
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)
print(skm.accuracy_score(y_train, y_train_pred))
print(skm.accuracy_score(y_test, y_test_pred))
printScore(y_train, y_train_pred)
printScore(y_test, y_test_pred)

### XGBoost

In [None]:
import xgboost as xg

In [None]:
# # Grid used for parameter tuning
# param_test1 = {
#     'max_depth': np.arange(5, 12, 2),
#     'learning_rate': np.arange(0.04, 0.07, 0.01)
# }
# xgb_cv1 = skms.GridSearchCV(estimator = xg.XGBClassifier(n_estimators=100, objective='multi:softprob', nthread=4, seed=seed), 
#                              param_grid = param_test1, scoring='f1', n_jobs=4, 
#                              cv=5, verbose=1)
# xgb_cv1.fit(X_train_small, y_train_small)
# print(xgb_cv1.best_params_, xgb_cv1.best_score_)
# # max_depth = 10
# # learning_rate = 0.04

In [None]:
# # Grid used for parameter tuning
# param_test2 = {
#  'subsample': np.arange(0.5, 1, 0.1),
#  'min_child_weight': range(1, 6, 1)
# }
# xgb_cv2 = skms.GridSearchCV(estimator = xg.XGBClassifier(n_estimators=500, max_depth = 10, 
#                                                      objective= 'multi:softprob', nthread=4, seed=seed), 
#                             param_grid = param_test2, scoring='f1', n_jobs=4,
#                             cv=5, verbose=1)
# xgb_cv2.fit(X_train_small, y_train_small)
# print(xgb_cv2.best_params_, xgb_cv2.best_score_)
# print(xgb_cv2.best_estimator_)
# # subsample = 0.5
# # min_child_weight = 2

In [None]:
# `verbose` and `scoring` are not XGBClassifier parameters (they were only
# forwarded to the booster as unknown keys): logging is controlled by
# `verbosity`, and `scoring` belongs to scikit-learn's search utilities
# NOTE(review): learning_rate=0.001 differs from the 0.04 recorded in the
# commented-out tuning notes of this notebook - confirm which one is intended
xgb_model = xg.XGBClassifier(objective='multi:softprob', random_state=seed, verbosity=0,
                             learning_rate=0.001, subsample=0.5, n_jobs=-1,
                             n_estimators=100, max_depth=10)
xgb_model.fit(X_train, y_train)

# predict, then report accuracy and macro F1 on the training and hold-out splits
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)
print(skm.accuracy_score(y_train, y_train_pred))
print(skm.accuracy_score(y_test, y_test_pred))
printScore(y_train, y_train_pred)
printScore(y_test, y_test_pred)

### LightGBM

In [None]:
import lightgbm as lgb
# LightGBM classifier with 100 estimators, weighted by `class_weights`
lgb_model = lgb.LGBMClassifier(objective='multi', class_weight=class_weights, random_state=1, n_jobs=-1, 
                               learning_rate=0.15, 
                               n_estimators=100)
lgb_model.fit(X_train, y_train)

# predict, then report accuracy and macro F1 on the training and hold-out splits
y_train_pred = lgb_model.predict(X_train)
y_test_pred = lgb_model.predict(X_test)
print(skm.accuracy_score(y_train, y_train_pred))
print(skm.accuracy_score(y_test, y_test_pred))
printScore(y_train, y_train_pred)
printScore(y_test, y_test_pred)

# Step 5: Test Evaluation & Submission

In [None]:
# Generate Ensembles

def rmse_cv(model):
    '''
    Return the per-fold RMSE of ``model`` from a 5-fold cross-validation
    on the global ``X_train`` / ``y_train``.
    '''
    neg_mse = skms.cross_val_score(model, X_train, y_train,
                                   scoring="neg_mean_squared_error", cv=5, n_jobs=-1)
    return np.sqrt(-neg_mse)

class MixModel(skb.ClassifierMixin, skb.BaseEstimator):
    '''
    Hard-voting ensemble: every model in ``algs`` is cloned and fitted, and
    ``predict`` returns, for each sample, the class predicted by most models.

    The class is declared as a classifier (``ClassifierMixin`` listed before
    ``BaseEstimator``) because it predicts class labels; the former
    ``TransformerMixin`` base was dropped since no ``transform`` method exists.

    Parameters
    ----------
    algs : list of estimators
        Base models. They are cloned in ``fit``, so the objects passed in are
        never modified.
    '''
    def __init__(self, algs):
        self.algs = algs

    # Fit a fresh clone of every base model
    def fit(self, X, y):
        # sorted distinct labels, used to turn votes into bincount indices
        self.classes_ = np.unique(y)
        self.algs_ = [skb.clone(alg) for alg in self.algs]

        # Train cloned base models
        for alg in self.algs_:
            alg.fit(X, y)

        return self

    # Majority vote over the predictions of all cloned models
    def predict(self, X):
        # one column of votes per model; ravel() flattens (n, 1) outputs
        votes = np.column_stack([
            np.asarray(alg.predict(X)).ravel() for alg in self.algs_
        ])
        # bincount needs non-negative integers, so vote on label positions
        # within classes_ instead of on the raw labels
        vote_idx = np.searchsorted(self.classes_, votes)
        n_classes = len(self.classes_)
        # argmax resolves ties in favour of the first class in classes_
        winners = np.apply_along_axis(
            lambda row: np.bincount(row, minlength=n_classes).argmax(),
            axis=1, arr=vote_idx)
        return self.classes_[winners]

In [None]:
# ensemble of the four models defined above; MixModel clones them before fitting
mixed_model = MixModel(algs = [
    cat_model,
    rf_model,
    xgb_model,
    lgb_model
])
# score = rmse_cv(mixed_model)
# print("\nAveraged base algs score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

mixed_model.fit(X_train, y_train)

# predict, then report macro F1 on the training and hold-out splits
y_train_pred = mixed_model.predict(X_train)
y_test_pred = mixed_model.predict(X_test)
printScore(y_train, y_train_pred)
printScore(y_test, y_test_pred)

In [None]:
def getTestResults():
    '''
    Retrain the voting ensemble on the whole training frame and predict
    the competition test frame.

    Reads the globals ``df``, ``df_test``, ``cont_features``, ``targetFeature``
    and the four base models; none of the global frames is modified.
    Prints the accuracy and the macro F1 score on the training data.

    Returns the predictions of the ensemble for the rows of ``df_test``
    (still in the encoded label space used for training).
    '''
    df_X = df.sample(frac=1, random_state=1).reset_index(drop=True)
    df_y = df_X.pop(targetFeature)
    # explicit copy, so that scaling below never writes into the global df_test
    df_final_test = df_test[list(df_X.columns)].copy()

    # numerical columns to scale; the target is excluded by exact name in
    # case it is still listed in cont_features
    num_cols = [col for col in cont_features.columns if col != targetFeature]

    # RobustScaler is used for the final fit (the hold-out experiments
    # in this notebook use StandardScaler)
    scaler = skp.RobustScaler()
#     scaler = skp.MinMaxScaler()
#     scaler = skp.StandardScaler()

    # the scaled arrays are assigned directly so rows are matched by position
    # rather than by index label
    df_X[num_cols] = scaler.fit_transform(df_X[num_cols])
    df_final_test[num_cols] = scaler.transform(df_final_test[num_cols])

#     sample_weights = sku.class_weight.compute_sample_weight('balanced', df_y)

    model = MixModel(algs = [
        cat_model,
        rf_model,
        xgb_model,
        lgb_model
    ])

    model.fit(df_X, df_y)

    # predict
    y_train_pred = model.predict(df_X)
    y_test_pred = model.predict(df_final_test)
    print("Accuracy Score for Train:",skm.accuracy_score(df_y, y_train_pred))
    printScore(df_y, y_train_pred)
    return y_test_pred

# ML models
results = getTestResults()

In [None]:
# pair every test customer_id with its predicted class and show the class counts
submission = pd.DataFrame({
    'customer_id': df_test['customer_id'],
    targetFeature: results.ravel(),
})
print(submission[targetFeature].value_counts())

In [None]:
# predictions of class 0 stand for the original class 5, so map them back
submission[targetFeature] = submission[targetFeature].mask(submission[targetFeature] == 0, 5)
submission[targetFeature].value_counts()

In [None]:
submission.to_csv('./submission_Ensemble4.csv', index=False)

This ensemble of the four best classifiers scores 76.56 on the leaderboard (LB).