In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from lightgbm import LGBMClassifier


import warnings

# SettingWithCopyWarning was importable from pandas.core.common up to pandas 1.4 and
# lives in pandas.errors from 1.5 on. Try the current location first, then the legacy
# one, so this cell does not raise ImportError depending on the installed pandas.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # This pandas build has no such warning class: there is nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)


# Read the training CSV from the Kaggle input mount and preview the first three rows.
train_csv_path = "/kaggle/input/churn-risk-rate-hackerearth-ml/train.csv"
df = pd.read_csv(train_csv_path)
df.head(3)

In [None]:
# Remove five columns from df, one at a time and in place, before any encoding happens.
for unused_column in ("customer_id", "Name", "security_no", "referral_id", "last_visit_time"):
    del df[unused_column]


In [None]:
# Relative frequency of every target value.
df["churn_risk_score"].value_counts(normalize=True)

In [None]:
# Rows whose target is -1 are dropped from df in place, then the class balance is shown again.
negative_target = df.loc[df["churn_risk_score"].eq(-1)].index
df.drop(index=negative_target, inplace=True)
df["churn_risk_score"].value_counts(normalize=True)

In [None]:
# Bar chart of how many rows fall into each target value.
sns.countplot(x="churn_risk_score", data=df)

In [None]:
# Number of distinct values per column, smallest first.
df.nunique().sort_values(ascending=True)

In [None]:
# Yes/No columns become 1/0; a value that is neither "Yes" nor "No" maps to NaN.
yes_map = {"Yes": 1, "No": 0}

for binary_column in ("past_complaint", "offer_application_preference", "used_special_discount"):
    df[binary_column] = df[binary_column].map(yes_map)


In [None]:
# Signed integer code for gender: F -> 1, Unknown -> 0, M -> -1.
gender_map = dict(F=1, Unknown=0, M=-1)

gender_codes = df["gender"].map(gender_map)
df["gender"] = gender_codes


In [None]:
# Give missing categories an explicit "Missing" level.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form is chained assignment, which warns on recent pandas
# and leaves df unchanged once Copy-on-Write is active.
df["region_category"] = df["region_category"].fillna("Missing")

df["preferred_offer_types"] = df["preferred_offer_types"].fillna("Missing")


In [None]:
# Signed integer code for the referral flag; "?" is kept as the neutral value 0.
join_map = {
    "Yes": 1,
    "?": 0,
    "No": -1,
}

referral_codes = df["joined_through_referral"].map(join_map)
df["joined_through_referral"] = referral_codes


In [None]:
# Integer code for the internet option: Fiber_Optic -> -1, Mobile_Data -> 0, Wi-Fi -> 1.
opt_map = {
    "Fiber_Optic": -1,
    "Mobile_Data": 0,
    "Wi-Fi": 1,
}

internet_codes = df["internet_option"].map(opt_map)
df["internet_option"] = internet_codes


In [None]:
# Replace the raw joining date with "user_history": 2017 minus the joining year.
joining_year = pd.to_datetime(df["joining_date"]).dt.year
df["user_history"] = 2017 - joining_year
df.drop(columns="joining_date", inplace=True)
df["user_history"]

In [None]:
# The literal string "Error" in this column is swapped for the numeric sentinel -999.
df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace({"Error": -999})


In [None]:
# Cast the column to a float dtype.
login_days = df["avg_frequency_login_days"]
df["avg_frequency_login_days"] = login_days.astype(float)


In [None]:
# Missing wallet points get the numeric sentinel -999.
# The result is assigned back rather than calling fillna(..., inplace=True) on the
# selected column, which is chained assignment and does not update df under
# pandas Copy-on-Write.
df["points_in_wallet"] = df["points_in_wallet"].fillna(-999)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows.
# The split is shuffled with a fixed seed and stratified on the target.
y = df["churn_risk_score"]
X = df.drop(columns="churn_risk_score")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Numeric columns that are passed to winsorize().
column_list = [
    "age",
    "avg_time_spent",
    "avg_transaction_value",
    "avg_frequency_login_days",
    "points_in_wallet",
]

In [None]:
def winsorize(column_list, train=None, test=None, lower=5, upper=95):
    """Clip columns to percentile bounds computed on the training frame.

    For every column in ``column_list`` the ``lower`` and ``upper`` percentiles are
    computed on ``train`` only, and both ``train`` and ``test`` are clipped to those
    bounds in place. The test frame never contributes to the bounds.

    Parameters
    ----------
    column_list : iterable of str
        Names of the columns to clip.
    train, test : pandas.DataFrame, optional
        Frames to modify in place. When omitted, the notebook-level ``X_train`` and
        ``X_test`` are used, so the existing ``winsorize(column_list)`` call still works.
    lower, upper : float, default 5 and 95
        Percentiles (0-100) used as the clipping bounds.

    Returns
    -------
    None
        The frames are updated in place.
    """
    # Fall back to the notebook globals only when no frame was passed explicitly.
    if train is None:
        train = X_train
    if test is None:
        test = X_test

    for column in column_list:
        # nanpercentile ignores missing values; np.percentile would return NaN bounds
        # as soon as the column contains a single NaN.
        lower_boundary, upper_boundary = np.nanpercentile(train[column], [lower, upper])
        train[column] = train[column].clip(lower_boundary, upper_boundary)
        test[column] = test[column].clip(lower_boundary, upper_boundary)


In [None]:
# Clip the listed numeric columns of the train/test frames.
winsorize(column_list=column_list)

In [None]:
# One-hot encode the remaining categorical columns.
# The two frames are encoded independently, so a category that is absent from one split
# would give them different columns. The test frame is therefore aligned to the training
# columns: dummies missing from test are added as all-zero columns, and dummies that
# exist only in test are dropped, since the model never sees them during fitting.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


In [None]:
# First pass: fit a multiclass LightGBM model and report metrics on both splits.
# NOTE(review): (X_test, y_test) is part of eval_set while early stopping is enabled, so
# the held-out split also decides when training stops and the test scores printed below
# are not from fully unseen data.
eval_set = [(X_train, y_train), (X_test, y_test)]

# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0,
# which is not set here - confirm whether row subsampling is actually active.
lgbm_params = dict(
    objective="multiclass",
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.6,
    max_depth=7,
    num_leaves=127,
    learning_rate=0.03,
    min_child_weight=12,
)
lgbm = LGBMClassifier(**lgbm_params)

lgbm.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="logloss",
    early_stopping_rounds=5,
    verbose=False,
)

y_pred_test = lgbm.predict(X_test)
y_pred_train = lgbm.predict(X_train)

# Per-class reports first, then the macro-averaged F1 for each split.
for report_header, y_true, y_hat in (
    ("train: \n", y_train, y_pred_train),
    ("test: \n", y_test, y_pred_test),
):
    print(report_header, classification_report(y_true, y_hat))

for f1_header, y_true, y_hat in (
    ("Train f1: \n", y_train, y_pred_train),
    ("Test f1: \n", y_test, y_pred_test),
):
    print(f1_header, f1_score(y_true, y_hat, average="macro"))

In [None]:
# Pair every training column with its importance from the fitted model
# (column 0 = feature name, column 1 = importance) and show the ten smallest.
feature_importances = pd.DataFrame(
    list(zip(X_train.columns, lgbm.feature_importances_))
)
feature_importances.sort_values(by=1).head(10)

In [None]:
# Remove ten one-hot columns from both splits, keeping train and test in step.
# Presumably these are the lowest-ranked features from the importance table - verify.
low_importance_columns = [
    "preferred_offer_types_Missing",
    "complaint_status_Unsolved",
    "medium_of_operation_Both",
    "complaint_status_Solved in Follow-up",
    "complaint_status_No Information Available",
    "region_category_Village",
    "region_category_Missing",
    "complaint_status_Solved",
    "medium_of_operation_?",
    "complaint_status_Not Applicable",
]

for dropped_column in low_importance_columns:
    del X_train[dropped_column]
    del X_test[dropped_column]

In [None]:
# Second pass: the same hyper-parameters, refit on the columns X_train/X_test hold now.
# NOTE(review): the held-out split is in eval_set with early stopping enabled, so it
# also influences when training stops.
eval_set = [(X_train, y_train), (X_test, y_test)]

lgbm_params = dict(
    objective="multiclass",
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.6,
    max_depth=7,
    num_leaves=127,
    learning_rate=0.03,
    min_child_weight=12,
)
lgbm = LGBMClassifier(**lgbm_params)

lgbm.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="logloss",
    early_stopping_rounds=5,
    verbose=False,
)

y_pred_test = lgbm.predict(X_test)
y_pred_train = lgbm.predict(X_train)

# Per-class reports first, then the macro-averaged F1 for each split.
for report_header, y_true, y_hat in (
    ("train: \n", y_train, y_pred_train),
    ("test: \n", y_test, y_pred_test),
):
    print(report_header, classification_report(y_true, y_hat))

for f1_header, y_true, y_hat in (
    ("Train f1: \n", y_train, y_pred_train),
    ("Test f1: \n", y_test, y_pred_test),
):
    print(f1_header, f1_score(y_true, y_hat, average="macro"))

In [None]:
# Rebuild the (feature name, importance) table for the current model and list the ten
# least important features (column 0 = name, column 1 = importance).
feature_importances = pd.DataFrame(
    list(zip(X_train.columns, lgbm.feature_importances_))
)
feature_importances.sort_values(by=1).head(10)

In [None]:
# Remove ten more columns from both splits, keeping train and test in step.
# Presumably chosen from the latest importance ranking - verify.
more_low_importance_columns = [
    "medium_of_operation_Desktop",
    "preferred_offer_types_Gift Vouchers/Coupons",
    "preferred_offer_types_Without Offers",
    "medium_of_operation_Smartphone",
    "preferred_offer_types_Credit/Debit Card Offers",
    "region_category_Town",
    "region_category_City",
    "offer_application_preference",
    "used_special_discount",
    "feedback_No reason specified",
]

for dropped_column in more_low_importance_columns:
    del X_train[dropped_column]
    del X_test[dropped_column]

In [None]:
# Third pass: a more heavily regularised configuration (lower learning rate, larger
# min_child_weight, L1/L2 penalties) fit on the columns X_train/X_test hold now.
# NOTE(review): the held-out split is in eval_set with early stopping enabled, so it
# also influences when training stops.
eval_set = [(X_train, y_train), (X_test, y_test)]

lgbm_params = dict(
    objective="multiclass",
    n_estimators=1000,
    subsample=0.7,
    colsample_bytree=0.5,
    max_depth=8,
    num_leaves=255,
    learning_rate=0.01,
    min_child_weight=200,
    reg_alpha=2,
    reg_lambda=3,
)
lgbm = LGBMClassifier(**lgbm_params)

lgbm.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="logloss",
    early_stopping_rounds=3,
    verbose=False,
)

y_pred_test = lgbm.predict(X_test)
y_pred_train = lgbm.predict(X_train)

# Per-class reports first, then the macro-averaged F1 for each split.
for report_header, y_true, y_hat in (
    ("train: \n", y_train, y_pred_train),
    ("test: \n", y_test, y_pred_test),
):
    print(report_header, classification_report(y_true, y_hat))

for f1_header, y_true, y_hat in (
    ("Train f1: \n", y_train, y_pred_train),
    ("Test f1: \n", y_test, y_pred_test),
):
    print(f1_header, f1_score(y_true, y_hat, average="macro"))