In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

### checking missing %

In [3]:
miss_cols_train=train.columns[train.isna().sum()>0]
miss_cols_test=test.columns[test.isna().sum()>0]

In [4]:
round(train[miss_cols_train].isna().sum().sort_values(ascending=False)/len(train),3)

region_category          0.147
points_in_wallet         0.093
preferred_offer_types    0.008
dtype: float64

In [5]:
round(test[miss_cols_test].isna().sum().sort_values(ascending=False)/len(train),3)

region_category          0.080
points_in_wallet         0.053
preferred_offer_types    0.004
dtype: float64

In [6]:
train.shape, test.shape

((36992, 25), (19919, 24))

In [7]:
train.head().T

Unnamed: 0,0,1,2,3,4
customer_id,fffe4300490044003600300030003800,fffe43004900440032003100300035003700,fffe4300490044003100390032003600,fffe43004900440036003000330031003600,fffe43004900440031003900350030003600
Name,Pattie Morrisey,Traci Peery,Merideth Mcmeen,Eufemia Cardwell,Meghan Kosak
age,18,32,44,37,31
gender,F,F,F,M,F
security_no,XW0DQ7H,5K0N3X1,1F2TCL3,VJGJ33N,SVZXCWB
region_category,Village,City,Town,City,City
membership_category,Platinum Membership,Premium Membership,No Membership,No Membership,No Membership
joining_date,2017-08-17,2017-08-28,2016-11-11,2016-10-29,2017-09-12
joined_through_referral,No,?,Yes,Yes,No
referral_id,xxxxxxxx,CID21329,CID12313,CID3793,xxxxxxxx


### treating joined_through_referral

In [8]:
train['joined_through_referral'].value_counts()

No     15839
Yes    15715
?       5438
Name: joined_through_referral, dtype: int64

In [9]:
train['referral_id'][train['joined_through_referral']=='?'].value_counts()

xxxxxxxx    561
CID15792      5
CID57328      4
CID1601       4
CID52976      4
           ... 
CID42136      1
CID39430      1
CID25288      1
CID61365      1
CID15024      1
Name: referral_id, Length: 4174, dtype: int64

In [10]:
train.loc[(train['referral_id']=="xxxxxxxx") & (train['joined_through_referral']=="?"),'joined_through_referral']="No"
train.loc[(train['referral_id']!="xxxxxxxx") & (train['joined_through_referral']=="?"),'joined_through_referral']="Yes"

In [11]:
test.loc[(test['referral_id']=="xxxxxxxx") & (test['joined_through_referral']=="?"),'joined_through_referral']="No"
test.loc[(test['referral_id']!="xxxxxxxx") & (test['joined_through_referral']=="?"),'joined_through_referral']="Yes"

### dropping useless columns

In [12]:
test_ids=test['customer_id']

In [13]:
train=train.drop(columns=['Name','security_no','referral_id','last_visit_time','customer_id'])
test=test.drop(columns=['Name','security_no','referral_id','last_visit_time','customer_id'])

### checking categorical distributions

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           36992 non-null  int64  
 1   gender                        36992 non-null  object 
 2   region_category               31564 non-null  object 
 3   membership_category           36992 non-null  object 
 4   joining_date                  36992 non-null  object 
 5   joined_through_referral       36992 non-null  object 
 6   preferred_offer_types         36704 non-null  object 
 7   medium_of_operation           36992 non-null  object 
 8   internet_option               36992 non-null  object 
 9   days_since_last_login         36992 non-null  int64  
 10  avg_time_spent                36992 non-null  float64
 11  avg_transaction_value         36992 non-null  float64
 12  avg_frequency_login_days      36992 non-null  object 
 13  p

### segregrating categorical and numerical columns

In [15]:
cat=[]
num=[]
for i in test.columns:
    if test[i].dtype=='O':
        cat.append(i)
    else:
        num.append(i)

In [16]:
cat

['gender',
 'region_category',
 'membership_category',
 'joining_date',
 'joined_through_referral',
 'preferred_offer_types',
 'medium_of_operation',
 'internet_option',
 'avg_frequency_login_days',
 'used_special_discount',
 'offer_application_preference',
 'past_complaint',
 'complaint_status',
 'feedback']

In [17]:
num

['age',
 'days_since_last_login',
 'avg_time_spent',
 'avg_transaction_value',
 'points_in_wallet']

### checking categorical distribution

In [18]:
for i in cat:
    print("\n",i,"\n","-"*40)
    print(train[i].value_counts())


 gender 
 ----------------------------------------
F          18490
M          18443
Unknown       59
Name: gender, dtype: int64

 region_category 
 ----------------------------------------
Town       14128
City       12737
Village     4699
Name: region_category, dtype: int64

 membership_category 
 ----------------------------------------
Basic Membership       7724
No Membership          7692
Gold Membership        6795
Silver Membership      5988
Premium Membership     4455
Platinum Membership    4338
Name: membership_category, dtype: int64

 joining_date 
 ----------------------------------------
2015-06-02    55
2015-07-04    51
2015-06-21    50
2016-08-03    49
2015-06-26    49
              ..
2016-03-16    19
2016-06-03    18
2015-09-12    18
2017-07-03    18
2015-03-04    16
Name: joining_date, Length: 1096, dtype: int64

 joined_through_referral 
 ----------------------------------------
Yes    20592
No     16400
Name: joined_through_referral, dtype: int64

 preferred_offer_

### checking numerical distribution

In [19]:
train.describe(percentiles=[.01,.1,.25,.5,.75,.9,.99])

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,points_in_wallet,churn_risk_score
count,36992.0,36992.0,36992.0,36992.0,33549.0,36992.0
mean,37.118161,-41.915576,243.472334,29271.194003,686.882199,3.463397
std,15.867412,228.8199,398.289149,19444.806226,194.063624,1.409661
min,10.0,-999.0,-2814.10911,800.46,-760.661236,-1.0
1%,10.0,-999.0,-948.781579,1325.1279,113.627063,-1.0
10%,15.0,3.0,32.29,6198.329,512.168,1.0
25%,23.0,8.0,60.1025,14177.54,616.15,3.0
50%,37.0,12.0,161.765,27554.485,697.62,4.0
75%,51.0,16.0,356.515,40855.11,763.95,5.0
90%,59.0,20.0,702.668,48789.279,829.004,5.0


### treating churn_risk_score

In [20]:
train['churn_risk_score'].value_counts()

 3    10424
 4    10185
 5     9827
 2     2741
 1     2652
-1     1163
Name: churn_risk_score, dtype: int64

In [21]:
train=train[train['churn_risk_score']!=-1]

### converting date to days_since and dropping dates

In [22]:
import datetime

In [23]:
train_date=pd.to_datetime(train['joining_date'])
test_date=pd.to_datetime(test['joining_date'])
max_date=max(pd.to_datetime(train['joining_date']))

In [24]:
train_days=(max_date-train_date).apply(lambda x: x.days)
test_days=(max_date-test_date).apply(lambda x: x.days)

In [25]:
train['days_since_join']=train_days
test['days_since_join']=test_days

In [26]:
train.drop(columns='joining_date',inplace=True)
test.drop(columns='joining_date',inplace=True)

### treating avg_frequency_login_days

In [27]:
train['avg_frequency_login_days'].replace({"Error":"-999"},inplace=True)
test['avg_frequency_login_days'].replace({"Error":"-999"},inplace=True)

In [28]:
train['avg_frequency_login_days']=train['avg_frequency_login_days'].astype('float')
test['avg_frequency_login_days']=test['avg_frequency_login_days'].astype('float')

train['avg_frequency_login_days']=train['avg_frequency_login_days'].astype('int32')
test['avg_frequency_login_days']=test['avg_frequency_login_days'].astype('int32')

### treating missing values

In [29]:
train.fillna({'region_category':'Other', 'preferred_offer_types':'Other', 'points_in_wallet':train['points_in_wallet'].mean()}, inplace=True)
test.fillna({'region_category':'Other', 'preferred_offer_types':'Other', 'points_in_wallet':train['points_in_wallet'].mean()}, inplace=True)

### creating new features

In [30]:
def feed(val):
    if val in ['Poor Product Quality','Too many ads','Poor Website','Poor Customer Service']:
        return "Negative"
    elif val in ['No reason specified']:
        return "Unknown"
    else:
        return "Positive"

In [31]:
train['feed']=train['feedback'].apply(lambda x:feed(x))

In [32]:
test['feed']=test['feedback'].apply(lambda x:feed(x))

In [33]:
train['internet_option'].replace({'Wi-Fi':2,'Mobile_Data':1,'Fiber_Optic':3},inplace=True)

In [34]:
train['membership_category'].replace({'No Membership':0,'Basic Membership':1,'Silver Membership':2,
                                      'Gold Membership':3,'Platinum Membership':4,'Premium Membership':5}
                                     ,inplace=True)

In [35]:
test['internet_option'].replace({'Wi-Fi':2,'Mobile_Data':1,'Fiber_Optic':3},inplace=True)

In [36]:
test['membership_category'].replace({'No Membership':0,'Basic Membership':1,'Silver Membership':2,
                                      'Gold Membership':3,'Platinum Membership':4,'Premium Membership':5}
                                   ,inplace=True)

### tranformation

In [37]:
y=train['churn_risk_score']
train_v1=train.drop(columns='churn_risk_score')

In [38]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [39]:
ohe=OneHotEncoder()

In [40]:
cat=[]
for i in test.columns:
    if test[i].dtype=='O':
        cat.append(i)

In [41]:
ct=ColumnTransformer([("ohe",ohe,cat)],
                    remainder='passthrough',
                    n_jobs=-1)

In [42]:
train_arr=ct.fit_transform(train_v1)

In [43]:
test_arr=ct.transform(test)

In [44]:
train_arr.shape, test_arr.shape, len(y)

((35829, 49), (19919, 49), 35829)

### FINALLY MODEL

In [45]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [46]:
rfc=RandomForestClassifier(n_jobs=-1,random_state=42)

In [47]:
lgbm=LGBMClassifier(random_state=42,n_jobs=-1)

In [48]:
xgb=XGBClassifier(random_state=42,n_jobs=-1)

### cross_validation_f1_score

In [49]:
def make_scorer(estimator,X,y):
    y_pred=estimator.predict(X)
    return f1_score(y,y_pred,average="macro")

In [50]:
cross_val_score(rfc,train_arr,y,cv=5,scoring=make_scorer).mean()

0.7646810522794987

In [51]:
cross_val_score(xgb,train_arr,y,cv=5,scoring=make_scorer).mean()

0.7668140064587012

In [52]:
cross_val_score(lgbm,train_arr,y,cv=5,scoring=make_scorer).mean()

0.766260632919091

In [53]:
# xgb.fit(train_arr,y)
# xgb_vals=xgb.predict(test_arr)
# res_xgb_2=pd.DataFrame({"customer_id":test_ids,"churn_risk_score":xgb_vals})
# res_xgb_2.to_csv('res_xgb_2.csv',index=False)

In [54]:
# lgbm.fit(train_arr,y)
# lgbm_vals=lgbm.predict(test_arr)
# res_lgbm_2=pd.DataFrame({"customer_id":test_ids,"churn_risk_score":lgbm_vals})
# res_lgbm_2.to_csv('res_lgbm_2.csv',index=False)

In [55]:
# rfc.fit(train_arr,y)
# rfc_vals=rfc.predict(test_arr)
# res_rfc_3=pd.DataFrame({"customer_id":test_ids,"churn_risk_score":rfc_vals})
# res_rfc_3.to_csv('res_rfc_3.csv',index=False)

### GRID SEARCH OF XGB

In [61]:
grid_xgb={
    'max_depth':[6,8,10],
    'learning_rate':[0.01,0.03],
    'booster':['gbtree','dart']
}

xgb_g=GridSearchCV(estimator=xgb,
                  param_grid=grid_xgb,
                  cv=5,
                  verbose=2,
                  n_jobs=-1)

In [62]:
xgb_g.fit(train_arr,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 14.5min finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=None, random_state=42,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, val

In [63]:
xgb_g.best_params_

{'booster': 'gbtree', 'learning_rate': 0.03, 'max_depth': 6}

In [65]:
xgb_best=XGBClassifier(learning_rate=0.03,n_jobs=-1,random_state=42)

In [66]:
cross_val_score(xgb_best,train_arr,y,cv=5,scoring=make_scorer).mean()

0.7742482216274017

## Solution CSV

In [67]:
xgb_best.fit(train_arr,y)
xgb_vals=xgb_best.predict(test_arr)
res_xgb_2=pd.DataFrame({"customer_id":test_ids,"churn_risk_score":xgb_vals})
res_xgb_2.to_csv('res_xgb_3.csv',index=False)

### GRID SEARCH OF LGBM

In [64]:
# grid={
#     'boosting_type':['gbdt','rf','dart'],
#     'n_estimators':[100,140,180,220,260],
#     'learning_rate':[0.01,0.03,0.1],
#     'num_leaves':[31,40,60,80,100]
# }

# lgbm_g=GridSearchCV(estimator=lgbm,
#                     param_grid=grid,
#                     scoring=make_scorer,
#                     n_jobs=-1,
#                     cv=5,
#                     verbose=2
#                    )

In [65]:
# lgbm_g.fit(train_arr,y)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 22.3min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed: 31.2min finished


GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=42), n_jobs=-1,
             param_grid={'boosting_type': ['gbdt', 'rf', 'dart'],
                         'learning_rate': [0.01, 0.03, 0.1],
                         'n_estimators': [100, 140, 180, 220, 260],
                         'num_leaves': [31, 40, 60, 80, 100]},
             scoring=<function make_scorer at 0x7fa19d56aa60>, verbose=2)

In [66]:
# lgbm_g.best_params_

{'boosting_type': 'dart',
 'learning_rate': 0.1,
 'n_estimators': 260,
 'num_leaves': 60}