In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

pd.plotting.register_matplotlib_converters()

In [None]:
# Plot configuration for the whole notebook.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Apply the seaborn theme; color_codes=True is passed so matplotlib's one-letter
# colour codes follow the seaborn palette.
sns.set(color_codes=True)
# Build a 10-colour "Set2" palette and make it the default for later plots.
pal = sns.color_palette("Set2", 10)
sns.set_palette(pal)

In [None]:
# Load the labelled training set and the unlabelled test set from the
# competition input directory (paths are relative to the notebook location).
train = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/train.csv')
test = pd.read_csv('../input/hackerearth-how-not-to-lose-a-customer-in-10-days/test.csv')

In [None]:
# (rows, columns) of the raw training frame.
train.shape

# EDA / Data Preparation

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Number of missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Print a frequency table (most frequent first) for every column of `train`
# that is neither float64 nor one of the explicitly skipped columns.
skipped_columns = ('customer_id', 'Name', 'security_no', 'referral_id',
                   'last_visit_time', 'joining_date', 'avg_frequency_login_days')
for name in train.columns:
    if name in skipped_columns or train[name].dtype == 'float64':
        continue
    counts = train.groupby(name)[name].count().sort_values(ascending=False)
    df = pd.DataFrame({name: counts.index, 'count': counts.values})
    print(df.to_string(index=False))
    print(".............................")

In [None]:
# Print a frequency table (most frequent first) for every column of `test`
# that is neither float64 nor one of the explicitly skipped columns.
# The dtype filter inspects `test` itself, so the decision is based on the
# frame that is actually being summarised.
skipped_columns = ('customer_id', 'Name', 'security_no', 'referral_id',
                   'last_visit_time', 'joining_date', 'avg_frequency_login_days')
for col in test.columns:
    if col in skipped_columns or test[col].dtype == 'float64':
        continue
    x = test.groupby(col)[col].count().sort_values(ascending=False)
    df = pd.DataFrame({col: x.index, 'count': x.values})

    print(df.to_string(index=False))
    print(".............................")

## Handling Garbage Values

In [None]:
## Dropping rows with -1 churn risk

# Rows labelled -1 are removed before training.
# A boolean mask keeps the remaining index labels unchanged (as DataFrame.drop
# did) and, unlike a positional lookup over range(len(train)), does not fail
# when the cell is executed a second time.
# .copy() makes `train` an independent frame so later column assignments do
# not operate on a slice of the original.
train = train[train['churn_risk_score'] != -1].copy()


In [None]:
# Work on the rows whose login frequency is not the literal string 'Error',
# so the column can be cast to float.
# .copy() detaches `ab` from `train`; without it the assignment below writes
# into a slice and pandas emits SettingWithCopyWarning.
ab = train[train['avg_frequency_login_days'] != 'Error'].copy()
ab['avg_frequency_login_days'] = ab['avg_frequency_login_days'].astype('float64')
# Mean of the strictly positive values only.
posmean = ab.loc[ab['avg_frequency_login_days'] > 0, 'avg_frequency_login_days'].mean()

In [None]:
# KDE of avg_frequency_login_days on `ab` (rows with 'Error' removed),
# one curve per churn_risk_score.
sns.displot(ab, x="avg_frequency_login_days", kind="kde",hue='churn_risk_score')

In [None]:
## Handling avg_frequency_login_days error value and negative values

def _clean_login_frequency(values, fill_value):
    """Return `values` as float64 with invalid entries replaced by `fill_value`.

    An entry is invalid when it is the literal string 'Error' or when its
    numeric value is negative. Missing entries are left as NaN. Any other
    non-numeric text still raises during the float conversion.
    """
    # Replace the marker first so the remaining entries can be cast to float.
    numeric = values.mask(values == 'Error', fill_value).astype('float64')
    # Compare numerically instead of searching the text for a '-' character.
    return numeric.mask(numeric < 0, fill_value)


# Both frames use the positive mean computed on the training data (`posmean`).
train['avg_frequency_login_days'] = _clean_login_frequency(train['avg_frequency_login_days'], posmean)
test['avg_frequency_login_days'] = _clean_login_frequency(test['avg_frequency_login_days'], posmean)

In [None]:
# KDE of avg_frequency_login_days on the full training frame after cleaning,
# one curve per churn_risk_score.
sns.displot(train, x="avg_frequency_login_days", kind="kde",hue='churn_risk_score')

In [None]:
## With whole Data

# KDE of days_since_last_login over every training row, one curve per
# churn_risk_score; no rows are filtered out here.
sns.displot(train, x="days_since_last_login", kind="kde",hue='churn_risk_score')

In [None]:
#Without Garbage Value

# Keep only the rows with a strictly positive days_since_last_login and plot
# the per-class KDE for that subset.
has_positive_days = train['days_since_last_login'].gt(0)
dsllp = train.loc[has_positive_days]
sns.displot(data=dsllp, x="days_since_last_login", hue='churn_risk_score', kind="kde")

In [None]:
# Fit a normal distribution to the positive days_since_last_login values and
# draw a pool of 100000 samples from it, truncated to integers.
valid_days = dsllp['days_since_last_login']
mu = valid_days.mean()
sigma = valid_days.std(ddof=0)  # population standard deviation
np.random.seed(13)
s = np.random.normal(loc=mu, scale=sigma, size=100000).astype('int64')

### Handling days_since_last_login -999 values: we replace them with draws from a normal distribution fitted to the valid values

In [None]:
## Handling days_since_last_login -999

# Every -999 entry is replaced by a value drawn from `s` (the normal sample
# pool built in the previous cell), restricted to the range 1..26.
MISSING_LOGIN_MARKER = -999
MAX_LOGIN_DAYS = 26

# Filtering the pool once gives the same distribution as drawing from `s`
# repeatedly until a value inside the range comes up.
login_day_pool = s[(s > 0) & (s <= MAX_LOGIN_DAYS)]

# A dedicated seeded generator makes the imputation reproducible; seeding
# NumPy's global state has no effect on the stdlib `random` module.
login_rng = np.random.default_rng(13)

for frame in (train, test):
    days = frame['days_since_last_login']
    # One candidate per row; only the rows holding the marker actually use it.
    candidates = pd.Series(login_rng.choice(login_day_pool, size=len(frame)), index=frame.index)
    frame['days_since_last_login'] = days.mask(days == MISSING_LOGIN_MARKER, candidates)

In [None]:
# After Handling Garbage Values

# Same per-class KDE as before, now drawn after the -999 entries were replaced.
sns.displot(train, x="days_since_last_login", kind="kde",hue='churn_risk_score')

WE HAVE SUCCESSFULLY SYNTHESISED THE -999 DATA

## Handling Missing values

In [None]:
## For Points in Wallet

### Before Handling missing values

# KDE of points_in_wallet per churn_risk_score, before any imputation.
sns.displot(train, x="points_in_wallet", kind="kde",hue='churn_risk_score')

Customers with a lower churn risk score tend to have more points in their wallet.

In [None]:
## Checking various distributions for selecting Imputation method

# For each 100-point band of points_in_wallet, print the row count and the
# percentage of rows per churn_risk_score.
# Bands are half-open, [lower, lower + 100), so a value lying exactly on a
# boundary (0, 100, 200, ...) is counted in exactly one band instead of none.
print('-------------------------------')
for lower in range(0, 1400, 100):
    upper = lower + 100
    in_band = train[(train['points_in_wallet'] >= lower) & (train['points_in_wallet'] < upper)]
    print('From '+str(lower)+' to '+str(upper))
    print('Total no. of observations:', in_band.shape[0])
    a = in_band.groupby('churn_risk_score').churn_risk_score.count()
    b = pd.DataFrame({'Churn Risk Score': a.index, '%age':a.values*100/a.values.sum()})
    print(b.to_string(index=False))

    print('-------------------------------')

In [None]:
# Churn-score percentages for the rows whose points_in_wallet is either
# missing or negative (both groups are summarised together).
print('For Missing Values')
missing_or_negative = train['points_in_wallet'].isnull() | (train['points_in_wallet'] < 0)
flagged_rows = train[missing_or_negative]
print('Total no. of observations:', flagged_rows.shape[0])
a = flagged_rows.groupby('churn_risk_score')['churn_risk_score'].count()
b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
print(b.to_string(index=False))

The churn-score distribution of the rows with missing `points_in_wallet` matches that of the rows with values below 0, so it is possible that the missing values correspond to 0 points.

In [None]:
## Imputing points_in_wallet

# Missing and negative balances are replaced by random draws from
# Uniform(500, 1000).
# Missing values are detected with isna() rather than through a set-membership
# test, which only worked because NaN never compares equal to anything.
# A dedicated seeded generator keeps the draws reproducible.
wallet_rng = np.random.default_rng(13)

for frame in (train, test):
    balance = frame['points_in_wallet']
    invalid = balance.isna() | (balance < 0)
    # One candidate per row; only the invalid rows actually use it.
    candidates = pd.Series(wallet_rng.uniform(500, 1000, size=len(frame)), index=frame.index)
    frame['points_in_wallet'] = balance.mask(invalid, candidates)

In [None]:
## After Handling Missing Values

# KDE of points_in_wallet per churn_risk_score after the imputation step.
sns.displot(train, x="points_in_wallet", kind="kde",hue='churn_risk_score')


In [None]:
## Handling Region Category
# For each known region category, print the row count and the percentage of
# rows per churn_risk_score.
for region in ('Town', 'City', 'Village'):
    region_rows = train[train['region_category'] == region]
    a = region_rows.groupby('churn_risk_score')['churn_risk_score'].count()
    b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
    print('For ', region)
    print('Total no. of observations : ', region_rows.shape[0])
    print(b.to_string(index=False))
    print('-------------------------------')

In [None]:
# Percentage of rows per churn_risk_score among the rows without a region category.
region_is_missing = train['region_category'].isnull()
a = train.loc[region_is_missing].groupby('churn_risk_score')['churn_risk_score'].count()
b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
print('For missing values')
print(b.to_string(index=False))

### The distribution matches the Town and City data, so we impute the most frequent category, Town

In [None]:
## Imputing Town

# fillna() replaces every missing entry. A membership test such as
# `value in [np.nan]` only matches when the stored value is the very same NaN
# object, because NaN never compares equal to itself.
train['region_category'] = train['region_category'].fillna('Town')
test['region_category'] = test['region_category'].fillna('Town')



In [None]:
# For each preferred_offer_types value, including the group of rows where the
# value is missing, print the row count and the percentage of rows per
# churn_risk_score.
# unique() yields the values in first-appearance order (stable between runs)
# and pd.isna() recognises a missing value whichever NaN object is stored.
for i in train['preferred_offer_types'].unique():
    if pd.isna(i):
        offer_rows = train[train['preferred_offer_types'].isnull()]
    else:
        offer_rows = train[train['preferred_offer_types'] == i]
    a = offer_rows.groupby('churn_risk_score').churn_risk_score.count()
    b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
    print('For ', i)
    print('Total no. of observations : ', offer_rows.shape[0])
    print(b.to_string(index=False))
    print('-------------------------------')

### The missing data is most similar to 'Without Offers', which also makes sense

In [None]:
## Imputing Without Offers

# fillna() replaces every missing entry. A membership test such as
# `value in [np.nan]` only matches when the stored value is the very same NaN
# object, because NaN never compares equal to itself.
train['preferred_offer_types'] = train['preferred_offer_types'].fillna('Without Offers')
test['preferred_offer_types'] = test['preferred_offer_types'].fillna('Without Offers')

In [None]:
# Re-check the churn-score percentages of the 'Without Offers' group after
# the missing values were imputed into it.
offer_label = 'Without Offers'
offer_rows = train[train['preferred_offer_types'] == offer_label]
a = offer_rows.groupby('churn_risk_score').churn_risk_score.count()
b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
# Print the category being summarised rather than whatever value a loop
# variable from an earlier cell happens to hold.
print('For ', offer_label)
print('Total no. of observations : ', offer_rows.shape[0])
print(b.to_string(index=False))
print('-------------------------------')

### Perfectly Synthesised Missing Values

## Some values are '?' as well; let's handle them

In [None]:
# List every training column in which the placeholder string '?' occurs.
columns_with_placeholder = [name for name in train.columns if '?' in set(train[name])]
for name in columns_with_placeholder:
    print(name)

In [None]:
# For each distinct joined_through_referral value, print the row count and
# the percentage of rows per churn_risk_score.
for answer in set(train['joined_through_referral']):
    answer_rows = train[train['joined_through_referral'] == answer]
    a = answer_rows.groupby('churn_risk_score')['churn_risk_score'].count()
    b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
    print('For ', answer)
    print('Total no. of observations : ', answer_rows.shape[0])
    print(b.to_string(index=False))
    print('-------------------------------')

### The '?' group is similar to both Yes and No, but closer to Yes. To avoid disturbing the balance, we impute Yes at random with probability 0.66 (and No otherwise)

In [None]:
# Imputing Yes

# Each '?' entry is replaced at random: 'Yes' with probability 2/3 and 'No'
# with probability 1/3.
# A dedicated seeded generator keeps the draws reproducible; seeding NumPy's
# global state has no effect on the stdlib `random` module.
referral_rng = np.random.default_rng(13)

for frame in (train, test):
    answers = frame['joined_through_referral']
    # One candidate per row; only the '?' rows actually use it.
    candidates = pd.Series(
        referral_rng.choice(['Yes', 'No'], size=len(frame), p=[2/3, 1/3]),
        index=frame.index,
    )
    frame['joined_through_referral'] = answers.mask(answers == '?', candidates)

In [None]:
## For medium of operation

# For each distinct medium_of_operation value, print the row count and the
# percentage of rows per churn_risk_score.
for medium in set(train['medium_of_operation']):
    medium_rows = train[train['medium_of_operation'] == medium]
    a = medium_rows.groupby('churn_risk_score')['churn_risk_score'].count()
    b = pd.DataFrame({'Churn Risk Score': a.index, '%age': a.values*100/a.values.sum()})
    print('For ', medium)
    print('Total no. of observations : ', medium_rows.shape[0])
    print(b.to_string(index=False))
    print('-------------------------------')

### The '?' group does not match any existing category specifically, so we assume it is another device such as a tablet/iPad or laptop

In [None]:
# Replace the '?' placeholder in medium_of_operation with 'Laptop' in both
# frames; every other value is kept as it is.
for frame in (train, test):
    medium = frame['medium_of_operation']
    frame['medium_of_operation'] = medium.mask(medium == '?', 'Laptop')

## Handling Date column

In [None]:
# Parse the year-month-day strings in joining_date into datetime values.
for frame in (train, test):
    frame['joining_date'] = pd.to_datetime(frame['joining_date'], format='%Y-%m-%d')

In [None]:
# Derive Day, Month and Year columns from joining_date in both frames.
# The vectorised .dt accessor replaces the element-by-element Python loops and
# removes the duplicated train/test code.
for frame in (train, test):
    joined = frame['joining_date'].dt
    frame['Day'] = joined.day
    frame['Month'] = joined.month
    frame['Year'] = joined.year

In [None]:
# Preview the first rows of the prepared training frame.
train.head()

In [None]:
# Preview the first rows of the prepared test frame.
test.head()

In [None]:
# Column dtypes and non-null counts after the data-preparation steps.
train.info()

## Data Prep done, now we will continue with Model Building

In [None]:
# Build the model inputs: the columns listed below are excluded from both
# frames, and the target column is additionally removed from the training
# features.
excluded_columns = ['customer_id', 'Name', 'security_no', 'joining_date',
                    'referral_id', 'last_visit_time']
y = train['churn_risk_score']
X = train.drop(columns=excluded_columns + ['churn_risk_score'])
X_test = test.drop(columns=excluded_columns)

In [None]:
# Names of the feature columns that are still stored with object dtype.
cat_cols1 = list(X.columns[X.dtypes == 'object'])

In [None]:
# Hand-written integer codes for the categorical columns that are encoded
# with an explicit order, keyed by column name.
ORDINAL_ENCODINGS = {
    ## region_category
    'region_category': {'Village':0, 'Town':1, 'City':2},
    ## membership_category
    'membership_category': {'No Membership':0, 'Basic Membership':1, 'Premium Membership':2,
                            'Silver Membership':3, 'Gold Membership':4, 'Platinum Membership':5},
    ## preferred_offer_types
    'preferred_offer_types': {'Without Offers':0, 'Credit/Debit Card Offers':1, 'Gift Vouchers/Coupons':2},
    ## internet_option
    'internet_option': {'Mobile_Data':0, 'Wi-Fi':1, 'Fiber_Optic':2},
    ## complaint_status
    'complaint_status': {'Unsolved':0, 'Not Applicable':1, 'No Information Available':2,
                         'Solved in Follow-up':3, 'Solved':4},
    ## feedback is collapsed to a binary flag (several labels share a code)
    'feedback': {'Reasonable Price':1, 'Quality Customer Care':1, 'Too many ads':0,
                 'User Friendly Website':1, 'Poor Customer Service':0, 'No reason specified':0,
                 'Products always in Stock':1, 'Poor Website':0, 'Poor Product Quality':0},
}


def _apply_ordinal_encoding(column, mapping):
    """Replace the labels of `column` in X and X_test by their codes in `mapping`.

    Raises:
        KeyError: if either frame holds a value (including a missing value)
            that has no entry in `mapping`.
    """
    for frame in (X, X_test):
        unknown = set(frame[column].unique()) - set(mapping)
        if unknown:
            # Fail loudly instead of letting .map() turn unknown labels into NaN.
            raise KeyError(f"{column}: no code defined for {sorted(map(str, unknown))}")
        frame[column] = frame[column].map(mapping)


for column, mapping in ORDINAL_ENCODINGS.items():
    _apply_ordinal_encoding(column, mapping)


In [None]:
# Split the feature names by dtype: object columns still need label encoding,
# everything else is already numeric.
is_object_column = X.dtypes == 'object'
cat_cols = list(X.columns[is_object_column])
num_cols = list(X.columns[~is_object_column])

In [None]:
# Positional indices (within X.columns) of the object-dtype columns.
cat_cols_n = [
    position
    for position, name in enumerate(X.columns)
    if X[name].dtype == 'object'
]

In [None]:
# Label-encode the remaining object columns. The encoder is fitted on the
# training column only and the same fitted classes are then applied to the
# test column; the single encoder instance is refitted for each column.
le = LabelEncoder()
for name in cat_cols:
    le.fit(X[name])
    X[name] = le.transform(X[name])
    X_test[name] = le.transform(X_test[name])

In [None]:
# Display the fully encoded training feature frame.
X

In [None]:
# Annotated heatmap of the pairwise correlations between the encoded features,
# with the colour scale fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(20, 10))
correlations = X.corr()
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, cmap='coolwarm', ax=ax)

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every feature and the target, shown as a
# horizontal bar chart.
# The estimator adds a small amount of random noise to continuous features,
# so a fixed random_state keeps the scores identical between runs.
plt.figure(figsize=(10,10))
imp = mutual_info_classif(X, y, random_state=42)
feat_imp = pd.Series(imp, X.columns)
feat_imp.plot(kind='barh', color='pink')
plt.show()

In [None]:
# Number of feature columns.
X.shape[1]

In [None]:
# Hold out 20% of the rows as a dev set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes turn out to be imbalanced.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2,random_state=42)

## Baseline Model

In [None]:
# Baseline: LightGBM with default settings, scored with macro-averaged F1 on
# the dev split.
lg = LGBMClassifier().fit(X_train, y_train)
y_pred_l = lg.predict(X_dev)
f1_score(y_true=y_dev, y_pred=y_pred_l, average='macro')

In [None]:
# Baseline: random forest with default settings, scored with macro-averaged
# F1 on the dev split.
# A fixed random_state makes the forest (and anything cloned from it later)
# produce the same result on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_r = rf.predict(X_dev)
f1_score(y_dev, y_pred_r, average='macro')

In [None]:
# Baseline: XGBoost with the multi-class probability objective, scored with
# macro-averaged F1 on the dev split.
# NOTE(review): y holds the raw churn_risk_score labels; some xgboost releases
# reject class labels that do not start at 0 — confirm against the installed
# version.
xg = XGBClassifier(objective = 'multi:softprob')
xg.fit(X_train,y_train)
y_pred_x = xg.predict(X_dev)
f1_score(y_dev,y_pred_x,average='macro')

## Feature Selection (Forward Selection)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

# Forward sequential feature selection wrapped around the LightGBM model,
# using all CPU cores; the selected column names are stored in `features`.
ffs = SequentialFeatureSelector(
    lg,
    k_features='best',
    forward=True,
    n_jobs=-1,
)
ffs.fit(X_train, y_train)
features = [*ffs.k_feature_names_]

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

# Forward sequential feature selection wrapped around the random forest,
# using all CPU cores; the selected column names are stored in `features2`.
ffs = SequentialFeatureSelector(
    rf,
    k_features='best',
    forward=True,
    n_jobs=-1,
)
ffs.fit(X_train, y_train)
features2 = [*ffs.k_feature_names_]

In [None]:
# Columns selected by the forward selection around the LightGBM model.
print(features)

In [None]:
# Columns selected by the forward selection around the random forest.
print(features2)

In [None]:
# Refit LightGBM on its selected columns only and score it on the dev split
# with macro-averaged F1.
X_train_selected = X_train[features]
X_dev_selected = X_dev[features]
lg.fit(X_train_selected, y_train)
y_pred_l = lg.predict(X_dev_selected)
f1_score(y_true=y_dev, y_pred=y_pred_l, average='macro')

In [None]:
# Refit the random forest on its selected columns only and score it on the
# dev split with macro-averaged F1.
X_train_selected2 = X_train[features2]
X_dev_selected2 = X_dev[features2]
rf.fit(X_train_selected2, y_train)
y_pred_r = rf.predict(X_dev_selected2)
f1_score(y_true=y_dev, y_pred=y_pred_r, average='macro')

In [None]:
# Predict the test set with LightGBM (selected columns) and write the
# submission file.
lgbm_test_scores = lg.predict(X_test[features])
df = pd.DataFrame({
    'customer_id': test['customer_id'],
    'churn_risk_score': lgbm_test_scores,
})
df.to_csv('submit.csv', index=False)

In [None]:
# Predict the test set with XGBoost (all columns) and write the submission file.
xgb_test_scores = xg.predict(X_test)
df2 = pd.DataFrame({
    'customer_id': test['customer_id'],
    'churn_risk_score': xgb_test_scores,
})
df2.to_csv('submit2.csv', index=False)

In [None]:
# Predict the test set with the random forest (selected columns) and write
# the submission file.
rf_test_scores = rf.predict(X_test[features2])
df3 = pd.DataFrame({
    'customer_id': test['customer_id'],
    'churn_risk_score': rf_test_scores,
})
df3.to_csv('submit3.csv', index=False)

In [None]:
# Blend the LightGBM and random-forest submissions: the integer (floor)
# average of the two predicted scores per customer.
blended_scores = (df['churn_risk_score'] + df3['churn_risk_score']) // 2
df4 = pd.DataFrame({
    'customer_id': test['customer_id'],
    'churn_risk_score': blended_scores,
})
df4.to_csv('submit4.csv', index=False)

## Will Keep updating, Stay tuned 