Churn - it is most commonly expressed as the percentage of service subscribers who discontinue their subscriptions within a given time period

customer_id	- Represents the unique identification number of a customer

Name - Represents the name of a customer

age - Represents the age of a customer

security_no	- Represents a unique security number that is used to identify a person

region_category	- Represents the region that a customer belongs to 

membership_category -	Represents the category of the membership that a customer is using

joining_date -	Represents the date when a customer became a member 

joined_through_referral	- Represents whether a customer joined using any referral code or ID

referral_id -	Represents a referral ID

preferred_offer_types -	Represents the type of offer that a customer prefers

medium_of_operation -	Represents the medium of operation that a customer uses for transactions

internet_option -	Represents the type of internet service a customer uses

last_visit_time -	Represents the last time a customer visited the website

days_since_last_login -	Represents the no. of days since a customer last logged into the website

avg_time_spent -	Represents the average time spent by a customer on the website

avg_transaction_value -	Represents the average transaction value of a customer

avg_frequency_login_days -	Represents the no. of times a customer has logged in to the website

points_in_wallet -	Represents the points awarded to a customer on each transaction 

used_special_discount -	Represents whether a customer uses special discounts offered

offer_application_preference - 	Represents whether a customer prefers offers 

past_complaint - Represents whether a customer has raised any complaints 

complaint_status - Represents whether the complaints raised by a customer were resolved

feedback - 	Represents the feedback provided by a customer

churn_risk_score -	Represents the churn risk score that ranges from 1 to 5

In [None]:
#pip install lux-api

In [None]:
#pip install pandasgui

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#import lux

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
# Call the documented top-level pd.set_option rather than going through pd.pandas.
pd.set_option('display.max_columns', 100)

In [None]:
train_data = pd.read_csv('../input/churn-risk-rate-hackerearth-ml/train.csv')
train_data

In [None]:
test_data = pd.read_csv('../input/churn-risk-rate-hackerearth-ml/test.csv')
test_data

In [None]:
test_customer_id = test_data['customer_id']

In [None]:
#Try
# Number of missing target values: .sum() adds up the True flags from isnull(),
# whereas .count() would just return the number of rows.
train_data['churn_risk_score'].isnull().sum()

In [None]:
# Concatenating train and test data
# Train = 1 marks rows that belong to the training data; Train = 0 marks rows that belong to the test data
train_data['Train'] = 1
test_data['Train'] = 0

In [None]:
new_data = pd.concat([train_data, test_data]).reset_index(drop=True)

In [None]:
new_data

In [None]:
new_data['customer_id'].value_counts() # total unique values

In [None]:
# Dropping the 'customer_id' column as all of its values are unique
new_data.drop('customer_id', axis = 1, inplace = True)

In [None]:
new_data.columns

In [None]:
new_data.head()

In [None]:
new_data['security_no'].value_counts() # total unique values

In [None]:
# Dropping the 'security_no' column as all of its values are unique
new_data.drop('security_no', axis = 1, inplace = True)

In [None]:
new_data.head(2)

In [None]:
new_data['Name'].value_counts() # total unique values

In [None]:
# Dropping the 'Name' column as all of its values are unique
new_data.drop('Name', axis = 1, inplace = True)

In [None]:
new_data.info()

In [None]:
new_data['age'].value_counts() # total unique values

In [None]:
plt.figure(figsize = (20,5))
sns.countplot(new_data['age'])

In [None]:
plt.figure(figsize = (20,5))
sns.countplot(new_data['age'], hue = new_data['Train'], )
#In training data, the count of different age groups are almost same i.e between 600-700
#  whereas, in testing data the count is between  300-400

In [None]:
new_data['churn_risk_score'].value_counts()

In [None]:
new_data['churn_risk_score'] = new_data['churn_risk_score'].replace(-1,1)

In [None]:
new_data['churn_risk_score'].value_counts()

In [None]:
plt.figure(figsize = (20,5))
sns.countplot(new_data['churn_risk_score'], hue = new_data['Train'])
# for the training data, majority customers has churn_risk_score 3 or greater than 3, which is bad
# Training data also has an outlier of churn_risk_score = -1

In [None]:
# Count of customers per gender. The column is passed as x= explicitly, because
# a positional first argument is bound to `data` in current seaborn releases.
sns.countplot(x = 'gender', data = new_data)
# The count for no of males and females is almost same. There are very few whose gender is unknown

In [None]:
# Gender counts split by churn_risk_score. The column is passed as x= explicitly,
# because a positional first argument is bound to `data` in current seaborn releases.
sns.countplot(x = 'gender', hue = 'churn_risk_score', data = new_data)
# For Training Data as Test Data has no churn_risk_score column

In [None]:
new_data['churn_risk_score'].value_counts()
# From this, we can make an assumption that, churn_rate_score > 3 means that there are very high chances of that customer
# unsubscribing from there services

In [None]:
sns.countplot(train_data['region_category'], hue = train_data['churn_risk_score']>3)
# In village region - around 2K out of 4700 people have churn rate > 3 i.e is around 43%
# In City region - around 7K out of 12700 have churn rate > 3 i.e is around 55%
# In Town Region - around 7500 out of 14128 have churn rate > 3 i.e is around 53%

In [None]:
train_data['region_category'].value_counts() # In training Data

In [None]:
new_data['membership_category'].value_counts()

In [None]:
sns.countplot(train_data['membership_category'])
plt.xticks(rotation = 90)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
train_data['membership_category'].value_counts().plot.pie(autopct='%1.1f%%')
# Around 23.7% of people have a Platinum or Premium Membership
# Around 41.7% of people have a Basic Membership or No Membership
# Around 34.6% of people have a Gold or Silver Membership

In [None]:
plt.figure(figsize = (10,5))
sns.countplot(new_data['membership_category'], hue = new_data['churn_risk_score'])
plt.xticks(rotation = 90)
plt.show()

#### Customers having No Membership or Basic Membership have HIGH churn_risk_score of 5

#### Customers having Platinum or Premium Membership have churn_risk_score 3 or less than 3

#### Customers having Gold or Silver Membership have churn_risk_score 3 or 4

#### From these observation, we can conclude that customers having No Membership or Basic Membership have very high chances of unsubscribing the services. Whereas, Customers having Platinum or Premium Membership have very low chances of unsubscribing the services. 

In [None]:
new_data.head()

In [None]:
new_data['joined_through_referral'].value_counts()

#### 'joined_through_referral' has some missing values('?'), so we will fill it with yes or no depending on whether they have a 'referral_id ' or not

In [None]:
train_data['referral_id'].value_counts()

In [None]:
# replacing xxxxxxxx by np.nan values
#new_data['referral_id'] = new_data['referral_id'].replace('xxxxxxxx', np.nan)

In [None]:
new_data.head()

In [None]:
new_data

#### Replacing the '?' in 'joined_through_referral' by checking if it has a 'referral_id' or not.
#### If a customer had a 'referral_id' then we replace '?' in 'joined_through_referral' by a 'Yes'

In [None]:
new_data['joined_through_referral'].value_counts()

In [None]:
new_data['referral_id'].value_counts()

In [None]:
# Resolve the '?' placeholders in 'joined_through_referral' row by row.
# A row with a real referral_id (anything other than the 'xxxxxxxx' placeholder)
# joined through a referral -> 'Yes'; a row without one -> 'No'.
# The masks are evaluated per row, so one matching customer no longer rewrites
# every '?' in the column.
unknown_referral = new_data['joined_through_referral'] == '?'
has_referral_id = new_data['referral_id'].notna() & (new_data['referral_id'] != 'xxxxxxxx')
new_data.loc[unknown_referral & has_referral_id, 'joined_through_referral'] = 'Yes'
new_data.loc[unknown_referral & ~has_referral_id, 'joined_through_referral'] = 'No'

In [None]:
 new_data.head()

In [None]:
new_data['joined_through_referral'].value_counts()
# now, we don't have any outlier in 'joined_through_referral' column.

In [None]:
new_data['joined_through_referral'].value_counts().plot.pie(autopct='%1.1f%%')
# About 57.6% people have joined through a referral.

In [None]:
new_data.head()

In [None]:
new_data['referral_id'].value_counts()

In [None]:
new_data.drop('referral_id', axis = 1, inplace = True)
# Dropping the 'referral_id' column as it had a lot of missing values, and the 'joined_through_referral' column
# already tells us whether the customer joined through a referral or not.

In [None]:
new_data['preferred_offer_types'].value_counts()

In [None]:
new_data['preferred_offer_types'].value_counts().plot.pie(autopct='%1.1f%%')
# There is equal percentages of customers enjoying different offers.

In [None]:
plt.figure(figsize = (15, 5))
sns.countplot(train_data['preferred_offer_types'], hue = train_data['churn_risk_score'])
# Customers without offers have a slightly higher chances of having 'churn_risk_score' >=3

In [None]:
new_data['region_category'].value_counts().plot.pie(autopct='%1.1f%%')
# Around 45% of subscribers are from Town, 40% from City, and 15% from Village

In [None]:
plt.figure(figsize = (15,5))
sns.countplot(new_data['membership_category'], hue = new_data['region_category'])
# In every 'membership_category', majority subscribers are from Town and City.

In [None]:
print(new_data['medium_of_operation'].value_counts())
print('\n internet_option: \n',new_data['internet_option'].value_counts())

In [None]:
new_data['medium_of_operation'].value_counts().keys()

In [None]:
medium_of_operation_list = new_data['medium_of_operation'].value_counts().keys()
medium_of_operation_list

In [None]:
new_data.head(30)

In [None]:
new_data['internet_option'].value_counts().keys()

In [None]:

# Collect every customer's medium_of_operation into one list per internet option.
# Rows whose internet option is not one of the three known values are skipped.
mediums_by_option = {'Wi-Fi': [], 'Fiber_Optic': [], 'Mobile_Data': []}

for medium, option in zip(new_data.medium_of_operation, new_data.internet_option):
    bucket = mediums_by_option.get(option)
    if bucket is not None:
        bucket.append(medium)

wifi = mediums_by_option['Wi-Fi']
fibrop = mediums_by_option['Fiber_Optic']
mobiledt = mediums_by_option['Mobile_Data']

In [None]:
def count_fun(lst, name):
    """Return how many times ``name`` occurs in ``lst``.

    The counting is delegated to the container's own ``count`` method.
    """
    occurrences = lst.count(name)
    return occurrences

In [None]:
# For each internet option, print how often every medium_of_operation value
# occurs: ['Desktop', 'Smartphone', '?', 'Both']
usage_by_option = (
    ('wifi Users : \n', wifi),
    ('\nMobile Data Users : \n', mobiledt),
    ('\nFibre Optics Users : \n', fibrop),
)

for heading, lst in usage_by_option:
    print(heading)
    for name in ('Desktop', 'Smartphone', '?', 'Both'):
        print('{} has occurred {} times'.format(name, count_fun(lst, name)))

#### For each 'internet_option' we checked which 'medium_of_operation' is used the most.
#### For customers whose 'medium_of_operation' = '?', we will replace '?' with the 'medium_of_operation' most used for their 'internet_option'.

In [None]:
# medium_of_operation -  ['Desktop', 'Smartphone', '?', 'Both']
# internet_option - ['Wi-Fi', 'Fiber_Optic', 'Mobile_Data']

#DataFrame['column_name'] = numpy.where(condition, new_value, DataFrame.column_name)

#### The majority of people with 'internet_option' = 'Wi-Fi' have 'medium_of_operation' = 'Desktop', so we replace '?' with 'Desktop' in the table
#### The majority of people with 'internet_option' = 'Fiber_Optic' have 'medium_of_operation' = 'Smartphone', so we replace '?' with 'Smartphone' in the table
#### The majority of people with 'internet_option' = 'Mobile_Data' have 'medium_of_operation' = 'Smartphone', so we replace '?' with 'Smartphone' in the table

In [None]:
sns.countplot(new_data['internet_option'], hue = new_data['medium_of_operation'])

In [None]:
new_data[(new_data['medium_of_operation'] == '?') & (new_data['internet_option'] == 'Wi-Fi')].shape

In [None]:
# Replace the '?' placeholder in 'medium_of_operation' with the value chosen
# for each internet option; all other rows keep their current value.
fill_by_option = {
    'Wi-Fi': 'Desktop',
    'Fiber_Optic': 'Smartphone',
    'Mobile_Data': 'Smartphone',
}

for option, medium in fill_by_option.items():
    is_unknown = new_data['medium_of_operation'] == '?'
    uses_option = new_data['internet_option'] == option
    condt = is_unknown & uses_option
    new_data['medium_of_operation'] = np.where(condt, medium, new_data['medium_of_operation'])

In [None]:
new_data['medium_of_operation'].value_counts()

In [None]:
new_data.head(20)

In [None]:
new_data['medium_of_operation'].value_counts().plot.pie(autopct='%1.1f%%')

#### Approx 47.2% of the subscribers have 'medium_of_operation' == Smartphones
#### Approx 42.5% of the subscribers have 'medium_of_operation' == Desktop
#### Approx 10.4% of the subscribers have 'medium_of_operation' == Both

In [None]:
new_data['churn_risk_score'].value_counts().plot.pie(autopct='%1.1f%%')

In [None]:
sns.countplot(new_data['medium_of_operation'], hue = new_data['churn_risk_score'])

In [None]:
new_data['internet_option'].value_counts().plot.pie(autopct='%1.1f%%')

#### Equal % of people have internet options as Wi-Fi, Fiber Optics and Mobile Data

In [None]:
sns.countplot(new_data['membership_category'], hue = new_data['medium_of_operation'])
plt.xticks(rotation = 90)
plt.show()

In [None]:
new_data.head(30)

In [None]:
new_data['avg_time_spent'].isnull().any()

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['avg_time_spent'])
plt.xlim(-1000,1000)

In [None]:
new_data.loc[new_data['avg_time_spent'] < 0].shape

#### There are 2650 rows, having 'avg_time_spent' < 0

In [None]:
new_data.loc[new_data['avg_time_spent'] < 0]

In [None]:
new_data['avg_time_spent'].dtype

In [None]:
# Print the magnitude of every negative 'avg_time_spent' entry.
# The rows are selected numerically: looking for '-' in the string form would
# also match the exponent of small positive values such as 1e-05.
for val in new_data.loc[new_data['avg_time_spent'] < 0, 'avg_time_spent']:
    print(abs(val))
    

In [None]:
def change_sign(val):
    """Return the magnitude of a negative value; pass anything else through.

    The sign is checked numerically rather than by looking for '-' in the
    string form, so values printed in scientific notation (for example 1e-05
    or -1e-05) are handled correctly.

    Parameters
    ----------
    val : number or numeric string
        The value to normalise.

    Returns
    -------
    float
        ``abs(val)`` as a float when ``val`` is negative.
    object
        ``val`` itself, unchanged, when it is non-negative, NaN, or cannot be
        interpreted as a number.
    """
    try:
        number = float(val)
    except (TypeError, ValueError):
        # Not numeric at all: leave the value untouched.
        return val
    if number < 0:
        return abs(number)
    # NaN compares False against 0 and therefore also ends up here.
    return val

In [None]:
new_data['avg_time_spent'] = new_data['avg_time_spent'].apply(change_sign)

In [None]:
new_data['avg_time_spent'].dtype

In [None]:
new_data.loc[new_data['avg_time_spent'] < 0].shape
# all the negative values are handled

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['avg_time_spent'])
plt.xlim(-1000,1000)

In [None]:
new_data.head(30)

#### As we can see, there are negative values, which is not possible because time spent cannot be negative - OUTLIERS
#### We can either drop the rows having negative values or remove the '-' sign

In [None]:
new_data['days_since_last_login'].isnull().any()

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['days_since_last_login'])
plt.xlim(-20,30)

#### As we can see that, there are negative values , which cannot be possible as days cannot be in negative - OUTLIERS
#### Maximum people have 'days_since_last_login' between 5-25 days

In [None]:
new_data.loc[new_data['days_since_last_login'] < 0].shape

In [None]:
# Negative 'days_since_last_login' values among the test rows only.
# Filter the combined frame on its own Train flag: a mask built from new_data
# must not be applied to test_data, because the two frames do not share row labels.
new_data.loc[(new_data['Train'] == 0) & (new_data['days_since_last_login'] < 0)].shape

In [None]:
#new_data['days_since_last_login'].mean()
#condt = (new_data['medium_of_operation'] == '?') & (new_data['internet_option'] == 'Mobile_Data')
#new_data['medium_of_operation'] = np.where((condt), 'Smartphone', new_data['medium_of_operation'])

#### There are 3021 rows, having 'days_since_last_login' < 0

In [None]:
condt = new_data['days_since_last_login'] < 0
condt

In [None]:
new_data['days_since_last_login'] = np.where((condt), np.nan, new_data['days_since_last_login'])

In [None]:
new_data.loc[new_data['days_since_last_login'] < 0].shape

In [None]:
# Re-check the test rows (Train == 0) of the combined frame for negative values.
# The mask and the frame it indexes come from the same DataFrame, so rows line up.
new_data.loc[(new_data['Train'] == 0) & (new_data['days_since_last_login'] < 0)].shape

In [None]:
loc = new_data.loc[new_data['days_since_last_login'] < 0]

In [None]:
# df.drop(df[df['Age'] < 25].index, inplace = True)
#new_data = new_data.drop(new_data.loc[new_data['days_since_last_login'] < 0].index)

In [None]:
new_data.shape

In [None]:
new_data.info()

In [None]:
plt.figure(figsize = (10,5))
sns.distplot(new_data['days_since_last_login'])
plt.xlim(-10,30)

In [None]:
new_data.head()

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['avg_transaction_value'])

#### Majority 'avg_transaction_value' is between 1k - 50k

In [None]:
new_data['avg_transaction_value'].isna().any()

In [None]:
new_data.head()

In [None]:
dt = new_data.copy()

In [None]:
dt.shape

In [None]:
dt = dt.drop(new_data.loc[new_data['avg_frequency_login_days'] == 'Error'].index)

In [None]:
dt.shape

In [None]:
#plt.figure(figsize = (15,5))
#sns.distplot(new_data['avg_frequency_login_days'])

In [None]:
dt['avg_frequency_login_days'].mode()

In [None]:
dt['avg_frequency_login_days'].value_counts()

In [None]:
new_data['avg_frequency_login_days']

####  We can replace the 'Error' term in the 'avg_frequency_login_days' column by mode of 'avg_frequency_login_days' i.e 17

In [None]:
new_data.loc[new_data['avg_frequency_login_days'] == 'Error'].shape

In [None]:
new_data.loc[new_data['avg_frequency_login_days'] == 'Error']

In [None]:
#new_data['avg_frequency_login_days'] = new_data['avg_frequency_login_days'].replace('Error', 17)

In [None]:
new_data['avg_frequency_login_days'] = new_data['avg_frequency_login_days'].replace('Error', np.nan)

In [None]:
new_data.loc[new_data['avg_frequency_login_days'] == 'Error'].shape

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['avg_frequency_login_days'])

#### 'avg_frequency_login_days' has negative values.

In [None]:
new_data['avg_frequency_login_days'] = new_data['avg_frequency_login_days'].astype(float)

In [None]:
new_data.loc[new_data['avg_frequency_login_days'] < 0 ].shape

In [None]:
condt = new_data['avg_frequency_login_days'] < 0 
new_data['avg_frequency_login_days'] = np.where((condt), np.nan, new_data['avg_frequency_login_days'])

In [None]:
new_data.loc[new_data['avg_frequency_login_days']<0].shape

In [None]:
new_data['avg_frequency_login_days'].dtype

In [None]:
new_data['avg_frequency_login_days'].isnull().sum()

In [None]:
new_data['avg_frequency_login_days'] = new_data['avg_frequency_login_days'].astype(float)

In [None]:
new_data['avg_frequency_login_days'].dtype

In [None]:
new_data.head()

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['points_in_wallet'])

In [None]:
new_data[new_data['points_in_wallet'] < 0].shape

#### 'points_in_wallet' has only a few negative values, so we replace them with np.nan

In [None]:
condt = new_data['points_in_wallet'] < 0
condt

In [None]:
new_data['points_in_wallet'] = np.where((condt), np.nan, new_data['points_in_wallet'])

In [None]:
new_data[new_data['points_in_wallet'] < 0].shape

In [None]:
new_data.info()

In [None]:
#new_data = new_data.drop(new_data.loc[new_data['points_in_wallet'] < 0].index)
#new_data[new_data['points_in_wallet'] < 0].shape

In [None]:
plt.figure(figsize = (15,5))
sns.distplot(new_data['points_in_wallet'])

#### Majority people have 500-900 'points_in_wallet'

In [None]:
new_data['points_in_wallet'].isnull().sum()

In [None]:
from sklearn.impute import KNNImputer

In [None]:
new_data.columns

In [None]:
#imputer = KNNImputer(n_neighbors=3, missing_values=np.nan)

In [None]:
new_data['points_in_wallet'].dtype

In [None]:
new_data['used_special_discount'].value_counts().plot.pie(autopct='%1.1f%%')

#### 55% of the customer uses special discounts offered

In [None]:
sns.countplot(train_data['used_special_discount'], hue = train_data['churn_risk_score'])

In [None]:
new_data['used_special_discount'].value_counts()

In [None]:
new_data.info()

In [None]:
new_data['past_complaint'].value_counts()

In [None]:
new_data['past_complaint'].value_counts().plot.pie(autopct='%1.1f%%')

In [None]:
new_data['past_complaint'].unique()

In [None]:
new_data['past_complaint'].isnull().sum()

In [None]:
plt.figure(figsize = (15,5))
sns.countplot(new_data['past_complaint'], hue = new_data['churn_risk_score'])

In [None]:
new_data

In [None]:
new_data['complaint_status'].unique()

In [None]:
new_data['complaint_status'].value_counts()

#### For all the customers whose 'past_complaint' value is 'No', their 'complaint_status' value is 'Not Applicable'
#### For all the customers whose 'past_complaint' value is 'Yes', their 'complaint_status' value is either of [Solved in Follow-up, No Information Available, Unsolved, Solved]

In [None]:
plt.figure(figsize = (7, 7)) 
new_data['complaint_status'].value_counts().plot.pie(autopct = '%1.1f%%')

In [None]:
plt.figure(figsize = (15, 5)) 
sns.countplot(new_data['complaint_status'], hue = new_data['churn_risk_score'])

In [None]:
new_data['feedback'].value_counts()

In [None]:
new_data['feedback'].unique()

In [None]:
plt.figure(figsize = (8,8))
new_data['feedback'].value_counts().plot.pie(autopct = '%1.1f%%')

In [None]:
plt.figure(figsize = (15,5))
sns.countplot(new_data['feedback'], hue = new_data['churn_risk_score'])
plt.xticks(rotation = 90)
plt.show()

#### The feedback values ['Products always in Stock', 'Quality Customer Care', 'User Friendly Website', 'Reasonable Price'] together account for 15.1% of the customers. Customers who gave this feedback have a 'churn_risk_score' value < 3.
#### The feedback values ['Poor Website', 'No reason specified', 'Poor Product Quality', 'Poor Customer Service', 'Too many ads'] together account for 84.9% of the customers. Customers who gave this feedback have a 'churn_risk_score' value >= 3.
#### From this we can observe that only about 15% of the customers have given positive feedback, while the other 85% have given negative feedback

In [None]:
new_data

### Before handling missing values, we will handle the categorical columns 
##### Use Label Encoder to deal with the Categorical Data

In [None]:
new_data['gender'].value_counts()

In [None]:
new_data['gender'] = new_data['gender'].map({'F' : 0, 'M' : 1, 'Unknown' : 2})

In [None]:
new_data['gender'].value_counts()

In [None]:
new_data['region_category'].value_counts()

In [None]:
new_data['region_category'] = new_data['region_category'].map({'Town' : 0, 'City' : 1, 'Village' : 2})

In [None]:
new_data['region_category'].value_counts()

In [None]:
new_data['membership_category'].value_counts()

In [None]:
new_data['membership_category'] = new_data['membership_category'].map({'Basic Membership' : 0, 
                                                                       'No Membership' : 1, 
                                                                       'Gold Membership' : 2,
                                                                       'Silver Membership' : 3,
                                                                       'Premium Membership' : 4,
                                                                       'Platinum Membership' : 5})

In [None]:
new_data['membership_category'].value_counts()

In [None]:
new_data['joined_through_referral'].value_counts()

In [None]:
new_data['joined_through_referral'] = new_data['joined_through_referral'].map({'Yes' : 0, 'No' : 1})

In [None]:
new_data['joined_through_referral'].value_counts()

In [None]:
new_data['preferred_offer_types'].value_counts()

In [None]:
new_data['preferred_offer_types'] = new_data['preferred_offer_types'].map({'Credit/Debit Card Offers' : 0, 
                                                                           'Gift Vouchers/Coupons' : 1, 
                                                                           'Without Offers' : 2})

In [None]:
new_data['preferred_offer_types'].value_counts()

In [None]:
new_data['medium_of_operation'].value_counts()

In [None]:
new_data['medium_of_operation'] = new_data['medium_of_operation'].map({'Smartphone' : 0, 
                                                                           'Desktop' : 1, 
                                                                           'Both' : 2})

In [None]:
new_data['medium_of_operation'].value_counts()

In [None]:
new_data['internet_option'].value_counts()

In [None]:
new_data['internet_option'] = new_data['internet_option'].map({'Fiber_Optic' : 0,'Wi-Fi' : 1,'Mobile_Data' : 2})

In [None]:
new_data['internet_option'].value_counts()

In [None]:
print(new_data['used_special_discount'].value_counts())
print(new_data['offer_application_preference'].value_counts())
print(new_data['past_complaint'].value_counts())

In [None]:
new_data['used_special_discount'] = new_data['used_special_discount'].map({'Yes' : 0,'No' : 1})
new_data['offer_application_preference'] = new_data['offer_application_preference'].map({'Yes' : 0,'No' : 1})
new_data['past_complaint'] = new_data['past_complaint'].map({'Yes' : 0,'No' : 1})

In [None]:
print(new_data['used_special_discount'].value_counts())
print(new_data['offer_application_preference'].value_counts())
print(new_data['past_complaint'].value_counts())

In [None]:
new_data['complaint_status'].value_counts()

In [None]:
new_data['complaint_status'] = new_data['complaint_status'].map({'Not Applicable' : 0,'Solved in Follow-up' : 1, 
                                                              'No Information Available' : 2,'Unsolved' : 3, 'Solved' : 4})

In [None]:
new_data['complaint_status'].value_counts()

In [None]:
new_data['feedback'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
new_data['feedback'] = le.fit_transform(new_data['feedback'])

In [None]:
new_data['feedback'].value_counts()

In [None]:
new_data.info()

In [None]:
new_data.head()

In [None]:
def splt_fun_year(val):
    """Return the first '-'-separated field of ``val`` (the year) as a string."""
    return val.partition('-')[0]


def splt_fun_month(val):
    """Return the second '-'-separated field of ``val`` (the month) as a string."""
    return val.split('-')[1]


def splt_fun_day(val):
    """Return the third '-'-separated field of ``val`` (the day) as a string."""
    return val.split('-')[2]

In [None]:
new_data['joining_date_year'] = new_data['joining_date'].apply(splt_fun_year)
new_data['joining_date_month'] = new_data['joining_date'].apply(splt_fun_month)
new_data['joining_date_day'] = new_data['joining_date'].apply(splt_fun_day)

In [None]:
new_data.drop('joining_date', axis = 1, inplace = True)

In [None]:
new_data.head()

In [None]:
new_data['joining_date_year'] = new_data['joining_date_year'].astype(int)
new_data['joining_date_month'] = new_data['joining_date_month'].astype(int)
new_data['joining_date_day'] = new_data['joining_date_day'].astype(int)

In [None]:
new_data.info()

In [None]:
new_data['last_visit_time']

In [None]:
def splt_fun_hr(val):
    """Return the first ':'-separated field of ``val`` (the hour) as a string."""
    return val.partition(':')[0]


def splt_fun_min(val):
    """Return the second ':'-separated field of ``val`` (the minute) as a string."""
    return val.split(':')[1]

In [None]:
new_data['last_visit_hour'] = new_data['last_visit_time'].apply(splt_fun_hr)
new_data['last_visit_min'] = new_data['last_visit_time'].apply(splt_fun_min)

In [None]:
new_data.drop('last_visit_time',axis = 1, inplace = True)

In [None]:
new_data['last_visit_hour'] = new_data['last_visit_hour'].astype(int)
new_data['last_visit_min'] = new_data['last_visit_min'].astype(int)

In [None]:
new_data.head()

In [None]:
new_data.info()

### Now, handling missing values using KNN Imputer

In [None]:
from sklearn.impute import KNNImputer

In [None]:
imputer = KNNImputer(n_neighbors=3, missing_values=np.nan)

In [None]:
# Keep an independent snapshot of the encoded frame. A plain assignment would
# only create a second name for the same object, so later changes to new_data
# would show up in the "saved" frame too.
saved_data = new_data.copy()
#saved_data

In [None]:
#new_data = saved_data

In [None]:
new_data.loc[new_data['churn_risk_score'].notnull()]

In [None]:
train_data = new_data.loc[new_data['churn_risk_score'].notnull()]

In [None]:
train_data.shape

In [None]:
train_data.isna().sum()

In [None]:
#encode_data = pd.DataFrame(np.round(imputer.fit_transform(new_data)),
#                           columns = new_data.columns.drop(['churn_risk_score', 'Train']))

#encode_data = pd.DataFrame(np.round(imputer.fit_transform(new_data[['region_category', 'preferred_offer_types', 'points_in_wallet']])),
#                           columns = lst)

# Fill the missing values of the training rows with the KNN imputer and rebuild
# a DataFrame with the original column names (the result gets a fresh 0..n-1 index).
# np.round is applied to every column of the imputed array, not only the imputed cells.
# NOTE(review): train_data still contains 'churn_risk_score' and 'Train' at this point,
# so both are part of the imputer's input - confirm that this is intended.
encode_data_train = pd.DataFrame(np.round(imputer.fit_transform(train_data)),
                           columns = train_data.columns)

In [None]:
encode_data_train.isnull().sum()

In [None]:
train_data = encode_data_train
train_data.shape

In [None]:
train_data.drop('Train', axis = 1, inplace = True)

In [None]:
train_data

In [None]:
#new_data = saved_data

In [None]:
new_data.loc[new_data['churn_risk_score'].isnull()]

In [None]:
# Rows without a churn_risk_score form the test set. Take an explicit copy so
# that the in-place column drops performed on test_data afterwards operate on an
# independent frame instead of a slice of new_data.
test_data = new_data.loc[new_data['churn_risk_score'].isnull()].copy()

In [None]:
print(train_data.shape, test_data.shape)

In [None]:
test_data.isna().sum()

In [None]:
test_data.drop('churn_risk_score', axis=1, inplace = True)

In [None]:
test_data.drop('Train', axis=1, inplace = True)

In [None]:
test_data.info()

In [None]:
# Fill the missing values of the test rows with the KNN imputer and rebuild a
# DataFrame with the original column names; np.round is applied to every column.
# NOTE(review): fit_transform re-fits the imputer on the test rows themselves rather
# than reusing the fit from the training rows - confirm that this is intended.
encode_data_test = pd.DataFrame(np.round(imputer.fit_transform(test_data)),
                           columns = test_data.columns)

In [None]:
encode_data_test.isnull().sum()

In [None]:
test_data = encode_data_test
test_data.shape

In [None]:
print(train_data.shape, test_data.shape)

In [None]:
plt.figure(figsize = (25,10))
sns.heatmap(train_data.corr(), annot = True)

#### We can see that a few independent variables are strongly correlated with each other (absolute correlation > 0.7), so we will drop one feature from each such pair

In [None]:
new_data.corr()['churn_risk_score']

In [None]:
plt.figure(figsize = (25,10))
sns.heatmap(test_data.corr(), annot = True)

In [None]:
#test_data.drop(['offer_application_preference', 'complaint_status'], axis = 1, inplace = True)
#test_data.drop(['used_special_discount', 'complaint_status'], axis = 1, inplace = True)

In [None]:
plt.figure(figsize = (25,10))
sns.heatmap(train_data.corr(), annot = True)

In [None]:
train_data

In [None]:
train_data.corr()['churn_risk_score']

In [None]:
# Snapshot both frames before columns are dropped in place. Copies are needed
# here: plain assignments would alias the same objects, and the in-place drops
# that follow would then alter the "saved" frames as well.
saved_train_data = train_data.copy()
saved_test_data = test_data.copy()

In [None]:
#train_data = saved_train_data
#test_data = saved_test_data

#### Now, Our data is ready for training. So, we will split the training and testing data from the new_data

In [None]:
train_data

In [None]:
train_data.drop(['last_visit_hour','last_visit_min'], axis = 1, inplace = True)
test_data.drop(['last_visit_hour','last_visit_min'], axis = 1, inplace = True)

In [None]:
train_data.drop(['used_special_discount','complaint_status'], axis = 1, inplace = True)
test_data.drop(['used_special_discount','complaint_status'], axis = 1, inplace = True)

In [None]:
print(train_data.shape, test_data.shape)

## Model Training

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

In [None]:
x = train_data.drop('churn_risk_score', axis = 1)
x

In [None]:
y = train_data['churn_risk_score']
y

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state = 101, test_size = .2)

In [None]:
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

In [None]:
new_data.corr()['churn_risk_score']

### XGBoost Classifier

In [None]:
# XGBoost classifier with library defaults apart from a fixed random_state,
# fitted on the training split.
# NOTE(review): ytrain holds the raw churn_risk_score values; recent XGBoost releases
# reportedly expect class labels encoded as 0..n_classes-1 - confirm against the
# installed version.
xgb_classifier = XGBClassifier(random_state = 100)
xgb_classifier.fit(xtrain, ytrain)

In [None]:
#xgb_classifier.get_params() #default paramters

In [None]:
xgb_ypred = xgb_classifier.predict(xtest)

In [None]:
from sklearn.metrics import f1_score

In [None]:
score = 100 * f1_score(ytest, xgb_ypred, average="macro")
score

### Random Forest Classifier

In [None]:
rf_classifier = RandomForestClassifier(criterion= 'entropy',
                                       n_estimators = 90,random_state=100)
rf_classifier.fit(xtrain, ytrain)

In [None]:
#rf_classifier.get_params()

In [None]:
rf_ypred = rf_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, rf_ypred, average="macro")
score

### KNN

##### Performing Standard Scaling for Distance-Based Algorithms

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
sc = StandardScaler()

In [None]:
xtrain_sc = sc.fit_transform(xtrain)
xtrain_sc

In [None]:
xtest_sc = sc.transform(xtest)

In [None]:
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(xtrain_sc,ytrain)

In [None]:
knn_classifier.get_params()

In [None]:
knn_ypred = knn_classifier.predict(xtest_sc)

In [None]:
score = 100 * f1_score(ytest, knn_ypred, average="macro")
score

In [None]:
pip install statsmodels

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
 

# Variance inflation factor of every column of the standardised training
# matrix, listed next to the matching feature name.
feature_count = xtrain_sc.shape[1]
vif = pd.DataFrame({
    "vif": [variance_inflation_factor(xtrain_sc, col) for col in range(feature_count)],
    "Features": x.columns,
})

# let's check the values
vif

### SVM

In [None]:
svm_classifier = SVC(kernel='rbf', random_state = 10)
svm_classifier.fit(xtrain_sc, ytrain)

In [None]:
svm_ypred = svm_classifier.predict(xtest_sc)

In [None]:
score = 100 * f1_score(ytest, svm_ypred, average="macro")
score

### LGBM Classifier

In [None]:
pip install lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# Baseline LightGBM model.
# "multiclass" is LightGBM's documented name for the softmax multi-class
# objective; the previous value "multi" is not a listed objective or alias.
lgb_classifier = lgb.LGBMClassifier(
    objective='multiclass',
    learning_rate=0.15,
    n_estimators=100,
    random_state=1,
    n_jobs=-1,
)

In [None]:
#lgb_classifier.get_params()

In [None]:
lgb_classifier.fit(xtrain, ytrain)

In [None]:
lgb_ypred = lgb_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, lgb_ypred, average="macro")
score

### Cat Boost

In [None]:
pip install catboost

In [None]:
import catboost as cb

In [None]:
cb_classifier = cb.CatBoostClassifier(verbose=0, iterations=100)

In [None]:
cb_classifier.get_params()

In [None]:
cb_classifier.fit(xtrain, ytrain)

In [None]:
cb_ypred = cb_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, cb_ypred, average="macro")
score

### Hyperparameter Tuning

### XGB

In [None]:
# NOTE(review): `trial` is not assigned anywhere in the visible part of this
# notebook; if it is not defined earlier this cell raises NameError — confirm
# or remove.
trial

In [None]:
from scipy.stats import randint as sp_randint

# Search space for the XGBoost randomised search.
# An earlier, wider variant sampled max_depth from sp_randint(5, 200) and
# n_estimators from sp_randint(10, 300); every other entry matched the ones below.
param_grid = dict(
    learning_rate=[0.5, 0.1, 0.01, 0.001, 1],
    max_depth=sp_randint(5, 100),
    n_estimators=sp_randint(10, 100),  # [100,200,250,300,350],
    min_child_weight=sp_randint(1, 20),
    gamma=[0.1, 0.2, 0.3],
    subsample=np.arange(0.5, 1, 0.1),
)

In [None]:
# Randomised search over the XGBoost space: 10 sampled candidates, 10-fold CV,
# scored by macro F1.
# random_state is pinned (as in the random-forest and CatBoost searches) so the
# sampled candidates, and therefore best_params_, are reproducible between runs.
grid_xgb = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1_macro',
    cv=10,
    random_state=100,
    n_jobs=-1,
    verbose=3,
)

In [None]:
grid_xgb.fit(xtrain, ytrain)

In [None]:
best_parameters = grid_xgb.best_params_
print(best_parameters)

In [None]:
# Tuned XGBoost model with hard-coded hyperparameters (presumably copied from an
# earlier search result — the values match the dict noted below):
# {'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 149, 'min_child_weight': 10, 'n_estimators': 103, 'subsample': 0.7}
tuned_xgb_params = {
    'n_estimators': 103,
    'min_child_weight': 10,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'max_depth': 149,
    'subsample': 0.7,
    # 'objective': 'multi:softprob',
}
new_xgb_classifier = XGBClassifier(**tuned_xgb_params)

In [None]:
new_xgb_classifier.fit(xtrain, ytrain)

In [None]:
new_xgb_ypred = new_xgb_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, new_xgb_ypred, average="macro")
score

### Random Forest Classifier

In [None]:
import random
from scipy.stats import randint as sp_randint

# Search space for the random-forest randomised search.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features); it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where any candidate that
# samples it fails parameter validation.
param_grid = {
    'n_estimators': sp_randint(50, 500),  # [90,100,115,130],
    'criterion': ['gini', 'entropy'],
    'max_depth': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 10),
    'min_samples_split': sp_randint(2, 10),
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Randomised search over the random-forest space: 12 sampled candidates,
# 10-fold CV, scored by macro F1, with a fixed sampling seed.
rf_search_options = dict(
    estimator=rf_classifier,
    param_distributions=param_grid,
    n_jobs=-1,
    verbose=3,
    cv=10,
    random_state=100,
    n_iter=12,
    scoring='f1_macro',  # 'neg_mean_absolute_error'
)
grid_rf = RandomizedSearchCV(**rf_search_options)

In [None]:
grid_rf.fit(xtrain, ytrain)

In [None]:
best_parameters = grid_rf.best_params_
print(best_parameters)

In [None]:
# Tuned random forest with hard-coded hyperparameters.
# random_state is pinned (same seed as the baseline forest) because bootstrap
# sampling and feature sub-sampling are otherwise re-seeded on every run, which
# makes the reported macro-F1 score non-reproducible.
new_rf_classifier = RandomForestClassifier(
    n_estimators=217,
    criterion='entropy',
    max_depth=16,
    min_samples_split=9,
    min_samples_leaf=1,
    max_features='log2',
    class_weight='balanced_subsample',
    random_state=100,
)

In [None]:
new_rf_classifier.fit(xtrain, ytrain)

In [None]:
new_rg_ypred = new_rf_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, new_rg_ypred, average="macro")
score

### Light Gradient Boosting

In [None]:
lgb_classifier.get_params()

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Search space for the LightGBM randomised search.
# Candidates tried but currently disabled:
#   'max_depth': sp_randint(5,100)
#   'n_estimators': sp_randint(10,300,50)
param_grid = dict(
    num_leaves=sp_randint(6, 50),
    min_child_samples=sp_randint(10, 300),
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100, 150, 200],
)

In [None]:
# Randomised search over the LightGBM space: 10 sampled candidates, 10-fold CV,
# scored by macro F1.
# random_state is pinned (as in the random-forest and CatBoost searches); the
# estimator's own seed does not control which candidates the search samples.
grid_lgb = RandomizedSearchCV(
    estimator=lgb_classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1_macro',  # 'neg_mean_absolute_error'
    cv=10,
    random_state=100,
    n_jobs=-1,
    verbose=3,
)

In [None]:
grid_lgb.fit(xtrain, ytrain)

In [None]:
best_parameters = grid_lgb.best_params_
print(best_parameters)

In [None]:
# best
# The triple-quoted block below is an inert string literal, not executed code:
# it preserves an earlier parameter set that the author marked as "best"
# (the same values are used inside lgb_function).
"""new_lgb_classifier = lgb.LGBMClassifier(colsample_bytree=0.9640178917734248,
                                           min_child_samples=149,
                                           min_child_weight=0.001,
                                           num_leaves = 20,
                                           reg_alpha = 50,
                                           reg_lambda = 100,
                                           subsample = 0.34289945963435475)""" # best

# Parameter set actually used to build the tuned LightGBM model in this cell.
new_lgb_classifier = lgb.LGBMClassifier(colsample_bytree=0.6891239292918483,
                                           min_child_samples=295,
                                           min_child_weight=10,
                                           num_leaves = 6,
                                           reg_alpha = 10,
                                           reg_lambda = 150,
                                           subsample = 0.8721158617161302,
                                            #max_depth = 53
                                        )

# Fit on the training split, predict the hold-out split and report the
# macro-averaged F1 as a percentage (the bare `score` displays it).
new_lgb_classifier.fit(xtrain, ytrain)
new_lgb_ypred = new_lgb_classifier.predict(xtest)
score = 100 * f1_score(ytest, new_lgb_ypred, average="macro")
score

In [None]:
#new_lgb_classifier.fit(xtrain, ytrain)

In [None]:
#new_lgb_ypred = new_lgb_classifier.predict(xtest)

In [None]:
#score = 100 * f1_score(ytest, new_lgb_ypred, average="macro")
#score

### CatBoost

In [None]:
cb_classifier.get_params()

In [None]:
# Search space for the CatBoost randomised search.
# Earlier candidates, currently disabled:
#   'depth': [4, 7, 10]
#   'learning_rate' : [0.03, 0.1, 0.15]
#   'n_estimators': range(50,300,50)
# (iterations, n_estimators, num_boost_round, num_trees)
param_grid = dict(
    l2_leaf_reg=[1, 4, 9],
    iterations=range(50, 300, 50),
    # added
    learning_rate=np.arange(0.03, 0.1, 0.05),
    max_depth=np.arange(3, 15, 1),
    colsample_bylevel=np.arange(0.3, 0.8, 0.1),
)

In [None]:
# Randomised search over the CatBoost space: 12 sampled candidates, 10-fold CV,
# scored by macro F1, with a fixed sampling seed.
cb_search_options = dict(
    estimator=cb_classifier,
    param_distributions=param_grid,
    n_jobs=-1,
    verbose=3,
    cv=10,
    random_state=100,
    n_iter=12,
    scoring='f1_macro',  # 'neg_mean_absolute_error'
)
grid_cb = RandomizedSearchCV(**cb_search_options)

In [None]:
grid_cb.fit(xtrain, ytrain)

In [None]:
best_parameters = grid_cb.best_params_
print(best_parameters)

In [None]:
"""new_cb_classifier = cb.CatBoostClassifier(depth = 4,
                                         learning_rate = 0.1,
                                         l2_leaf_reg = 4,
                                         iterations = 300)""" # best

# CatBoost model built from the hyperparameter set currently in use.
tuned_cb_params = {
    'max_depth': 5,
    'learning_rate': 0.08,
    'l2_leaf_reg': 4,
    'iterations': 100,
    'colsample_bylevel': 0.7000000000000002,
}
new_cb_classifier = cb.CatBoostClassifier(**tuned_cb_params)

In [None]:
new_cb_classifier.fit(xtrain, ytrain)

In [None]:
new_cb_ypred = new_cb_classifier.predict(xtest)

In [None]:
score = 100 * f1_score(ytest, new_cb_ypred, average="macro")
score

### Clustering Approach

In [None]:
pip install kneed

In [None]:
from sklearn.cluster import KMeans
from kneed import KneeLocator

In [None]:
# Finding Optimal K Value
# Cluster on the predictor columns only: the target is not available for unseen
# data, and leaving it in lets the label itself shape the clusters. 'Clusters'
# is excluded as well so the cell can be re-run after that column was added.
kmeans_input = train_data.drop(columns=['churn_risk_score', 'Clusters'], errors='ignore')

k_values = range(1, 15)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(kmeans_input)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares for this k

plt.plot(k_values, wcss)  # creating the graph between WCSS and the number of clusters
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

In [None]:
kn = KneeLocator(range(1, 15), wcss, curve='convex', direction='decreasing')

In [None]:
no_of_clusters = kn.knee
no_of_clusters

In [None]:
# Final clustering with the K chosen by the knee locator.
# Fit on the predictor columns only: the target does not exist for unseen data,
# so clusters built with it cannot be reproduced at prediction time. 'Clusters'
# is excluded too so re-running the cell does not cluster on its own output.
kmeans = KMeans(n_clusters=no_of_clusters, init='k-means++', random_state=1)
y_kmeans = kmeans.fit_predict(
    train_data.drop(columns=['churn_risk_score', 'Clusters'], errors='ignore')
)

In [None]:
y_kmeans

In [None]:
train_data['Clusters'] = y_kmeans

In [None]:
list_of_clusters = train_data['Clusters'].unique()
list_of_clusters

In [None]:
train_data

In [None]:
import numpy as np

In [None]:
def xgb_function(xtrain, ytrain):
    """Fit an XGBoost classifier with a fixed hyperparameter set and return it.

    Args:
        xtrain: Feature data passed to ``fit``.
        ytrain: Target labels passed to ``fit``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    hyperparams = {
        'n_estimators': 50,
        'min_child_weight': 7,
        'gamma': 0.1,
        'learning_rate': 0.5,
        'max_depth': 5,
        'subsample': 0.8999999999999999,
        'objective': 'multi:softprob',
    }
    model = XGBClassifier(**hyperparams)
    model.fit(xtrain, ytrain)
    return model

def lgb_function(xtrain, ytrain):
    """Fit a LightGBM classifier with a fixed hyperparameter set and return it.

    Args:
        xtrain: Feature data passed to ``fit``.
        ytrain: Target labels passed to ``fit``.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    hyperparams = {
        'colsample_bytree': 0.9640178917734248,
        'min_child_samples': 149,
        'min_child_weight': 0.001,
        'num_leaves': 20,
        'reg_alpha': 50,
        'reg_lambda': 100,
        'subsample': 0.34289945963435475,
    }
    model = lgb.LGBMClassifier(**hyperparams)
    model.fit(xtrain, ytrain)
    return model

def cb_function(xtrain, ytrain):
    """Fit a CatBoost classifier with a fixed hyperparameter set and return it.

    Args:
        xtrain: Feature data passed to ``fit``.
        ytrain: Target labels passed to ``fit``.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    hyperparams = {
        'depth': 4,
        'learning_rate': 0.1,
        'l2_leaf_reg': 4,
        'iterations': 300,
    }
    model = cb.CatBoostClassifier(**hyperparams)
    model.fit(xtrain, ytrain)
    return model

# Train the three fixed-parameter models separately on every cluster and report
# the one with the highest macro F1 (as a percentage) on that cluster's
# held-out third.
for cluster_id in list_of_clusters:
    cluster_data = train_data[train_data['Clusters'] == cluster_id]

    cluster_features = cluster_data.drop(['Clusters', 'churn_risk_score'], axis=1)  # X
    cluster_label = cluster_data['churn_risk_score']  # Y

    # Cluster-local names: reusing xtrain/xtest/ytrain/ytest here would overwrite
    # the notebook-wide split that the other model cells depend on.
    c_xtrain, c_xtest, c_ytrain, c_ytest = train_test_split(
        cluster_features, cluster_label, test_size=1/3, random_state=355)

    cluster_scores = {}
    for label, trainer in (('XGB', xgb_function), ('LGB', lgb_function), ('CB', cb_function)):
        fitted = trainer(c_xtrain, c_ytrain)
        predictions = fitted.predict(c_xtest)
        cluster_scores[label] = 100 * f1_score(c_ytest, predictions, average="macro")

    # Pick the true maximum. The previous if/elif/else chain fell through to the
    # CatBoost branch whenever XGB and LGB tied, even if CatBoost scored lower.
    # max() keeps the first of equal scores, so ties resolve as XGB, LGB, CB.
    best_label = max(cluster_scores, key=cluster_scores.get)
    print('CLUSTER ', cluster_id, f' {best_label} SCORE: ', cluster_scores[best_label])


In [None]:
#for test data
# Fit the dedicated `test_kmeans` object. The original called `kmeans` here,
# which refitted the training model on the test set (discarding its training
# fit) and left `test_kmeans` unused. 'Clusters' is excluded so the cell can be
# re-run after that column has been added to test_data.
# NOTE(review): cluster ids from a model fitted separately on the test set are
# not guaranteed to correspond to the training cluster ids — confirm intent.
test_kmeans = KMeans(n_clusters=no_of_clusters, init='k-means++', random_state=1)
test_y_kmeans = test_kmeans.fit_predict(
    test_data.drop(columns='Clusters', errors='ignore')
)

In [None]:
test_y_kmeans

In [None]:
test_data['Clusters'] = test_y_kmeans

In [None]:
test_data['Clusters'] .unique()

In [None]:
test_list_of_clusters = test_data['Clusters'].unique()
test_list_of_clusters

In [None]:
test_data

### Submission

In [None]:
# Predict with exactly the columns the model was trained on. The clustering
# cells add a 'Clusters' column to test_data; passing it through would give the
# model one feature more than it was fitted with.
sub_csv = new_lgb_classifier.predict(
    test_data.drop(columns='Clusters', errors='ignore')
)

In [None]:
sub_csv

In [None]:
#sub_csv = sub_csv.reshape(-1) #for catboost only
sub_csv

In [None]:
submission_df = pd.DataFrame(columns=['customer_id', 'churn_risk_score'])

In [None]:
submission_df

In [None]:
submission_df['churn_risk_score'] = sub_csv
submission_df.shape

In [None]:
submission_df['customer_id'] = test_customer_id

In [None]:
submission_df

In [None]:
# index=False limits the file to the two submission columns; by default pandas
# also writes the row index as an extra unnamed first column.
submission_df.to_csv('HE_HackathonSubmission_3-04_lgb_classifier.csv', index=False)

In [None]:
test_data

In [None]:
# best model - HE_HackathonSubmission_lgb_hypertuned_classifier.csv - 76.30572