In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm
import sys
import warnings
warnings.filterwarnings("ignore")

# Reading Data

In [50]:
# Load the raw in-vehicle coupon survey and preview its first rows.
data = pd.read_csv(filepath_or_buffer='in-vehicle-coupon-recommendation.csv')
data.head(n=5)

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [51]:
# Report the dataset dimensions and list every column name.
n_rows, n_columns = data.shape
print('Number of Data Points: ', n_rows)
print('Number of features: ', n_columns)
print('-'*100)
print('The attributes of data: ', data.columns.values)

Number of Data Points:  12684
Number of features:  26
----------------------------------------------------------------------------------------------------
The attributes of data:  ['destination' 'passanger' 'weather' 'temperature' 'time' 'coupon'
 'expiration' 'gender' 'age' 'maritalStatus' 'has_children' 'education'
 'occupation' 'income' 'car' 'Bar' 'CoffeeHouse' 'CarryAway'
 'RestaurantLessThan20' 'Restaurant20To50' 'toCoupon_GEQ5min'
 'toCoupon_GEQ15min' 'toCoupon_GEQ25min' 'direction_same' 'direction_opp'
 'Y']


In [52]:
# Class balance of the target: rows with Y == 1 (accepted) versus Y == 0 (rejected).
Y_value_counts = data.groupby('Y').Y.count()
total_rows = data.shape[0]
accepted_pct = round(Y_value_counts[1]/total_rows*100,3)
rejected_pct = round(Y_value_counts[0]/total_rows*100,3)
print('The number of users that are accepted the coupon is ',Y_value_counts[1],',',accepted_pct,'%')
print('The number of users that are rejected the coupon is ',Y_value_counts[0],',',rejected_pct,'%')

The number of users that are accepted the coupon is  7210 , 56.843 %
The number of users that are rejected the coupon is  5474 , 43.157 %


In [53]:
# data.info()

# Data Cleaning 

In [54]:
# Remove duplicates: keep the repeated rows aside for inspection, then drop them from `data`.
is_repeated = data.duplicated(keep='last')
duplicate = data.loc[is_repeated]
data = data.drop_duplicates(keep='first')
print(data.shape)

(12610, 26)


### Missing Values

In [55]:
# Missing values: per-column count and percentage of rows, shown only for affected columns.
missing_count = data.isnull().sum()
print('Is there any missing value present or not?',data.isnull().values.any())
missing_percentage = missing_count*100/len(data)
missing_value_df = pd.DataFrame({'missing_count': missing_count,'missing_percentage': missing_percentage})
missing_value_df.loc[missing_value_df['missing_count'] != 0]

Is there any missing value present or not? True


Unnamed: 0,missing_count,missing_percentage
car,12502,99.143537
Bar,107,0.848533
CoffeeHouse,217,1.720856
CarryAway,150,1.189532
RestaurantLessThan20,129,1.022998
Restaurant20To50,189,1.49881


- Feature 'car' is missing in about 99% of the rows, so drop it: with so few observed values, even imputed values would carry very little predictive power.

In [56]:
# 'car' is almost entirely missing, so remove the column.
data = data.drop(columns=['car'])

### Correlation of Features

In [57]:
# Pearson correlation matrix of the numeric columns (this is correlation, not covariance).
# The numeric columns are selected explicitly because pandas >= 2.0 no longer drops
# object columns silently inside corr() and raises instead.
data.select_dtypes(include='number').corr()

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
temperature,1.0,-0.018599,,-0.157089,-0.227165,0.097972,-0.097972,0.059393
has_children,-0.018599,1.0,,0.079434,-0.010773,-0.032353,0.032353,-0.045056
toCoupon_GEQ5min,,,,,,,,
toCoupon_GEQ15min,-0.157089,0.079434,,1.0,0.32126,-0.302066,0.302066,-0.082693
toCoupon_GEQ25min,-0.227165,-0.010773,,0.32126,1.0,-0.1899,0.1899,-0.108139
direction_same,0.097972,-0.032353,,-0.302066,-0.1899,1.0,-1.0,0.014932
direction_opp,-0.097972,0.032353,,0.302066,0.1899,-1.0,1.0,-0.014932
Y,0.059393,-0.045056,,-0.082693,-0.108139,0.014932,-0.014932,1.0


1. Feature ‘direction_same’ is perfectly negatively correlated with ‘direction_opp’ (correlation of -1.0): each is the complement of the other, so the two carry the same information.

2. The ‘toCoupon_GEQ5min’ feature has an undefined (NaN) correlation with every other feature because it takes the same value ‘1’ for all data points, which means every restaurant/bar is at least five minutes away from the driver.

So, drop both 'direction_opp' and 'toCoupon_GEQ5min' features.

In [58]:
# Drop the mirrored direction flag and the constant distance flag.
redundant_columns = ['direction_opp', 'toCoupon_GEQ5min']
data = data.drop(columns=redundant_columns)
data.shape

(12610, 23)

In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) of the remaining numeric columns.
data.describe()

Unnamed: 0,temperature,has_children,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,Y
count,12610.0,12610.0,12610.0,12610.0,12610.0,12610.0
mean,63.267248,0.414512,0.559794,0.116019,0.215543,0.567565
std,19.153386,0.492657,0.496432,0.32026,0.411215,0.495434
min,30.0,0.0,0.0,0.0,0.0,0.0
25%,55.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,0.0,1.0,0.0,0.0,1.0
75%,80.0,1.0,1.0,0.0,0.0,1.0
max,80.0,1.0,1.0,1.0,1.0,1.0


### Analysis of 'direction_same' attribute

In [60]:
# Analysis of 'direction_same' feature
# Cross-tabulate the feature against the target once. fill_value/reindex guarantee a count
# (possibly 0) for every (value, outcome) pair, so the columns built below always line up
# row by row even if some value never occurs with one of the outcomes.
outcome_counts = (data.groupby(['direction_same', 'Y']).size()
                      .unstack(fill_value=0)
                      .reindex(columns=[0, 1], fill_value=0))
df = pd.DataFrame({'direction_same': outcome_counts.index.values})
df['Total_Count'] = outcome_counts.sum(axis=1).values
df['Total_%'] = round(df['Total_Count']/data['direction_same'].shape[0]*100,3)
df['Accepted'] = outcome_counts[1].values   # rows with Y == 1
df['Rejected'] = outcome_counts[0].values   # rows with Y == 0
df['%Accepted'] = round(df['Accepted']/df['Total_Count']*100,3)
df['%Rejected'] = round(df['Rejected']/df['Total_Count']*100,3)
df

Unnamed: 0,direction_same,Total_Count,Total_%,Accepted,Rejected,%Accepted,%Rejected
0,0,9892,78.446,5576,4316,56.369,43.631
1,1,2718,21.554,1581,1137,58.168,41.832


- The 'direction_same' feature is '0' for 78% of the rows and '1' for 22%. Both values have nearly the same acceptance ratio (56.4% vs 58.2%), so this feature is not very useful; drop 'direction_same'.

In [61]:
# Remove the low-signal direction flag and confirm the new shape.
data = data.drop(columns=['direction_same'])
data.shape

(12610, 22)

# Train-Test Split

In [62]:
# Separate the feature frame from the target vector.
X = data.drop(columns=['Y'])
y = data.Y.values

In [63]:
# train test split
# Stratified 80/20 split. random_state is fixed so that the split, and every statistic derived
# from it further down, is identical after Restart Kernel -> Run All.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(10088, 21) (10088,)
(2522, 21) (2522,)


In [64]:
# Column dtypes and non-null counts of the training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10088 entries, 10387 to 9948
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           10088 non-null  object
 1   passanger             10088 non-null  object
 2   weather               10088 non-null  object
 3   temperature           10088 non-null  int64 
 4   time                  10088 non-null  object
 5   coupon                10088 non-null  object
 6   expiration            10088 non-null  object
 7   gender                10088 non-null  object
 8   age                   10088 non-null  object
 9   maritalStatus         10088 non-null  object
 10  has_children          10088 non-null  int64 
 11  education             10088 non-null  object
 12  occupation            10088 non-null  object
 13  income                10088 non-null  object
 14  Bar                   9999 non-null   object
 15  CoffeeHouse           9917 non-nu

In [65]:
# True if any cell of the training split is still null.
print('Is there any missing value present or not?',X_train.isna().to_numpy().any())

Is there any missing value present or not? True


# Mode Imputation

In [66]:
# mode imputation for missing values in train data:
# each visit-frequency column is filled with its own most frequent training value.
for column_name in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
    most_frequent = X_train[column_name].value_counts().index[0]
    X_train[column_name] = X_train[column_name].fillna(most_frequent)

In [67]:
# mode imputation for missing values in test data:
# the fill value is the most frequent value of the same column in the TRAINING split.
for column_name in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
    train_most_frequent = X_train[column_name].value_counts().index[0]
    X_test[column_name] = X_test[column_name].fillna(train_most_frequent)

In [68]:
# After imputation no null should remain in the training split.
print('Is there any missing value present in X_train?',X_train.isna().to_numpy().any())

Is there any missing value present in X_train? False


In [69]:
# After imputation no null should remain in the test split.
print('Is there any missing value present in X_test?',X_test.isna().to_numpy().any())

Is there any missing value present in X_test? False


# Feature Engineering

#### FE -- to_Coupon

In [70]:
# FE -- to_Coupon is combination of two features, toCoupon_GEQ15min and toCoupon_GEQ25min
#   0 -> toCoupon_GEQ15min == 0
#   1 -> toCoupon_GEQ15min == 1 and toCoupon_GEQ25min == 0
#   2 -> every other combination
# Vectorised with np.select: the previous row loop rebuilt list(column) on every iteration.
geq15_is_zero = X_train['toCoupon_GEQ15min'] == 0
geq15_only = (X_train['toCoupon_GEQ15min'] == 1) & (X_train['toCoupon_GEQ25min'] == 0)

X_train['to_Coupon'] = np.select([geq15_is_zero, geq15_only], [0, 1], default=2)
print('Unique values:',X_train['to_Coupon'].unique())
print('-'*50)
X_train['to_Coupon'].describe()

Unique values: [0 2 1]
--------------------------------------------------


count    10088.000000
mean         0.673672
std          0.670863
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          2.000000
Name: to_Coupon, dtype: float64

In [71]:
# FE -- to_Coupon is combination of two features, toCoupon_GEQ15min and toCoupon_GEQ25min
#   0 -> toCoupon_GEQ15min == 0
#   1 -> toCoupon_GEQ15min == 1 and toCoupon_GEQ25min == 0
#   2 -> every other combination
# Vectorised with np.select: the previous row loop rebuilt list(column) on every iteration.
geq15_is_zero = X_test['toCoupon_GEQ15min'] == 0
geq15_only = (X_test['toCoupon_GEQ15min'] == 1) & (X_test['toCoupon_GEQ25min'] == 0)

X_test['to_Coupon'] = np.select([geq15_is_zero, geq15_only], [0, 1], default=2)
print('Unique values:',X_test['to_Coupon'].unique())
print('-'*50)
X_test['to_Coupon'].describe()

Unique values: [1 0 2]
--------------------------------------------------


count    2522.000000
mean        0.684377
std         0.675039
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: to_Coupon, dtype: float64

#### FE--coupon_freq

In [72]:
# FE -- coupon_freq is combination of five features, RestaurantLessThan20, CoffeeHouse, CarryAway, Bar, Restaurant20To50
# For each row, take the visit frequency of the venue type that matches the offered coupon.
coupon_to_freq_column = {
    'Restaurant(<20)': 'RestaurantLessThan20',
    'Coffee House': 'CoffeeHouse',
    'Carry out & Take away': 'CarryAway',
    'Bar': 'Bar',
    'Restaurant(20-50)': 'Restaurant20To50',
}

# Fail with a clear message instead of an opaque length-mismatch error on an unknown coupon type.
unknown_coupons = set(X_train['coupon'].unique()) - set(coupon_to_freq_column)
if unknown_coupons:
    raise ValueError('No visit-frequency column is known for coupon type(s): %s' % sorted(unknown_coupons, key=str))

# Column-wise selection: rows whose coupon matches are replaced by that venue's frequency column.
coupon_freq = pd.Series(np.nan, index=X_train.index, dtype=object)
for coupon_name, freq_column in coupon_to_freq_column.items():
    coupon_freq = coupon_freq.where(X_train['coupon'] != coupon_name, X_train[freq_column])

X_train['coupon_freq'] = coupon_freq
print('Unique values:',X_train['coupon_freq'].unique())
print('-'*50)
X_train['coupon_freq'].describe()

Unique values: ['4~8' '1~3' 'less1' 'never' 'gt8']
--------------------------------------------------


count     10088
unique        5
top         1~3
freq       3110
Name: coupon_freq, dtype: object

In [73]:
# FE -- coupon_freq is combination of five features, RestaurantLessThan20, CoffeeHouse, CarryAway, Bar, Restaurant20To50
# For each row, take the visit frequency of the venue type that matches the offered coupon.
coupon_to_freq_column = {
    'Restaurant(<20)': 'RestaurantLessThan20',
    'Coffee House': 'CoffeeHouse',
    'Carry out & Take away': 'CarryAway',
    'Bar': 'Bar',
    'Restaurant(20-50)': 'Restaurant20To50',
}

# Fail with a clear message instead of an opaque length-mismatch error on an unknown coupon type.
unknown_coupons = set(X_test['coupon'].unique()) - set(coupon_to_freq_column)
if unknown_coupons:
    raise ValueError('No visit-frequency column is known for coupon type(s): %s' % sorted(unknown_coupons, key=str))

# Column-wise selection: rows whose coupon matches are replaced by that venue's frequency column.
coupon_freq = pd.Series(np.nan, index=X_test.index, dtype=object)
for coupon_name, freq_column in coupon_to_freq_column.items():
    coupon_freq = coupon_freq.where(X_test['coupon'] != coupon_name, X_test[freq_column])

X_test['coupon_freq'] = coupon_freq
print('Unique values:',X_test['coupon_freq'].unique()) 
print('-'*50)
X_test['coupon_freq'].describe()

Unique values: ['less1' '1~3' '4~8' 'never' 'gt8']
--------------------------------------------------


count     2522
unique       5
top        1~3
freq       760
Name: coupon_freq, dtype: object

#### FE -- occupation_class

In [74]:
# Cardinality and most frequent value of the raw 'occupation' column (training split).
X_train['occupation'].describe()

count          10088
unique            25
top       Unemployed
freq            1484
Name: occupation, dtype: object

In [75]:
# The 'occupation' feature has 25 distinct values, which would make the encoded data very sparse.
# FE -- occupation_class groups every occupation into one of five acceptance classes.
occupations_by_class = {
    'High_Acceptance': [
        'Healthcare Support', 'Construction & Extraction', 'Healthcare Practitioners & Technical',
        'Protective Service', 'Architecture & Engineering',
    ],
    'Medium_High_Acceptance': [
        'Production Occupations', 'Student', 'Office & Administrative Support',
        'Transportation & Material Moving', 'Building & Grounds Cleaning & Maintenance',
    ],
    'Medium_Acceptance': [
        'Management', 'Food Preparation & Serving Related', 'Life Physical Social Science',
        'Business & Financial', 'Computer & Mathematical',
    ],
    'Medium_Low_Acceptance': [
        'Sales & Related', 'Personal Care & Service', 'Unemployed',
        'Farming Fishing & Forestry', 'Installation Maintenance & Repair',
    ],
    'Low_Acceptance': [
        'Education&Training&Library', 'Arts Design Entertainment Sports & Media',
        'Community & Social Services', 'Legal', 'Retired',
    ],
}
# Flatten to the occupation -> class lookup used by .map() below.
occupation_dict = {occupation: acceptance_class
                   for acceptance_class, occupations in occupations_by_class.items()
                   for occupation in occupations}
X_train['occupation_class'] = X_train['occupation'].map(occupation_dict)
print('Unique values:',X_train['occupation_class'].unique())
print('-'*50)
X_train['occupation_class'].describe()

Unique values: ['Low_Acceptance' 'Medium_Acceptance' 'High_Acceptance'
 'Medium_Low_Acceptance' 'Medium_High_Acceptance']
--------------------------------------------------


count                     10088
unique                        5
top       Medium_Low_Acceptance
freq                       2635
Name: occupation_class, dtype: object

In [76]:
# Cardinality and most frequent value of the raw 'occupation' column (test split).
X_test['occupation'].describe()

count           2522
unique            25
top       Unemployed
freq             377
Name: occupation, dtype: object

In [77]:
# FE -- occupation_class for the test split.
# Reuse `occupation_dict` built in the training cell above rather than repeating the
# 25-entry literal, so both splits are guaranteed to use the same grouping.
X_test['occupation_class'] = X_test['occupation'].map(occupation_dict)
print('Unique values:',X_test['occupation_class'].unique())
print('-'*50)
X_test['occupation_class'].describe()

Unique values: ['Low_Acceptance' 'Medium_Acceptance' 'Medium_Low_Acceptance'
 'Medium_High_Acceptance' 'High_Acceptance']
--------------------------------------------------


count                      2522
unique                        5
top       Medium_Low_Acceptance
freq                        665
Name: occupation_class, dtype: object

In [78]:
# After Feature Engineering, remove the raw columns that the engineered features replace.
engineered_source_columns = ['toCoupon_GEQ15min','toCoupon_GEQ25min','Bar','CoffeeHouse','CarryAway',
                             'RestaurantLessThan20','Restaurant20To50','occupation']
X_train = X_train.drop(columns=engineered_source_columns)
X_test = X_test.drop(columns=engineered_source_columns)
print('X_train:',X_train.shape,'\nX_test:',X_test.shape)
print('-'*50)
print(X_train.columns.values)

X_train: (10088, 16) 
X_test: (2522, 16)
--------------------------------------------------
['destination' 'passanger' 'weather' 'temperature' 'time' 'coupon'
 'expiration' 'gender' 'age' 'maritalStatus' 'has_children' 'education'
 'income' 'to_Coupon' 'coupon_freq' 'occupation_class']


# Encoding

### Ordinal Encoding

In [79]:
# Category order per ordinal-encoded column; position i is used for the i-th encoded column.
order = [
    ['Work','Home','No Urgent Place'],                                      # destination
    ['Kid(s)','Alone','Partner','Friend(s)'],                               # passanger
    ['Rainy','Snowy','Sunny'],                                              # weather
    [30,55,80],                                                             # temperature
    ['7AM','10AM','2PM','6PM','10PM'],                                      # time
    ['Bar','Restaurant(20-50)','Coffee House','Restaurant(<20)','Carry out & Take away'],  # coupon
    ['2h','1d'],                                                            # expiration
    ['Female','Male'],                                                      # gender
    ['below21','21','26','31','36','41','46','50plus'],                     # age
    ['Widowed','Divorced','Married partner','Unmarried partner','Single'],  # maritalStatus
    ['Some High School','High School Graduate','Some college - no degree','Associates degree',
     'Bachelors degree','Graduate degree (Masters or Doctorate)'],          # education
    ['Less than $12500','$12500 - $24999','$25000 - $37499','$37500 - $49999','$50000 - $62499',
     '$62500 - $74999','$75000 - $87499','$87500 - $99999','$100000 or More'],  # income
    ['never','less1','1~3','4~8','gt8'],                                    # coupon_freq
    ['Low_Acceptance','Medium_Low_Acceptance','Medium_Acceptance','Medium_High_Acceptance','High_Acceptance'],  # occupation_class
]

In [80]:
# Columns to ordinal-encode, listed in the same order as the category lists in `order`.
ordinal_columns = ['destination','passanger','weather','temperature','time','coupon','expiration',
                   'gender','age','maritalStatus','education','income','coupon_freq','occupation_class']

Ordinal_enc = OrdinalEncoder(categories=order)
X_train_Ordinal_encoding = pd.DataFrame(Ordinal_enc.fit_transform(X_train[ordinal_columns]),
                                        columns=ordinal_columns)

# 'has_children' and 'to_Coupon' are numeric features and are copied through unchanged.
# Assign by position (.values): the encoded frame has a fresh 0..n-1 index while X_train keeps
# the shuffled row labels from the split, so a label-aligned assignment would pair values with
# the wrong rows and insert NaN wherever a label is absent.
X_train_Ordinal_encoding['has_children'] = X_train['has_children'].values
X_train_Ordinal_encoding['to_Coupon'] = X_train['to_Coupon'].values

print('X_train_Ordinal_encoding:',X_train_Ordinal_encoding.shape)

X_train_Ordinal_encoding: (10088, 16)


In [81]:
# Columns to ordinal-encode, listed in the same order as the category lists in `order`.
ordinal_columns = ['destination','passanger','weather','temperature','time','coupon','expiration',
                   'gender','age','maritalStatus','education','income','coupon_freq','occupation_class']

# Reuse the encoder fitted on the training split in the previous cell (transform only, no refit).
X_test_Ordinal_encoding = pd.DataFrame(Ordinal_enc.transform(X_test[ordinal_columns]),
                                       columns=ordinal_columns)

# 'has_children' and 'to_Coupon' are numeric features and are copied through unchanged.
# Assign by position (.values): the encoded frame has a fresh 0..n-1 index while X_test keeps
# the shuffled row labels from the split, so a label-aligned assignment would pair values with
# the wrong rows and insert NaN wherever a label is absent.
X_test_Ordinal_encoding['has_children'] = X_test['has_children'].values
X_test_Ordinal_encoding['to_Coupon'] = X_test['to_Coupon'].values

print('X_test_Ordinal_encoding:',X_test_Ordinal_encoding.shape)

X_test_Ordinal_encoding: (2522, 16)


### Frequency Encoding

In [82]:
def frequency_enc(column_name,X,X_fit=None):
  """Return `X[column_name]` with every category replaced by its relative frequency.

  Parameters
  ----------
  column_name : name of the column to encode.
  X : DataFrame whose column is encoded.
  X_fit : optional DataFrame the frequencies are learned from (for example the training
      split when `X` is the test split). Defaults to `X` itself.

  Returns
  -------
  Series indexed like `X`. When `X_fit` is given, categories that never occur in `X_fit`
  are encoded as 0.0.
  """
  reference = X if X_fit is None else X_fit
  frequencies = reference.groupby(column_name).size()/len(reference)
  encoded = X[column_name].map(frequencies)
  if X_fit is not None:
    # a category unseen in the reference frame has an observed frequency of zero
    encoded = encoded.fillna(0.0)
  return encoded

In [83]:
# Frequency-encode every training column into '<column>_freq_enc'.
X_train_frequency_encoding = pd.DataFrame()
for column_name in X_train.columns.values:
  X_train_frequency_encoding[column_name+'_freq_enc'] = frequency_enc(column_name,X_train)

print('X_train_frequency_encoding:',X_train_frequency_encoding.shape)

X_train_frequency_encoding: (10088, 16)


In [84]:
# Frequency-encode every test column with the category frequencies of the TRAINING split,
# so a given category is represented by the same number in both splits.
X_test_frequency_encoding = pd.DataFrame()
for column_name in X_test.columns.values:
  train_frequencies = X_train.groupby(column_name).size()/len(X_train)
  # categories never seen in training have an observed training frequency of zero
  X_test_frequency_encoding[column_name+'_freq_enc'] = X_test[column_name].map(train_frequencies).fillna(0.0)

print('X_test_frequency_encoding:',X_test_frequency_encoding.shape)

X_test_frequency_encoding: (2522, 16)


### Target Encoding

In [85]:
def target_enc(column_name,X):
  """Return `X[column_name]` with every category replaced by its mean training target.

  `X` must be row-aligned with the global `y_train`. The target is grouped through a
  temporary Series, so `X` itself is left untouched (no 'Y_train' column is written
  into the feature frame).
  """
  target = pd.Series(y_train, index=X.index)
  category_means = target.groupby(X[column_name]).mean()
  return X[column_name].map(category_means)

X_train_target_encoding = pd.DataFrame()
for column_name in X_train.columns.values:
  X_train_target_encoding[column_name+'_target_enc'] = target_enc(column_name,X_train)

print('X_train_target_encoding:',X_train_target_encoding.shape)

X_train_target_encoding: (10088, 16)


In [86]:
def target_enc(column_name,X):
  """Return test column `X[column_name]` target-encoded with TRAINING statistics.

  The per-category target means are learned from the globals `X_train` / `y_train` only;
  the test labels are never consulted, so no label information leaks into the test
  features. Categories absent from the training split fall back to the overall training
  target mean. `X` is not modified.

  Note: this definition replaces the `target_enc` defined in the training cell above.
  """
  train_target = pd.Series(y_train, index=X_train.index)
  category_means = train_target.groupby(X_train[column_name]).mean()
  return X[column_name].map(category_means).fillna(train_target.mean())

X_test_target_encoding = pd.DataFrame()
for column_name in X_test.columns.values:
  X_test_target_encoding[column_name+'_target_enc'] = target_enc(column_name,X_test)

print('X_test_target_encoding:',X_test_target_encoding.shape)

X_test_target_encoding: (2522, 16)


### Response Encoding

In [87]:
# response encoding function
def response_coding(feature,X,Y,X_fit=None,Y_fit=None):
    """Response-encode a categorical column as per-category class proportions.

    Parameters
    ----------
    feature : name of the categorical (string) column.
    X : DataFrame to encode. Side effect kept from the original implementation:
        `X[feature]` is overwritten in place with its cleaned labels.
    Y : binary target (0/1) aligned row by row with `X`; used to estimate the proportions
        when `X_fit` is not given.
    X_fit, Y_fit : optional frame and target the proportions are estimated from instead
        (for example the training split when `X` is the test split). `X_fit[feature]` is
        cleaned in place as well. Categories of `X` absent from `X_fit` get 0.5 / 0.5.

    Returns
    -------
    (X_response_0, X_response_1) : two arrays of shape (len(X), 1) holding, for each row,
    the share of its category with target 0 and with target 1 respectively.
    """
    def _clean(labels):
        # '~' -> '_', other punctuation -> space, collapse/trim spaces, snake_case, lower-case.
        # regex= is explicit because the default of Series.str.replace changed in pandas 2.0;
        # relying on it would silently skip the two pattern-based steps there.
        return (labels.str.replace('~', '_', regex=False)
                      .str.replace('[^a-zA-Z0-9_ ]', ' ', regex=True)
                      .str.replace(' +', ' ', regex=True)
                      .str.strip()
                      .str.replace(' ', '_', regex=False)
                      .str.lower())

    X[feature] = _clean(X[feature])
    if X_fit is None:
        fit_labels = X[feature]
        fit_target = np.asarray(Y)
    else:
        if Y_fit is None:
            raise ValueError('Y_fit is required when X_fit is given')
        X_fit[feature] = _clean(X_fit[feature])
        fit_labels = X_fit[feature]
        fit_target = np.asarray(Y_fit)

    # Share of target 0 / target 1 inside every category of the fitting data.
    category_keys = fit_labels.values
    share_0 = pd.Series(fit_target == 0).groupby(category_keys).mean()
    share_1 = pd.Series(fit_target == 1).groupby(category_keys).mean()

    X_response_0 = X[feature].map(share_0)
    X_response_1 = X[feature].map(share_1)
    if X_fit is not None:
        # no evidence for a category unseen while fitting: use an uninformative 50/50
        X_response_0 = X_response_0.fillna(0.5)
        X_response_1 = X_response_1.fillna(0.5)
    return X_response_0.values.reshape(-1,1),X_response_1.values.reshape(-1,1)

In [88]:
# Response-encode every categorical column of the training split.
# Each call returns two (n_rows, 1) arrays: the per-category share of Y == 0 and of Y == 1.
# response_coding also overwrites the column in X_train with its cleaned labels.
X_train_destination_0,X_train_destination_1 = response_coding('destination',X_train,y_train)
X_train_passanger_0,X_train_passanger_1 = response_coding('passanger',X_train,y_train)
X_train_weather_0,X_train_weather_1 = response_coding('weather',X_train,y_train)
X_train_time_0,X_train_time_1 = response_coding('time',X_train,y_train)
X_train_coupon_0,X_train_coupon_1 = response_coding('coupon',X_train,y_train)
X_train_expiration_0,X_train_expiration_1 = response_coding('expiration',X_train,y_train)
X_train_gender_0,X_train_gender_1 = response_coding('gender',X_train,y_train)
X_train_age_0,X_train_age_1 = response_coding('age',X_train,y_train)
X_train_maritalStatus_0,X_train_maritalStatus_1 = response_coding('maritalStatus',X_train,y_train)
X_train_education_0,X_train_education_1 = response_coding('education',X_train,y_train)
X_train_income_0,X_train_income_1 = response_coding('income',X_train,y_train)
X_train_coupon_freq_0,X_train_coupon_freq_1 = response_coding('coupon_freq',X_train,y_train)
X_train_occupation_class_0,X_train_occupation_class_1 = response_coding('occupation_class',X_train,y_train)

# Same encoding for the test split.
# NOTE(review): these calls pass y_test, so the test features are computed from the test
# labels themselves (label leakage). The proportions should come from the training split
# instead -- confirm and fix before trusting any test-set metric built on these arrays.
X_test_destination_0,X_test_destination_1 = response_coding('destination',X_test,y_test)
X_test_passanger_0,X_test_passanger_1 = response_coding('passanger',X_test,y_test)
X_test_weather_0,X_test_weather_1 = response_coding('weather',X_test,y_test)
X_test_time_0,X_test_time_1 = response_coding('time',X_test,y_test)
X_test_coupon_0,X_test_coupon_1 = response_coding('coupon',X_test,y_test)
X_test_expiration_0,X_test_expiration_1 = response_coding('expiration',X_test,y_test)
X_test_gender_0,X_test_gender_1 = response_coding('gender',X_test,y_test)
X_test_age_0,X_test_age_1 = response_coding('age',X_test,y_test)
X_test_maritalStatus_0,X_test_maritalStatus_1 = response_coding('maritalStatus',X_test,y_test)
X_test_education_0,X_test_education_1 = response_coding('education',X_test,y_test)
X_test_income_0,X_test_income_1 = response_coding('income',X_test,y_test)
X_test_coupon_freq_0,X_test_coupon_freq_1 = response_coding('coupon_freq',X_test,y_test)
X_test_occupation_class_0,X_test_occupation_class_1 = response_coding('occupation_class',X_test,y_test)

In [89]:
# Normalization of numerical features
def norm(column_name,X,X_fit=None):
    """Scale a numeric column to unit L2 norm and return it as an (n_rows, 1) array.

    Parameters
    ----------
    column_name : name of the numeric column.
    X : DataFrame whose column is scaled.
    X_fit : optional DataFrame whose column supplies the norm (for example the training
        split when `X` is the test split), so that both frames share one scale.
        Defaults to `X` itself, i.e. the column is divided by its own L2 norm.

    An all-zero reference column is returned unscaled (the divisor falls back to 1).
    """
    reference = X if X_fit is None else X_fit
    scale = np.linalg.norm(reference[column_name].values.astype(float))
    if scale == 0:
        scale = 1.0
    return X[column_name].values.astype(float).reshape(-1,1) / scale

In [90]:
X_train_temperature_norm = norm('temperature',X_train)
X_train_has_children_norm = norm('has_children',X_train)
X_train_to_Coupon_norm = norm('to_Coupon',X_train)

def _scale_test_by_train_norm(column_name):
    """Scale a test column by the L2 norm of the same TRAINING column.

    Dividing each split by its own norm would put the splits on different scales, because
    the norm grows with the number of rows; sharing the training norm keeps equal raw
    values equal after scaling.
    """
    train_scale = np.linalg.norm(X_train[column_name].values.astype(float))
    if train_scale == 0:
        train_scale = 1.0
    return X_test[column_name].values.astype(float).reshape(-1,1) / train_scale

X_test_temperature_norm = _scale_test_by_train_norm('temperature')
X_test_has_children_norm = _scale_test_by_train_norm('has_children')
X_test_to_Coupon_norm = _scale_test_by_train_norm('to_Coupon')

In [91]:
# Assemble the response-encoded matrices: two columns (share of Y==0, share of Y==1) per
# categorical feature, followed by the three normalised numeric features.
train_response_blocks = [
    X_train_destination_0, X_train_destination_1,
    X_train_passanger_0, X_train_passanger_1,
    X_train_weather_0, X_train_weather_1,
    X_train_time_0, X_train_time_1,
    X_train_coupon_0, X_train_coupon_1,
    X_train_expiration_0, X_train_expiration_1,
    X_train_gender_0, X_train_gender_1,
    X_train_age_0, X_train_age_1,
    X_train_maritalStatus_0, X_train_maritalStatus_1,
    X_train_education_0, X_train_education_1,
    X_train_income_0, X_train_income_1,
    X_train_coupon_freq_0, X_train_coupon_freq_1,
    X_train_occupation_class_0, X_train_occupation_class_1,
    X_train_temperature_norm, X_train_has_children_norm, X_train_to_Coupon_norm,
]
test_response_blocks = [
    X_test_destination_0, X_test_destination_1,
    X_test_passanger_0, X_test_passanger_1,
    X_test_weather_0, X_test_weather_1,
    X_test_time_0, X_test_time_1,
    X_test_coupon_0, X_test_coupon_1,
    X_test_expiration_0, X_test_expiration_1,
    X_test_gender_0, X_test_gender_1,
    X_test_age_0, X_test_age_1,
    X_test_maritalStatus_0, X_test_maritalStatus_1,
    X_test_education_0, X_test_education_1,
    X_test_income_0, X_test_income_1,
    X_test_coupon_freq_0, X_test_coupon_freq_1,
    X_test_occupation_class_0, X_test_occupation_class_1,
    X_test_temperature_norm, X_test_has_children_norm, X_test_to_Coupon_norm,
]
X_train_response_encoding = np.hstack(train_response_blocks)
X_test_response_encoding = np.hstack(test_response_blocks)
print('X_train_response_encoding:',X_train_response_encoding.shape,'\nX_test_response_encoding:',X_test_response_encoding.shape)

X_train_response_encoding: (10088, 29) 
X_test_response_encoding: (2522, 29)


### One Hot Encoding

In [92]:
# one hot encoding function
def ohe(column_name,X,X_fit=None):
    """One-hot encode a categorical column with a binary CountVectorizer.

    Parameters
    ----------
    column_name : name of the categorical (string) column.
    X : DataFrame to encode. Side effect kept from the original implementation:
        `X[column_name]` is overwritten in place with its cleaned labels.
    X_fit : optional DataFrame the vocabulary is learned from (for example the training
        split when `X` is the test split), which gives both frames identical columns.
        `X_fit[column_name]` is cleaned in place as well. Defaults to `X` itself.

    Returns
    -------
    Sparse matrix with one row per row of `X` and one column per vocabulary token.
    """
    def _clean(labels):
        # Turn every label into a single lower-case token so the vectorizer cannot split it.
        # regex= is explicit because the default of Series.str.replace changed in pandas 2.0;
        # relying on it would silently skip the two pattern-based steps there.
        return (labels.str.replace('~', '_', regex=False)
                      .str.replace('[^a-zA-Z0-9_ ]', ' ', regex=True)
                      .str.replace(' +', ' ', regex=True)
                      .str.strip()
                      .str.replace(' ', '_', regex=False)
                      .str.lower())

    X[column_name] = _clean(X[column_name])
    vectorizer = CountVectorizer(binary=True)
    if X_fit is None:
        return vectorizer.fit_transform(X[column_name].values)
    X_fit[column_name] = _clean(X_fit[column_name])
    vectorizer.fit(X_fit[column_name].values)
    return vectorizer.transform(X[column_name].values)


In [93]:
# One-hot encode every categorical column of the training split.
# ohe also overwrites the column in X_train with its cleaned labels.
X_train_destination_ohe = ohe('destination',X_train)
X_train_passanger_ohe = ohe('passanger',X_train)
X_train_weather_ohe = ohe('weather',X_train)
X_train_time_ohe = ohe('time',X_train)
X_train_coupon_ohe = ohe('coupon',X_train)
X_train_expiration_ohe = ohe('expiration',X_train)
X_train_gender_ohe = ohe('gender',X_train)
X_train_age_ohe = ohe('age',X_train)
X_train_maritalStatus_ohe = ohe('maritalStatus',X_train)
X_train_education_ohe = ohe('education',X_train)
X_train_income_ohe = ohe('income',X_train)
X_train_coupon_freq_ohe = ohe('coupon_freq',X_train)
X_train_occupation_class_ohe = ohe('occupation_class',X_train)

# Same encoding for the test split.
# NOTE(review): every ohe call fits a brand-new CountVectorizer, so the test columns match
# the training columns only if each feature has exactly the same set of categories in both
# splits -- verify, or fit the vocabulary on the training split only.
X_test_destination_ohe = ohe('destination',X_test)
X_test_passanger_ohe = ohe('passanger',X_test)
X_test_weather_ohe = ohe('weather',X_test)
X_test_time_ohe = ohe('time',X_test)
X_test_coupon_ohe = ohe('coupon',X_test)
X_test_expiration_ohe = ohe('expiration',X_test)
X_test_gender_ohe = ohe('gender',X_test)
X_test_age_ohe = ohe('age',X_test)
X_test_maritalStatus_ohe = ohe('maritalStatus',X_test)
X_test_education_ohe = ohe('education',X_test)
X_test_income_ohe = ohe('income',X_test)
X_test_coupon_freq_ohe = ohe('coupon_freq',X_test)
X_test_occupation_class_ohe = ohe('occupation_class',X_test)

In [94]:
# Normalization of numerical features
def norm(column_name,X,X_fit=None):
    """Scale a numeric column to unit L2 norm and return it as an (n_rows, 1) array.

    Parameters
    ----------
    column_name : name of the numeric column.
    X : DataFrame whose column is scaled.
    X_fit : optional DataFrame whose column supplies the norm (for example the training
        split when `X` is the test split), so that both frames share one scale.
        Defaults to `X` itself, i.e. the column is divided by its own L2 norm.

    An all-zero reference column is returned unscaled (the divisor falls back to 1).
    """
    reference = X if X_fit is None else X_fit
    scale = np.linalg.norm(reference[column_name].values.astype(float))
    if scale == 0:
        scale = 1.0
    return X[column_name].values.astype(float).reshape(-1,1) / scale

In [95]:
X_train_temperature_norm = norm('temperature',X_train)
X_train_has_children_norm = norm('has_children',X_train)
X_train_to_Coupon_norm = norm('to_Coupon',X_train)

def _scale_test_by_train_norm(column_name):
    """Scale a test column by the L2 norm of the same TRAINING column.

    Dividing each split by its own norm would put the splits on different scales, because
    the norm grows with the number of rows; sharing the training norm keeps equal raw
    values equal after scaling.
    """
    train_scale = np.linalg.norm(X_train[column_name].values.astype(float))
    if train_scale == 0:
        train_scale = 1.0
    return X_test[column_name].values.astype(float).reshape(-1,1) / train_scale

X_test_temperature_norm = _scale_test_by_train_norm('temperature')
X_test_has_children_norm = _scale_test_by_train_norm('has_children')
X_test_to_Coupon_norm = _scale_test_by_train_norm('to_Coupon')

In [96]:
from scipy.sparse import hstack

# Assemble the one-hot matrices: all categorical one-hot blocks followed by the three
# normalised numeric features, stored in CSR format.
train_ohe_blocks = [
    X_train_destination_ohe, X_train_passanger_ohe, X_train_weather_ohe, X_train_time_ohe,
    X_train_coupon_ohe, X_train_expiration_ohe, X_train_gender_ohe, X_train_age_ohe,
    X_train_maritalStatus_ohe, X_train_education_ohe, X_train_income_ohe,
    X_train_coupon_freq_ohe, X_train_occupation_class_ohe,
    X_train_temperature_norm, X_train_has_children_norm, X_train_to_Coupon_norm,
]
test_ohe_blocks = [
    X_test_destination_ohe, X_test_passanger_ohe, X_test_weather_ohe, X_test_time_ohe,
    X_test_coupon_ohe, X_test_expiration_ohe, X_test_gender_ohe, X_test_age_ohe,
    X_test_maritalStatus_ohe, X_test_education_ohe, X_test_income_ohe,
    X_test_coupon_freq_ohe, X_test_occupation_class_ohe,
    X_test_temperature_norm, X_test_has_children_norm, X_test_to_Coupon_norm,
]
X_train_ohe = hstack(train_ohe_blocks).tocsr()
X_test_ohe = hstack(test_ohe_blocks).tocsr()
print('X_train_ohe:',X_train_ohe.shape,'\nX_test_ohe:',X_test_ohe.shape)

X_train_ohe: (10088, 65) 
X_test_ohe: (2522, 65)
