In [60]:
# importing required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# To display maximum columns of dataframe on screen
pd.pandas.set_option('display.max_columns', None)

### 1.0 Importing dataset after filling missing values

In [61]:
dataset=pd.read_csv('Travel_missing_filled.csv')
dataset.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [4]:
dataset.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

### Observation
1. No null values present
2. we can move for encoding categorical variables

### 2.0 Getting no of categories in each feature

In [5]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtypes=='O' and feature not in ['CustomerID']]
for feature in categorical_features:
    print("The feature is '{}' and number of categories are '{}'".format(feature, len(dataset[feature].unique())))

The feature is 'TypeofContact' and number of categories are '3'
The feature is 'Occupation' and number of categories are '4'
The feature is 'Gender' and number of categories are '3'
The feature is 'ProductPitched' and number of categories are '5'
The feature is 'MaritalStatus' and number of categories are '4'
The feature is 'Designation' and number of categories are '5'


### 3.0 Encoding categorical features

### 3.1 Binary one hot encoding

1. In one-hot encoding, we represent a categorical variable as a group of binary variables, where each binary variable represents one category. The binary variable indicates whether the category is present in an observation (1) or not (0).

In [9]:
#encoding all of the categorical variables into k binaries each, capturing the result in a new dataframe
data1 = dataset.copy()

encoded_binary=pd.get_dummies(data1[categorical_features])
encoded_binary.head()
    

Unnamed: 0,TypeofContact_Company Invited,TypeofContact_Self Enquiry,TypeofContact_value,Occupation_Free Lancer,Occupation_Large Business,Occupation_Salaried,Occupation_Small Business,Gender_Fe Male,Gender_Female,Gender_Male,ProductPitched_Basic,ProductPitched_Deluxe,ProductPitched_King,ProductPitched_Standard,ProductPitched_Super Deluxe,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Unmarried,Designation_AVP,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_VP
0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0


### 3.2 one-hot encoding of frequent categories
1. One-hot encoding represents each category of a categorical variable with a binary variable. Hence, one-hot encoding of highly cardinal variables or datasets with multiple categorical features can expand the feature space dramatically. To reduce the number of binary variables, we can perform one-hot encoding of the most frequent categories only. One-hot encoding of top categories is equivalent to treating the remaining, less frequent categories as a single, unique category

In [36]:
data1 = dataset.copy()

# checking all unique categories in continuous features
for feature in categorical_features:
    print(data1[feature].unique())

['Self Enquiry' 'Company Invited' 'value']
['Salaried' 'Free Lancer' 'Small Business' 'Large Business']
['Female' 'Male' 'Fe Male']
['Deluxe' 'Basic' 'Standard' 'Super Deluxe' 'King']
['Single' 'Divorced' 'Married' 'Unmarried']
['Manager' 'Executive' 'Senior Manager' 'AVP' 'VP']


In [21]:
# getting count of observations per unique category

for feature in categorical_features:
    print(data1[feature].value_counts().sort_values(ascending=False).head())


Self Enquiry       3444
Company Invited    1419
value                25
Name: TypeofContact, dtype: int64
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: Occupation, dtype: int64
Male       2916
Female     1817
Fe Male     155
Name: Gender, dtype: int64
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: ProductPitched, dtype: int64
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: MaritalStatus, dtype: int64
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: Designation, dtype: int64


In [23]:
## lets do encoding

for feature in categorical_features:
    # capturing most frequent categories in code
    all_cat=[cat for cat in data1[feature].value_counts().sort_values(ascending=False).head().index]
    # adding a binary variable for each category
    for cat in all_cat:
        data1[feature+'_'+cat]=np.where(data1[feature]==cat,1,0)

data1.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,TypeofContact_Self Enquiry,TypeofContact_Company Invited,TypeofContact_value,Occupation_Salaried,Occupation_Small Business,Occupation_Large Business,Occupation_Free Lancer,Gender_Male,Gender_Female,Gender_Fe Male,ProductPitched_Basic,ProductPitched_Deluxe,ProductPitched_Standard,ProductPitched_Super Deluxe,ProductPitched_King,MaritalStatus_Married,MaritalStatus_Divorced,MaritalStatus_Single,MaritalStatus_Unmarried,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_AVP,Designation_VP
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0


### 3.3 Replacing categories with ordinal numbers
1. Ordinal encoding consists of replacing the categories with digits from 1 to k, where k is the number of distinct categories of the variable. The numbers are assigned arbitrarily. Ordinal encoding is better suited for nonlinear machine learning models, which can navigate through the arbitrarily assigned digits to try and find patterns that relate to the target.

In [37]:
data1=dataset.copy()

In [38]:
for feature in categorical_features:
    ordinal_mapping_dict= {k: i for i, k in enumerate(data1[feature].unique(), 0) }
    data1[feature]=data1[feature].map(ordinal_mapping_dict)
    
data1.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,0,3,6.0,0,0,3,3.0,0,3.0,0,1.0,1,2,1,0.0,0,20993.0
1,200001,0,49.0,1,1,14.0,0,1,3,4.0,0,4.0,1,2.0,0,3,1,2.0,0,20130.0
2,200002,1,37.0,0,1,8.0,1,1,3,4.0,1,3.0,0,7.0,1,3,0,0.0,1,17090.0
3,200003,0,33.0,1,1,9.0,0,0,2,3.0,1,3.0,1,2.0,1,5,1,1.0,1,17909.0
4,200004,0,36.0,0,1,8.0,2,1,2,3.0,1,4.0,1,1.0,0,5,1,0.0,1,18468.0


### 3.4 Replacing categories with counts or frequency of observations
1. In count or frequency encoding, we replace the categories with the count or the percentage of observations with that category. That is, if 10 out of 100 observations show the category blue for the variable color, we would replace blue with 10 when doing count encoding, or by 0.1 if performing frequency encoding. These techniques, which capture the representation of each label in a dataset, are very popular in data science competitions. The assumption is that the number of observations per category is somewhat predictive of the target.

### 3.4.1 Replacing with counts

In [39]:
data1=dataset.copy()

for feature in categorical_features:
    count_dict = data1[feature].value_counts().to_dict()
    data1[feature]=data1[feature].map(count_dict)
    
data1.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,3444,3,6.0,2368,1817,3,3.0,1732,3.0,916,1.0,1,2,1,0.0,1732,20993.0
1,200001,0,49.0,1419,1,14.0,2368,2916,3,4.0,1732,4.0,950,2.0,0,3,1,2.0,1732,20130.0
2,200002,1,37.0,3444,1,8.0,2,2916,3,4.0,1842,3.0,916,7.0,1,3,0,0.0,1842,17090.0
3,200003,0,33.0,1419,1,9.0,2368,1817,2,3.0,1842,3.0,950,2.0,1,5,1,1.0,1842,17909.0
4,200004,0,36.0,3444,1,8.0,2084,2916,2,3.0,1842,4.0,950,1.0,0,5,1,0.0,1842,18468.0


### 3.4.2 Replacing with frequency of observations

In [41]:
data1=dataset.copy()

for feature in categorical_features:
    count_dict = round((data1[feature].value_counts()/len(data1)),2).to_dict()
    data1[feature]=data1[feature].map(count_dict)
    
data1.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,0.7,3,6.0,0.48,0.37,3,3.0,0.35,3.0,0.19,1.0,1,2,1,0.0,0.35,20993.0
1,200001,0,49.0,0.29,1,14.0,0.48,0.6,3,4.0,0.35,4.0,0.19,2.0,0,3,1,2.0,0.35,20130.0
2,200002,1,37.0,0.7,1,8.0,0.0,0.6,3,4.0,0.38,3.0,0.19,7.0,1,3,0,0.0,0.38,17090.0
3,200003,0,33.0,0.29,1,9.0,0.48,0.37,2,3.0,0.38,3.0,0.19,2.0,1,5,1,1.0,0.38,17909.0
4,200004,0,36.0,0.7,1,8.0,0.43,0.6,2,3.0,0.38,4.0,0.19,1.0,0,5,1,0.0,0.38,18468.0


**Note:** if two different categories are present in the same percentage ofobservations, they will be replaced by the same value, which may lead to information loss.

### 3.5 Grouping rare or infrequent categories
1. Rare values are those categories that are present only in a small percentage of the observations. There is no rule of thumb to determine how small is a small percentage, but typically, any value below 5 % can be considered rare. Infrequent labels often appear only on the train set or only on the test set, therefore making the algorithms prone to overfitting or unable to score an observation. To avoid these complications, we can group infrequent categories into a new category called Rare or Other.

In [44]:
# function to get list of rare categories with certain tolerance

def frequent_label(in_data, in_feature, tolerance):
    temp = in_data[in_feature].value_counts()/len(in_data)
    frequent_cat=[obs for obs in temp.loc[temp>tolerance].index.values]
    return frequent_cat

In [57]:
data1=dataset.copy()

for feature in categorical_features:
    frequent_cat=frequent_label(data1, feature, 0.05)
    data1[feature]=np.where(data1[feature].isin(frequent_cat), data1[feature], 'Rare')
    
    ### now lets encode all features
    all_cat=[cat for cat in data1[feature].value_counts().sort_values(ascending=False).head().index]
    # adding a binary variable for each category
    for cat in all_cat:
        data1[feature+'_'+cat]=np.where(data1[feature]==cat,1,0)

# checking if the encoding is successful
data1.head()
    

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,TypeofContact_Self Enquiry,TypeofContact_Company Invited,TypeofContact_Rare,Occupation_Salaried,Occupation_Small Business,Occupation_Large Business,Occupation_Rare,Gender_Male,Gender_Female,Gender_Rare,ProductPitched_Basic,ProductPitched_Deluxe,ProductPitched_Standard,ProductPitched_Super Deluxe,ProductPitched_Rare,MaritalStatus_Married,MaritalStatus_Divorced,MaritalStatus_Single,MaritalStatus_Unmarried,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_AVP,Designation_Rare
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
2,200002,1,37.0,Self Enquiry,1,8.0,Rare,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0


In [58]:
# exporting this dataset to csv
data1.to_csv('Travel_missing_filled_encoded.csv', index=False)