## 1. One hot encoding on all the values
## 2. One hot encoding on top 10 values
## 3. Replacing the variables with their count
## 4. Ordinal Number Encoding
## 5. Target Guided Ordinal Encoding
## 6. Mean Encoding
## 7. Probability Ratio Encoding

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## One Hot Encoding for Categorical data 

In [None]:
#Load only the six columns X1..X6 of the Mercedes-Benz training set
data = pd.read_csv(
    '../input/mercedesbenz-greener-manufacturing/train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
#Preview the first rows
data.head()

In [None]:
#Report how many distinct labels each column contains
for column in data.columns:
    n_labels = len(data[column].unique())
    print(column, ' : ', n_labels, 'labels')

# 1. One hot encoding on all the values

In [None]:
#If we one hot encode every label, let's see how many columns will be generated
#(drop_first=True drops the first dummy of each original column)
pd.get_dummies(data,drop_first =True ).shape

#It says it will generate 117 columns

# 2. One hot encoding on top 10 values


### Apply one hot encoding on top 10 values of each column (Showing for X2 first)

In [None]:
#Find the 10 most frequent values of X2:
#count every label, sort the counts in descending order, keep the first 10
a = data['X2'].value_counts()
b = a.sort_values(ascending = False)
b.head(10)
#Equivalent one-liner:
#data['X2'].value_counts().sort_values(ascending = False).head(10)

In [None]:
#Collect the labels (index values) of the 10 most frequent X2 values into a list
l = list(data['X2'].value_counts().sort_values(ascending = False).head(10).index)
print(l)


In [None]:
# Create dummy (0/1) columns for a chosen set of labels of one categorical variable
def one_hot_encoding_top_x(data, variable, l):
    """Add one 0/1 indicator column to ``data`` for each label in ``l``.

    The dataframe is modified in place. For every label a new column named
    ``<variable>_<label>`` is created; it holds 1 where ``data[variable]``
    equals the label and 0 elsewhere. Rows whose value is not in ``l`` get 0
    in all of the new columns.

    The variable name is used as a prefix so that the same label occurring in
    two different variables produces two distinct columns instead of one
    overwriting the other, and so that a label can never overwrite an existing
    column that happens to have the same name.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe containing the column ``variable``.
    variable : str
        Name of the column to encode.
    l : iterable
        Labels to encode (e.g. the most frequent labels); its length controls
        how many indicator columns are added.
    """
    for label in l:
        data['{}_{}'.format(variable, label)] = np.where(data[variable] == label, 1, 0)
#Encode X2 using the labels collected in l (the function adds the indicator
#columns to data in place), then inspect the first 20 rows
one_hot_encoding_top_x(data, 'X2', l)
data.head(20)

In [None]:
#You can now do the same thing for all the other columns
#After completing this for all the columns, drop the original columns (X1,X2,X3,X4,X5,X6)

### Advantages of one hot encoding on top variables

* Does not require hours of variable exploration
* Straightforward Implementation
* Does not expand the no. of columns massively

### Disadvantages

* Does not add any information that makes the variables more predictive
* Does not keep the information of the ignored (less frequent) labels

# 3. Replacing the variables with their count

In [None]:
#Load only columns X1 and X2 of the Mercedes-Benz training set
df = pd.read_csv(
    '../input/mercedesbenz-greener-manufacturing/train.csv',
    usecols=['X1', 'X2'],
)
#Preview the first rows
df.head()

In [None]:
#Count how often each X2 label occurs and keep the counts as a {label: count} dictionary
dict_var = df['X2'].value_counts().to_dict()
dict_var

In [None]:
#Preview the first X2 values
df['X2'].head()

In [None]:
#Replace every X2 label with the number of times it occurs in the column
df['X2'] = df['X2'].map(dict_var)
df

## Advantages

* Easy to implement
* Does not increase the feature dimension size (columns)

## Disadvantages 

* If two labels have the same count, they are replaced by the same value, which may lose important information

# 4. Ordinal Number Encoding

In [None]:
import datetime

In [None]:
#Current local date and time
today = datetime.datetime.today()
today

In [None]:
#Subtract a number of days from today's date (2 days in this example)
today - datetime.timedelta(days=2)

In [None]:
#Build a list of 15 datetimes: today, then each of the 14 preceding days
days = [today - datetime.timedelta(days=offset) for offset in range(15)]

In [None]:
#Put the datetimes into a dataframe with a single column named 'Day'
data = pd.DataFrame(days, columns=['Day'])

In [None]:
#Display the dataframe
data

In [None]:
#Get the full weekday name of each date (strftime's %A directive, e.g. 'Monday' in an English locale)
data['Weekday']=data['Day'].dt.strftime("%A")

In [None]:
#Preview the first rows with the new Weekday column
data.head()

In [None]:
#Map each weekday name to its rank within the week (Monday=1 ... Sunday=7)
#The keys must match the strings stored in data['Weekday'] exactly: a weekday
#without a matching key is mapped to NaN
#The mapping is not called `dict` so that the builtin of that name stays usable
weekday_rank = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}
#Create a new column holding the rank of each day
data['ordinal_rank'] = data['Weekday'].map(weekday_rank)

In [None]:
#Preview the first rows with the new ordinal_rank column
data.head()

# 5. Target Guided Ordinal Encoding
1. Order the labels according to the mean of the target for each label
2. Replace each label with its rank in that ordering, so a higher rank means a higher probability of the target being 1

In [None]:
#Load only the Cabin and Survived columns of the Titanic training set
df = pd.read_csv(
    '../input/titanic/train.csv',
    usecols=['Cabin', 'Survived'],
)
#Preview the first rows
df.head()

In [None]:
#Fill the missing Cabin values with the word 'Missing'
#The result is assigned back instead of calling fillna(inplace=True) on the
#df['Cabin'] selection: under pandas Copy-on-Write that in-place call updates
#a temporary object and leaves df unchanged
df['Cabin'] = df['Cabin'].fillna('Missing')

In [None]:
#Keep only the first character of each Cabin value (e.g. 'C85' becomes 'C')
df['Cabin']=df['Cabin'].astype(str).str[0]

In [None]:
#Preview the first rows after shortening the Cabin values
df.head()

In [None]:
#List the distinct values now present in the Cabin column
df['Cabin'].unique()

In [None]:
#Mean of the Survived column for each Cabin value, to compare the groups
df.groupby(['Cabin'])['Survived'].mean()

In [None]:
#Cabin values ordered from the lowest to the highest mean of Survived
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

In [None]:
#Store the Cabin values, sorted by ascending mean of Survived, in ordinal_labels
ordinal_labels=df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

In [None]:
#Give each Cabin value its position in the sorted order as its rank
#The ranks run from 0 to len(ordinal_labels) - 1
ordinal_labels2 = {cabin: rank for rank, cabin in enumerate(ordinal_labels)}
ordinal_labels2

In [None]:
#Create a new column holding the rank of each row's Cabin value
df['Cabin_ordinal_labels']=df['Cabin'].map(ordinal_labels2)
df.head()

# 6. Mean Encoding

In [None]:
#Reload the Cabin and Survived columns of the Titanic training set
df = pd.read_csv(
    '../input/titanic/train.csv',
    usecols=['Cabin', 'Survived'],
)
#Preview the first rows
df.head()

In [None]:
#Fill the missing Cabin values with the word 'Missing'
#The result is assigned back instead of calling fillna(inplace=True) on the
#df['Cabin'] selection: under pandas Copy-on-Write that in-place call updates
#a temporary object and leaves df unchanged
df['Cabin'] = df['Cabin'].fillna('Missing')

In [None]:
#Keep only the first character of each Cabin value (e.g. 'C85' becomes 'C')
df['Cabin']=df['Cabin'].astype(str).str[0]

In [None]:
#List the distinct values now present in the Cabin column
df['Cabin'].unique()

In [None]:
#Compute the mean of Survived for each Cabin value and store it as a {cabin: mean} dictionary
mean_ordinal=df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

In [None]:
#Create a new column holding, for each row, the mean of Survived of its Cabin value
df['mean_ordinal_encode']=df['Cabin'].map(mean_ordinal)
df.head()

# 7. Probability Ratio Encoding

In [None]:
#Reload the Cabin and Survived columns of the Titanic training set
df = pd.read_csv(
    '../input/titanic/train.csv',
    usecols=['Cabin', 'Survived'],
)
#Preview the first rows
df.head()

In [None]:
#Fill the missing Cabin values with the word 'Missing'
#The result is assigned back instead of calling fillna(inplace=True) on the
#df['Cabin'] selection: under pandas Copy-on-Write that in-place call updates
#a temporary object and leaves df unchanged
df['Cabin'] = df['Cabin'].fillna('Missing')

In [None]:
#Keep only the first character of each Cabin value (e.g. 'C85' becomes 'C')
df['Cabin']=df['Cabin'].astype(str).str[0]

In [None]:
#List the distinct values now present in the Cabin column
df['Cabin'].unique()

In [None]:
#Create a variable holding the mean of Survived grouped by Cabin
mean_prob=df.groupby(['Cabin'])['Survived'].mean()
mean_prob

In [None]:
#Create a new dataframe from the per-Cabin means computed above
a = pd.DataFrame(mean_prob)
a

In [None]:
#Create the Died column as the complement of the Survived mean (1 - Survived)
a['Died']=1-a['Survived']

In [None]:
#Preview the first rows with the new Died column
a.head()

In [None]:
#Create a column holding the probability ratio Survived / Died for each Cabin value
#NOTE: a Cabin value whose Died share is 0 would give a division by zero here
a['Prob'] = a['Survived']/a['Died']

In [None]:
#Preview the first rows with the new Prob column
a.head()

In [None]:
#Convert the Prob column of dataframe a into a {cabin: ratio} dictionary
probablity_encoded = a.Prob.to_dict()

In [None]:
#Create a new column in the original dataframe (df) holding the probability ratio of each row's Cabin value, using map
df['Cabin_encoded'] = df['Cabin'].map(probablity_encoded)

In [None]:
#Preview the first rows with the new Cabin_encoded column
df.head()