## Encoding:
   To convert the categorical value into numerical value for ML algorithm.
### Types of Encoding Methods
   * OneHotEncoding (from a sklearn library)
   * OneHotEncoding With Many Categorical Variables
   * Ordinal Encoding
   * Count/Frequency Encoding
   * LabelEncoding (from a sklearn library)
   * Mean/Target Encoding
   * Target Guided Ordinal Encoding
   * Probability Ratio Encoding

### OneHotEncoding With Many Categorical Variables

In [1]:
import numpy as np
import pandas as pd

df =  pd.read_csv('pokemon_data.csv')
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


In [2]:
df.Legendary.value_counts()

False    735
True      65
Name: Legendary, dtype: int64

In [3]:
df = df.drop(['#'], axis=1)

In [4]:
numerical_features = [feature for  feature in df.columns if df[feature].dtypes != 'O']
print(len(numerical_features))
numerical_features

8


['HP',
 'Attack',
 'Defense',
 'Sp. Atk',
 'Sp. Def',
 'Speed',
 'Generation',
 'Legendary']

#### Finding Categorical Values

In [5]:
categorical_features = [feature for feature in df.columns if feature not in numerical_features ]
print(len(categorical_features))
categorical_features

3


['Name', 'Type 1', 'Type 2']

#### Create dataframe only for the categorical variables

In [6]:
new_df = pd.read_csv('pokemon_data.csv',usecols=['Name', 'Type 1', 'Type 2'])
new_df.head()

Unnamed: 0,Name,Type 1,Type 2
0,Bulbasaur,Grass,Poison
1,Ivysaur,Grass,Poison
2,Venusaur,Grass,Poison
3,VenusaurMega Venusaur,Grass,Poison
4,Charmander,Fire,


In [7]:
# printing the unique values in the new df
for value in new_df.columns:
    print(f'{value} : {len(new_df[value].unique())}')

Name : 800
Type 1 : 18
Type 2 : 19


#### Count of Unique Values

In [8]:
new_df['Type 1'].value_counts().sort_values(ascending=False)

Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Electric     44
Rock         44
Ghost        32
Ground       32
Dragon       32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: Type 1, dtype: int64

#### List Of Most Frequent Categorical Values

In [9]:
top_10 = [x for x in new_df['Type 1'].value_counts().sort_values(ascending=False).head(10).index]
top_10

['Water',
 'Normal',
 'Grass',
 'Bug',
 'Psychic',
 'Fire',
 'Electric',
 'Rock',
 'Ghost',
 'Ground']

#### Make Binary

In [10]:
for value in top_10:
    new_df[value] = np.where(new_df['Type 1'] == value, 1, 0)
new_df[['Type 1']+top_10]

Unnamed: 0,Type 1,Water,Normal,Grass,Bug,Psychic,Fire,Electric,Rock,Ghost,Ground
0,Grass,0,0,1,0,0,0,0,0,0,0
1,Grass,0,0,1,0,0,0,0,0,0,0
2,Grass,0,0,1,0,0,0,0,0,0,0
3,Grass,0,0,1,0,0,0,0,0,0,0
4,Fire,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
795,Rock,0,0,0,0,0,0,0,1,0,0
796,Rock,0,0,0,0,0,0,0,1,0,0
797,Psychic,0,0,0,0,1,0,0,0,0,0
798,Psychic,0,0,0,0,1,0,0,0,0,0


In [11]:
def top_x(dataframe, columns, top_x_lables):
    for label in top_x_lables:
        dataframe[columns+'_'+label] = np.where(data[columns]==label, 1, 0)
data = pd.read_csv('pokemon_data.csv', usecols=['Name','Type 1','Type 2'])
top_x(data, 'Type 2', top_10)

In [12]:
data.head()

Unnamed: 0,Name,Type 1,Type 2,Type 2_Water,Type 2_Normal,Type 2_Grass,Type 2_Bug,Type 2_Psychic,Type 2_Fire,Type 2_Electric,Type 2_Rock,Type 2_Ghost,Type 2_Ground
0,Bulbasaur,Grass,Poison,0,0,0,0,0,0,0,0,0,0
1,Ivysaur,Grass,Poison,0,0,0,0,0,0,0,0,0,0
2,Venusaur,Grass,Poison,0,0,0,0,0,0,0,0,0,0
3,VenusaurMega Venusaur,Grass,Poison,0,0,0,0,0,0,0,0,0,0
4,Charmander,Fire,,0,0,0,0,0,0,0,0,0,0


In [13]:
top_x(data, 'Type 1', top_10)
data.head()

Unnamed: 0,Name,Type 1,Type 2,Type 2_Water,Type 2_Normal,Type 2_Grass,Type 2_Bug,Type 2_Psychic,Type 2_Fire,Type 2_Electric,...,Type 1_Water,Type 1_Normal,Type 1_Grass,Type 1_Bug,Type 1_Psychic,Type 1_Fire,Type 1_Electric,Type 1_Rock,Type 1_Ghost,Type 1_Ground
0,Bulbasaur,Grass,Poison,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,Ivysaur,Grass,Poison,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Venusaur,Grass,Poison,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,VenusaurMega Venusaur,Grass,Poison,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Charmander,Fire,,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Ordinal Encoding
   * Replacing categorical feature with rank order

In [14]:
import datetime

time = datetime.datetime.today()
time

datetime.datetime(2021, 9, 12, 13, 15, 22, 79438)

In [15]:
date = {time-datetime.timedelta(x) for x in range(0,15)}
date

{datetime.datetime(2021, 8, 29, 13, 15, 22, 79438),
 datetime.datetime(2021, 8, 30, 13, 15, 22, 79438),
 datetime.datetime(2021, 8, 31, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 1, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 2, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 3, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 4, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 5, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 6, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 7, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 8, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 9, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 10, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 11, 13, 15, 22, 79438),
 datetime.datetime(2021, 9, 12, 13, 15, 22, 79438)}

In [16]:
df = pd.DataFrame(date)
df.columns=['Days']
df.head()

Unnamed: 0,Days
0,2021-09-10 13:15:22.079438
1,2021-09-01 13:15:22.079438
2,2021-08-31 13:15:22.079438
3,2021-09-03 13:15:22.079438
4,2021-09-05 13:15:22.079438


In [17]:
df['Weekday'] = df['Days'].dt.day_name()
df.head()

Unnamed: 0,Days,Weekday
0,2021-09-10 13:15:22.079438,Friday
1,2021-09-01 13:15:22.079438,Wednesday
2,2021-08-31 13:15:22.079438,Tuesday
3,2021-09-03 13:15:22.079438,Friday
4,2021-09-05 13:15:22.079438,Sunday


In [18]:
df.Weekday.unique()

array(['Friday', 'Wednesday', 'Tuesday', 'Sunday', 'Saturday', 'Monday',
       'Thursday'], dtype=object)

In [19]:
dict_weekday = {'Monday': 0,
                'Tuesday': 1,
                'Wednesday': 2,
                'Thursday': 3,
                'Friday': 4,
                'Saturday': 5,
                'Sunday': 6}

In [20]:
df['Weekday_ordinal'] = df['Weekday'].map(dict_weekday)

In [21]:
df.head()

Unnamed: 0,Days,Weekday,Weekday_ordinal
0,2021-09-10 13:15:22.079438,Friday,4
1,2021-09-01 13:15:22.079438,Wednesday,2
2,2021-08-31 13:15:22.079438,Tuesday,1
3,2021-09-03 13:15:22.079438,Friday,4
4,2021-09-05 13:15:22.079438,Sunday,6


## Count or Frequency encoding
   * Replace the categorical value with count value by count the unique values 

In [22]:
df =  pd.read_csv('pokemon_data.csv', usecols=['Type 1'])
df.head()

Unnamed: 0,Type 1
0,Grass
1,Grass
2,Grass
3,Grass
4,Fire


In [23]:
# counting the value then convert those value into dict
val_count = df['Type 1'].value_counts().to_dict()
val_count

{'Water': 112,
 'Normal': 98,
 'Grass': 70,
 'Bug': 69,
 'Psychic': 57,
 'Fire': 52,
 'Electric': 44,
 'Rock': 44,
 'Dragon': 32,
 'Ground': 32,
 'Ghost': 32,
 'Dark': 31,
 'Poison': 28,
 'Steel': 27,
 'Fighting': 27,
 'Ice': 24,
 'Fairy': 17,
 'Flying': 4}

In [24]:
# mapping dict value  into a new categorical feature or replacing with same columns
df['Type 1'] = df['Type 1'].map(val_count)
df.head()

Unnamed: 0,Type 1
0,70
1,70
2,70
3,70
4,52


## Target Guided ordinal Encoding
   * Replacing categorical feature by taking mean of the target variable then generate rank label to that value

In [25]:
df = pd.read_csv('titanic.csv', usecols=['Cabin', 'Survived'])
df.head()

Unnamed: 0,Cabin,Survived
0,,0
1,C85,1
2,,1
3,C123,1
4,,0


###### We saw in this method in replacing missing categorical value by string('Missing') or anthing you want

In [26]:
df.Cabin = df.Cabin.fillna('Missing')
df.head()

Unnamed: 0,Cabin,Survived
0,Missing,0
1,C85,1
2,Missing,1
3,C123,1
4,Missing,0


In [27]:
df['Cabin'] = df['Cabin'].astype(str).str[0]

In [28]:
df.head()

Unnamed: 0,Cabin,Survived
0,M,0
1,C,1
2,M,1
3,C,1
4,M,0


In [29]:
mean = df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True)
mean

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [30]:
label = df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).index
label

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [31]:
encoding = {j:i for i,j in enumerate(label, 0)}
encoding

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [32]:
df['Cabin_targetEncod'] = df['Cabin'].map(encoding)
df.head()

Unnamed: 0,Cabin,Survived,Cabin_targetEncod
0,M,0,1
1,C,1,4
2,M,1,1
3,C,1,4
4,M,0,1


## Mean / Target encoding
   * Its same method compare to Target Guided Ordinal Encoding 
   * But dont provide rank for a label, replacing with mean value 

In [33]:
df = pd.read_csv('titanic.csv', usecols=['Cabin', 'Survived'])
df.head()

Unnamed: 0,Cabin,Survived
0,,0
1,C85,1
2,,1
3,C123,1
4,,0


In [34]:
df.Cabin = df.Cabin.fillna('Missing')
df.head()

Unnamed: 0,Cabin,Survived
0,Missing,0
1,C85,1
2,Missing,1
3,C123,1
4,Missing,0


In [35]:
df['Cabin'] = df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Cabin,Survived
0,M,0
1,C,1
2,M,1
3,C,1
4,M,0


In [36]:
# to dict this value
mean = df.groupby(['Cabin'])['Survived'].mean().sort_values(ascending=True).to_dict()
mean

{'T': 0.0,
 'M': 0.29985443959243085,
 'A': 0.4666666666666667,
 'G': 0.5,
 'C': 0.5932203389830508,
 'F': 0.6153846153846154,
 'B': 0.7446808510638298,
 'E': 0.75,
 'D': 0.7575757575757576}

In [37]:
df['Cabin_MeanEncoding'] = df['Cabin'].map(mean)
df.head()

Unnamed: 0,Cabin,Survived,Cabin_MeanEncoding
0,M,0,0.299854
1,C,1,0.59322
2,M,1,0.299854
3,C,1,0.59322
4,M,0,0.299854


## Probability ratio encoding
   * Probability of Survived based on Cabin---> Categorical Feature
   * Probability of Not Survived---> 1-probability(Survived)
   * probability(Survived)/probability(Not Survived)
   * Dictonary to map cabin with probability
   * replace with the categorical feature

In [38]:
df = pd.read_csv('titanic.csv', usecols=['Cabin', 'Survived'])
df.head()

Unnamed: 0,Cabin,Survived
0,,0
1,C85,1
2,,1
3,C123,1
4,,0


In [39]:
# Get the unique label in a features
df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [40]:
# replacing nan value with missing
df['Cabin'] = df['Cabin'].fillna('Missing')
df.head()

Unnamed: 0,Cabin,Survived
0,Missing,0
1,C85,1
2,Missing,1
3,C123,1
4,Missing,0


In [41]:
# extract the string value with 1st letter 
df['Cabin'] = df['Cabin'].astype(str).str[0]
df.head()

Unnamed: 0,Cabin,Survived
0,M,0
1,C,1
2,M,1
3,C,1
4,M,0


In [42]:
# For Cabin, Get the mean of survived
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [43]:
# Mapping the mean value with categorical feature
mean = df.groupby(['Cabin'])['Survived'].mean()
df['Cabin'] = df.Cabin.map(mean)

In [44]:
df.head()

Unnamed: 0,Cabin,Survived
0,0.299854,0
1,0.59322,1
2,0.299854,1
3,0.59322,1
4,0.299854,0


In [45]:
# create a cabin_death feature for finding probabiity (1-survived you get died probability)
df['Cabin_death'] = 1-df['Cabin']
df.head()

Unnamed: 0,Cabin,Survived,Cabin_death
0,0.299854,0,0.700146
1,0.59322,1,0.40678
2,0.299854,1,0.700146
3,0.59322,1,0.40678
4,0.299854,0,0.700146


In [46]:
# then find the probability by dividing survived/death and replacing with categorical values
df['Cabin'] = df['Cabin']/df['Cabin_death']
df.drop(columns='Cabin_death', inplace=True)
df.head(10)

Unnamed: 0,Cabin,Survived
0,0.428274,0
1,1.458333,1
2,0.428274,1
3,1.458333,1
4,0.428274,0
5,0.428274,0
6,3.0,0
7,0.428274,0
8,0.428274,1
9,0.428274,1
