## One hot encoding
One-hot encoding transforms each categorical feature with `n_categories` possible values into `n_categories` binary features, exactly one of which is `1` while all the others are `0`.

## 1) OneHotEncoding Using `sklearn.preprocessing.OneHotEncoder`

#### `sklearn.preprocessing.OneHotEncoder`: encodes categorical features as a one-hot numeric array

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [36]:
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np

### Creating a one-hot encoder object

In [2]:
# Spell out the default policy: a category that was not seen during fit
# makes transform raise instead of being silently encoded.
enc = OneHotEncoder(handle_unknown='error')

In [9]:
# Each inner list is one sample holding a single categorical feature (the major).
majors = [[major] for major in ('Engineering', 'Math', 'Chemistry')]

In [10]:
# Learn the categories present in `majors`; the fitted encoder is echoed as the cell output.
enc.fit(majors)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [11]:
# The encoder was created with sparse=True, so transform returns a sparse matrix;
# convert it to a dense array to make the one-hot rows readable.
majors_encoded = enc.transform(majors)
majors_encoded.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [12]:
# Categories learned during fit, one array per input feature. A category's position
# in its array is the column that carries the 1 in the encoded output.
enc.categories_

[array(['Chemistry', 'Engineering', 'Math'], dtype=object)]

In [42]:
# 'Media Studies' and 'Stats' were not part of the data `enc` was fitted on; 'Math' was.
new_majors = [[major] for major in ('Media Studies', 'Math', 'Stats')]

In [43]:
# With handle_unknown='error' (the default) the encoder rejects categories it did
# not see during fit. The failure is the point of this cell, so catch it and show
# the message rather than leaving an uncaught exception that stops "Run All".
try:
    enc.transform(new_majors).toarray()
except ValueError as err:
    print('ValueError:', err)

ValueError: Found unknown categories ['Stats', 'Media Studies'] in column 0 during transform

In [44]:
# Same training data as `enc`, but unknown categories are tolerated at transform time.
# fit returns the fitted encoder, so construction and fitting can be chained.
enc_unk = OneHotEncoder(handle_unknown='ignore').fit(majors)
enc_unk

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [45]:
# Categories unseen during fit are encoded as all-zero rows; 'Math' keeps its column.
new_majors_encoded = enc_unk.transform(new_majors)
new_majors_encoded.toarray()

array([[0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.]])

### Reading the CSV file

In [18]:
# Load the student records that the remaining cells encode.
academic_info_path = 'datasets/academic_info.csv'
student_info = pd.read_csv(academic_info_path)
student_info

Unnamed: 0,StudentID,Department,Nationality,Batch
0,19022,Polical Science,Egypt,2010
1,19087,Journalism,Germany,2012
2,12809,Engineering,Germany,2014
3,12809,Math,China,2014


### Creating an encoder object to encode the `Department` column

In [19]:
# Separate encoder instance used only for the 'Department' column.
dept_ohe = OneHotEncoder()

In [20]:
# Double brackets select a one-column DataFrame (not a Series) to feed the encoder.
# The result is a sparse matrix, so the cell output only shows its summary.
department_column = student_info[['Department']]
dept_ohe_transformed = dept_ohe.fit_transform(department_column)
dept_ohe_transformed

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [21]:
# Dense view of the sparse result: one row per student, one column per department.
dept_ohe_transformed.toarray()

array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [22]:
# Department labels in the same order as the columns of the encoded matrix.
dept_ohe.categories_

[array(['Engineering', 'Journalism', 'Math', 'Polical Science'],
       dtype=object)]

### Printing the labels along with their encoded values

In [26]:
# Label each one-hot column with its department name.
# - `categories_` is a list with one array per encoded feature; only 'Department'
#   was encoded, so take element [0] to get a flat set of column labels rather
#   than passing the whole list of arrays.
# - `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
#   is the type it aliased, so the resulting frame is the same.
dept_df = pd.DataFrame(dept_ohe_transformed.toarray(),
                       columns=dept_ohe.categories_[0],
                       dtype=int)

dept_df

Unnamed: 0,Engineering,Journalism,Math,Polical Science
0,0,0,0,1
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0


### Creating an object to encode the column `Nationality`

In [27]:
# Separate encoder instance used only for the 'Nationality' column.
nationality_ohe = OneHotEncoder()

In [28]:
# Fit on the one-column 'Nationality' frame and encode it in a single step.
nationality_column = student_info[['Nationality']]
nationality_transformed = nationality_ohe.fit_transform(nationality_column)

In [29]:
# Dense view of the encoded nationalities: one row per student, one column per nationality.
nationality_transformed.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [30]:
# Nationality labels in the same order as the columns of the encoded matrix.
nationality_ohe.categories_

[array(['China', 'Egypt', 'Germany'], dtype=object)]

In [32]:
# Label each one-hot column with its nationality.
# - `categories_` is a list with one array per encoded feature; only 'Nationality'
#   was encoded, so take element [0] to get a flat set of column labels rather
#   than passing the whole list of arrays.
# - `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
#   is the type it aliased, so the resulting frame is the same.
nationality_df = pd.DataFrame(nationality_transformed.toarray(),
                              columns=nationality_ohe.categories_[0],
                              dtype=int)

nationality_df

Unnamed: 0,China,Egypt,Germany
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


## 2) Getting the same output as `OneHotEncoder` using `pandas.get_dummies`

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

In [33]:
# One indicator column per department value.
# pandas 2.0 changed the default dummy dtype to bool (True/False); request int
# explicitly so the columns are 0/1 on every pandas version, matching the
# integer frames built from OneHotEncoder earlier in the notebook.
dummy_dept = pd.get_dummies(student_info['Department'], dtype=int)

dummy_dept

Unnamed: 0,Engineering,Journalism,Math,Polical Science
0,0,0,0,1
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0


In [34]:
# One indicator column per nationality value.
# pandas 2.0 changed the default dummy dtype to bool (True/False); request int
# explicitly so the columns are 0/1 on every pandas version, matching the
# integer frames built from OneHotEncoder earlier in the notebook.
dummy_nationality = pd.get_dummies(student_info['Nationality'], dtype=int)

dummy_nationality

Unnamed: 0,China,Egypt,Germany
0,0,1,0
1,0,0,1
2,0,0,1
3,1,0,0


In [35]:
# Append the indicator columns to the student records.
# `student_info` is reassigned from itself, so a plain concat would add a second
# copy of every dummy column each time this cell is re-run. Dropping any dummy
# columns already present first (errors='ignore' makes this a no-op on the first
# run) keeps the cell idempotent without changing the variable name.
# NOTE(review): this assumes no original column shares a name with a category
# value, which holds for the data shown in this notebook.
encoded_columns = pd.concat([dummy_dept, dummy_nationality], axis=1)

student_info = pd.concat(
    [student_info.drop(columns=encoded_columns.columns, errors='ignore'),
     encoded_columns],
    axis=1,
)

student_info

Unnamed: 0,StudentID,Department,Nationality,Batch,Engineering,Journalism,Math,Polical Science,China,Egypt,Germany
0,19022,Polical Science,Egypt,2010,0,0,0,1,0,1,0
1,19087,Journalism,Germany,2012,0,1,0,0,0,0,1
2,12809,Engineering,Germany,2014,1,0,0,0,0,0,1
3,12809,Math,China,2014,0,0,1,0,1,0,0
