# Goal
One-hot encoding of a categorical feature in Python, using pandas and scikit-learn, and how to reverse the encoding.

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#### Sample Data

In [2]:
# Toy data set: six course ids, each paired with one of three distinct zip codes.
df = pd.DataFrame(
    data=np.array(
        [
            [1, 1851],
            [2, 1852],
            [3, 1852],
            [4, 1854],
            [5, 1854],
            [6, 1854],
        ]
    ),
    columns=["courseid", "zipcodes"],
)
df

Unnamed: 0,courseid,zipcodes
0,1,1851
1,2,1852
2,3,1852
3,4,1854
4,5,1854
5,6,1854


### 1. Using the pandas.get_dummies function

In [3]:
# One indicator column per distinct zip code; `courseid` is passed through untouched.
# The dtype is pinned because pandas >= 2.0 defaults to boolean dummies, whereas the
# 0/1 table shown below (and the later `value == 1` filter) assumes integer flags.
df_one = pd.get_dummies(df, columns=["zipcodes"], dtype=np.uint8)
df_one

Unnamed: 0,courseid,zipcodes_1851,zipcodes_1852,zipcodes_1854
0,1,1,0,0
1,2,0,1,0
2,3,0,1,0
3,4,0,0,1
4,5,0,0,1
5,6,0,0,1


### 2. Using Sklearn OneHotEncoder

In [17]:
# Encoder with default settings; the repr printed after `fit` below shows
# handle_unknown='error' and sparse=True for this environment.
ohe = OneHotEncoder()

In [28]:
# Learn the distinct zip codes. Selecting with a list of columns yields the
# 2-D (n_samples, 1) array that the encoder expects.
ohe.fit(df[["zipcodes"]].values)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [34]:
# Encode the zip codes with the fitted encoder and convert the result to a dense
# NumPy array via `.toarray()` so it can be displayed and wrapped in a DataFrame.
df2 = ohe.transform(df.zipcodes.values.reshape(-1, 1)).toarray()
# Bare expression so the cell displays the array (a dangling `df2 =` is a SyntaxError).
df2

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [40]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out`. Prefer the new accessor and fall back on older releases
# so the cell runs on either; both yield the `x0_<category>` labels shown below.
if hasattr(ohe, "get_feature_names_out"):
    feature_names = ohe.get_feature_names_out()
else:
    feature_names = ohe.get_feature_names()

# Label the dense one-hot matrix with the encoder-generated column names.
final_df = pd.DataFrame(df2, columns=feature_names)
final_df

Unnamed: 0,x0_1851,x0_1852,x0_1854
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
5,0.0,0.0,1.0


### Reversing One Hot Encoding to get back to the original df

In [6]:
# Unpivot the dummy columns: each (courseid, dummy column) pair becomes one row that
# holds the dummy column's name and its 0/1 flag.
melted = pd.melt(
    df_one,
    id_vars=["courseid"],
    var_name="zipcodes_string",
    value_name="value",
)

melted.head()

Unnamed: 0,courseid,zipcodes_string,value
0,1,zipcodes_1851,1
1,2,zipcodes_1851,0
2,3,zipcodes_1851,0
3,4,zipcodes_1851,0
4,5,zipcodes_1851,0


In [7]:
# Keep only the rows whose flag is set, i.e. the single zip code column that is
# "hot" for each course, and renumber the index from zero.
m2 = melted.loc[melted["value"].eq(1)].reset_index(drop=True)
m2.head()

Unnamed: 0,courseid,zipcodes_string,value
0,1,zipcodes_1851,1
1,2,zipcodes_1852,1
2,3,zipcodes_1852,1
3,4,zipcodes_1854,1
4,5,zipcodes_1854,1


In [9]:
# Recover the numeric zip code from the dummy column name ("zipcodes_1851" -> 1851).
m2["zipcode"] = (
    m2["zipcodes_string"]
    .str.extract(r'([0-9]+)', expand=False)
    .astype(int)
)
m2[["courseid", "zipcode"]].head()

Unnamed: 0,courseid,zipcode
0,1,1851
1,2,1852
2,3,1852
3,4,1854
4,5,1854


TA-DA!!  👽