In [1]:
# Reference: https://chrisalbon.com/machine-learning/convert_pandas_categorical_column_into_integers_for_scikit-learn.html

In [31]:
from sklearn import preprocessing
import pandas as pd
from sklearn.pipeline import Pipeline

In [12]:
# Toy data set: several observations per patient, with a string-valued
# 'score' column that the following cells encode as integers.
raw_data = {
    'patient': [1, 1, 1, 2, 2],
    'obs': [1, 2, 3, 1, 2],
    'treatment': [0, 1, 0, 1, 0],
    'score': ['strong', 'weak', 'normal', 'weak', 'strong'],
}

In [13]:
# Build the frame from the dict; the explicit column list pins the column
# order instead of leaving it to the dict's iteration order.
df = pd.DataFrame(
    data=raw_data,
    columns=['patient', 'obs', 'treatment', 'score'],
)

In [14]:
# Display the freshly built frame; 'score' still holds the raw string labels.
df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,strong
1,1,2,1,weak
2,1,3,0,normal
3,2,1,1,weak
4,2,2,0,strong


In [15]:
# Create an (as yet unfitted) label encoder; it is fitted on df['score'] below.
le = preprocessing.LabelEncoder()

In [16]:
# Learn the distinct labels present in df['score']. fit() returns the encoder
# itself, which is why the cell output is just `LabelEncoder()`.
le.fit(df['score'])

LabelEncoder()

In [17]:
# Show the labels the encoder learned. A label's position in this list is the
# integer transform() assigns to it (here: normal -> 0, strong -> 1, weak -> 2).
list(le.classes_)

['normal', 'strong', 'weak']

In [18]:
# Encode the string labels as integers. This returns a new array; df itself
# is not modified by this call (see the unchanged frame displayed next).
le.transform(df['score'])

array([1, 2, 0, 2, 1], dtype=int64)

In [19]:
# Confirm df is unchanged: 'score' still contains the original strings.
df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,strong
1,1,2,1,weak
2,1,3,0,normal
3,2,1,1,weak
4,2,2,0,strong


In [26]:
# Replace the string labels in 'score' with their integer codes.
# The array returned by transform() is assigned directly; wrapping it in
# list() only made a redundant copy as a list of NumPy scalars.
# NOTE: this overwrites the original column, so the cell is not re-runnable
# as-is -- on a second run 'score' holds integers the encoder was never
# fitted on. Rebuild df before re-executing this cell.
df['score'] = le.transform(df['score'])

In [27]:
# 'score' now holds the integer codes instead of the string labels.
df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,1
1,1,2,1,2
2,1,3,0,0
3,2,1,1,2
4,2,2,0,1


In [28]:
# Map the integer codes back to their string labels and keep them next to the
# codes for comparison. Despite its name, 'score_transform' holds the
# *decoded* (inverse-transformed) labels, not the encoded ones.
# The array from inverse_transform() is assigned directly; the former list()
# wrapper was a redundant copy.
df['score_transform'] = le.inverse_transform(df['score'])

In [29]:
# 'score_transform' reproduces the original strings, confirming the round trip.
df

Unnamed: 0,patient,obs,treatment,score,score_transform
0,1,1,0,1,strong
1,1,2,1,2,weak
2,1,3,0,0,normal
3,2,1,1,2,weak
4,2,2,0,1,strong


In [46]:
# Second example: a small people table whose nominal 'city' column is turned
# into dummy (indicator) variables in the cells that follow.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
}
# The explicit column list pins the column order of the frame.
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [47]:
# One indicator column per distinct city value. The result is merged back
# onto df by row index in the next cell.
df_city_dummies = pd.get_dummies(df['city'])

In [48]:
# Attach the dummy columns to their rows by index, then drop the original
# 'city' column, which the indicators now replace.
# Like the original merge + del, this consumes 'city', so the cell cannot be
# re-run without rebuilding df first.
df = df.join(df_city_dummies).drop('city', axis=1)
df

Unnamed: 0,first_name,last_name,age,Baltimore,Boston,Douglas,Miami,San Francisco
0,Jason,Miller,42,0.0,0.0,0.0,0.0,1.0
1,Molly,Jacobson,52,1.0,0.0,0.0,0.0,0.0
2,Tina,Ali,36,0.0,0.0,0.0,1.0,0.0
3,Jake,Milner,24,0.0,0.0,1.0,0.0,0.0
4,Amy,Cooze,73,0.0,1.0,0.0,0.0,0.0


In [49]:
# Another way of converting a nominal column to dummy variables: scikit-learn's LabelEncoder followed by OneHotEncoder (instead of pd.get_dummies)

In [61]:
# Rebuild the people table from scratch so this example starts from a frame
# that still has its 'city' column.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
}
# The explicit column list pins the column order of the frame.
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)

In [62]:
# Fit a throwaway LabelEncoder on 'city' and encode it in one step.
# Despite the 'df_' prefix, the result is a NumPy array (see the echoed
# output), not a DataFrame. The encoder object is not kept, so its classes_
# are not available to later cells.
df_label_encoded = preprocessing.LabelEncoder().fit_transform(df['city'])
df_label_encoded

array([4, 0, 3, 2, 1], dtype=int64)

In [63]:
# One-hot encode the integer city codes. OneHotEncoder expects a 2-D input,
# hence reshape(-1, 1); .toarray() turns its result into a dense array that
# pandas can wrap.
#
# Column labels are derived from the data rather than hard-coded: the codes
# come from a LabelEncoder, which numbers labels in sorted order, so the
# sorted distinct cities line up with one-hot columns 0..k-1. The frame also
# reuses df's index so the index-based merge below pairs every row correctly
# even if df is not indexed 0..n-1.
df_encoded = pd.DataFrame(
    preprocessing.OneHotEncoder().fit_transform(df_label_encoded.reshape(-1, 1)).toarray(),
    index=df.index,
    columns=sorted(set(df['city'])),
)
# Attach the indicator columns by row index, then remove the original column.
# NOTE: this consumes 'city', so the cell cannot be re-run without rebuilding df.
df = pd.merge(df, df_encoded, left_index=True, right_index=True)
del df['city']
df

Unnamed: 0,first_name,last_name,age,Baltimore,Boston,Douglas,Miami,San Francisco
0,Jason,Miller,42,0.0,0.0,0.0,0.0,1.0
1,Molly,Jacobson,52,1.0,0.0,0.0,0.0,0.0
2,Tina,Ali,36,0.0,0.0,0.0,1.0,0.0
3,Jake,Milner,24,0.0,0.0,1.0,0.0,0.0
4,Amy,Cooze,73,0.0,1.0,0.0,0.0,0.0
