In [36]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [6]:
# import dataset
df = sns.load_dataset('planets')
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [25]:
# check the categorical columns
df.dtypes[df.dtypes == 'object']

method    object
dtype: object

In [33]:
# get method column and show the frequencies of the unique values
method = df[['method']].values
(unique, counts) = np.unique(method, return_counts=True)
frequencies = np.asarray((unique, counts)).T
frequencies

array([['Astrometry', 2],
       ['Eclipse Timing Variations', 9],
       ['Imaging', 38],
       ['Microlensing', 23],
       ['Orbital Brightness Modulation', 3],
       ['Pulsar Timing', 5],
       ['Pulsation Timing Variations', 1],
       ['Radial Velocity', 553],
       ['Transit', 397],
       ['Transit Timing Variations', 4]], dtype=object)

# Label Encoder

In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [40]:
new_method = le.fit_transform(method)
new_method

array([7, 7, 7, ..., 8, 8, 8])

In [41]:
df['method'] = new_method

In [45]:
df['method'].value_counts()

7    553
8    397
2     38
3     23
1      9
5      5
9      4
4      3
0      2
6      1
Name: method, dtype: int64

In [46]:
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,7,1,269.3,7.1,77.4,2006
1,7,1,874.774,2.21,56.95,2008
2,7,1,763.0,2.6,19.84,2011
3,7,1,326.03,19.4,110.62,2007
4,7,1,516.22,10.5,119.47,2009


# One-Hot Encoder

In [47]:
# import dataset again
df = sns.load_dataset('planets')
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [74]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categories='auto')

In [65]:
method = df['method'].values.reshape((-1,1))
method

array([['Radial Velocity'],
       ['Radial Velocity'],
       ['Radial Velocity'],
       ...,
       ['Transit'],
       ['Transit'],
       ['Transit']], dtype=object)

In [75]:
method_arr = ohe.fit_transform(method).toarray()
method_arr

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [77]:
method_labels = ohe.categories_

In [79]:
new_method = pd.DataFrame(method_arr, columns=method_labels)
new_method

Unnamed: 0,Astrometry,Eclipse Timing Variations,Imaging,Microlensing,Orbital Brightness Modulation,Pulsar Timing,Pulsation Timing Variations,Radial Velocity,Transit,Transit Timing Variations
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [82]:
df = pd.concat([new_method, df.drop(['method'], axis=1)], axis=1)
df.head()

Unnamed: 0,"(Astrometry,)","(Eclipse Timing Variations,)","(Imaging,)","(Microlensing,)","(Orbital Brightness Modulation,)","(Pulsar Timing,)","(Pulsation Timing Variations,)","(Radial Velocity,)","(Transit,)","(Transit Timing Variations,)",number,orbital_period,mass,distance,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,269.3,7.1,77.4,2006
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,874.774,2.21,56.95,2008
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,763.0,2.6,19.84,2011
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,326.03,19.4,110.62,2007
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,516.22,10.5,119.47,2009
