# Converting qualitative to quatitative

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB



## using LabelEncoder

- LabelEncoder uses interal numeric number to represent the unique categories in the feature

In [4]:
df['sex'].value_counts()

sex
male      843
female    466
Name: count, dtype: int64

In [5]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

# create encoder object
encoder = LabelEncoder()

# fit the data into the encoder
# this line wont change the data
encoder.fit(df['sex'])

In [7]:
# find the classes or labels or unique values from the feature
encoder.classes_

array(['female', 'male'], dtype=object)

In [10]:
# change the data type to numeric
df['sex'] = encoder.transform(df['sex'])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   int64  
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(5), object(6)
memory usage: 143.3+ KB


In [None]:
f

In [13]:
encoder_embarked = LabelEncoder()
df['embarked'] = encoder_embarked.fit_transform(df['embarked'])

In [15]:
encoder_embarked.classes_

array(['C', 'Q', 'S', nan], dtype=object)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   int64  
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1309 non-null   int64  
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(6), object(5)
memory usage: 143.3+ KB


In [16]:
df['embarked'].mode()

0    2
Name: embarked, dtype: int64

## using One Hot Encoder

- the row will contain only one place with value 1 and rest of the columns with value 0
- e.g. male and female
    - name    gender age
    - person1 male   20
    - person2 female 45
    
    - after one hot encoding
    
    - name    gender_male  gender_female  age
    - person1 1            0              20
    - person2 0            1              45

In [17]:
df = pd.DataFrame([
    {"name": "person1", "gender": "male", "age": 20},
    {"name": "person2", "gender": "female", "age": 23},
    {"name": "person3", "gender": "male", "age": 25},
    {"name": "person4", "gender": "male", "age": 22},
    {"name": "person5", "gender": "female", "age": 26}
])
df

Unnamed: 0,name,gender,age
0,person1,male,20
1,person2,female,23
2,person3,male,25
3,person4,male,22
4,person5,female,26


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    5 non-null      object
 1   gender  5 non-null      object
 2   age     5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [22]:
# create the encoder
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

# create a list of columns which need to be converted to numeric
columns = ['gender']

# create a strategy transform the columns using column transformer
transformer = ColumnTransformer(
    [("OneHotEndoder", one_hot_encoder, columns)], 
    remainder='passthrough')

# tranforms the columns
transformer.fit_transform(df)

array([[0.0, 1.0, 'person1', 20],
       [1.0, 0.0, 'person2', 23],
       [0.0, 1.0, 'person3', 25],
       [0.0, 1.0, 'person4', 22],
       [1.0, 0.0, 'person5', 26]], dtype=object)