# Encoding

In [18]:
import pandas as pd 

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Load Data

In [9]:
df = pd.read_csv("./data/immun.csv")

df.head()


Unnamed: 0,age,sickness,city,immun
0,old,yes,Aachen,low
1,old,no,Aachen,low
2,old,no,Aachen,low
3,mid,yes,Berlin,low
4,mid,yes,Berlin,low


In [10]:

df_inputs = df.drop("immun", axis = "columns")
df_inputs.head()

# Output
target = df[["immun"]]
target.head()

Unnamed: 0,immun
0,low
1,low
2,low
3,low
4,low


# Label Encoder

In [11]:
le_age = LabelEncoder()
le_sickness = LabelEncoder()
le_city = LabelEncoder()
le_immun = LabelEncoder()


df_inputs["age"]  =  le_age.fit_transform(df_inputs["age"])
df_inputs["sickness"]  =  le_sickness.fit_transform(df_inputs["sickness"])
df_inputs["city"]  =  le_city.fit_transform(df_inputs["city"])

target["immun"]  =  le_immun.fit_transform(target["immun"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target["immun"]  =  le_immun.fit_transform(target["immun"])


In [8]:
df_inputs.head()

Unnamed: 0,age,sickness,city
0,2,1,0
1,2,0,0
2,2,0,0
3,0,1,1
4,0,1,1


# Dummy Variables

In [13]:
df = pd.read_csv("./data/immun.csv")

df.head()


Unnamed: 0,age,sickness,city,immun
0,old,yes,Aachen,low
1,old,no,Aachen,low
2,old,no,Aachen,low
3,mid,yes,Berlin,low
4,mid,yes,Berlin,low


In [15]:
dummies = pd.get_dummies(df["city"], dtype = int)

dummies

Unnamed: 0,Aachen,Berlin,Frankfurt
0,1,0,0
1,1,0,0
2,1,0,0
3,0,1,0
4,0,1,0
5,0,1,0
6,0,0,1
7,0,0,1
8,0,0,1
9,0,0,1


In [17]:
df_dummy = pd.concat([df, dummies], axis =  "columns")

df_dummy

Unnamed: 0,age,sickness,city,immun,Aachen,Berlin,Frankfurt
0,old,yes,Aachen,low,1,0,0
1,old,no,Aachen,low,1,0,0
2,old,no,Aachen,low,1,0,0
3,mid,yes,Berlin,low,0,1,0
4,mid,yes,Berlin,low,0,1,0
5,mid,no,Berlin,high,0,1,0
6,mid,no,Frankfurt,high,0,0,1
7,new,yes,Frankfurt,high,0,0,1
8,new,no,Frankfurt,high,0,0,1
9,new,no,Frankfurt,high,0,0,1


# Trapping Dummy variable

- deletes the last feature column , to save the feature count

In [None]:
df_dummy.drop(["Frakfurt"])

# One-Hot Encoding
      
- 0 :  1 0 0 0 0 0 0 0 0 0 .. 0
- 1 :  0 1 0 0 0 0 0 .........0
- 2 :  0 0 1 0 0 0 0 .........0
- 3 : 
- ...
- 9 : 

In [20]:
encoder = OneHotEncoder(sparse_output= False)

X = df[["city"]]
encoder.fit(X) 

X_encoded = encoder.transform(X)

X_encoded

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [23]:
encoder.get_feature_names_out()

array(['city_Aachen', 'city_Berlin', 'city_Frankfurt'], dtype=object)

In [26]:
df_one_hot_encoding  = pd.DataFrame(X_encoded, columns= list(encoder.get_feature_names_out()))

df_one_hot_encoding.head()


df_final = pd.concat([df, df_one_hot_encoding], axis = "columns")
df_final

Unnamed: 0,age,sickness,city,immun,city_Aachen,city_Berlin,city_Frankfurt
0,old,yes,Aachen,low,1.0,0.0,0.0
1,old,no,Aachen,low,1.0,0.0,0.0
2,old,no,Aachen,low,1.0,0.0,0.0
3,mid,yes,Berlin,low,0.0,1.0,0.0
4,mid,yes,Berlin,low,0.0,1.0,0.0
5,mid,no,Berlin,high,0.0,1.0,0.0
6,mid,no,Frankfurt,high,0.0,0.0,1.0
7,new,yes,Frankfurt,high,0.0,0.0,1.0
8,new,no,Frankfurt,high,0.0,0.0,1.0
9,new,no,Frankfurt,high,0.0,0.0,1.0
