In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
data = pd.read_csv('./penguins.csv')
data.head()

In [None]:
#data preparation for the upcoming methods
data["species"] = data["species"].astype('category')
data["island"] = data["island"].astype('category')
data["sex"] = data["sex"].astype('category')
data.dtypes
categorical_data = data.drop(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', \
			      'body_mass_g'], axis=1)
categorical_data.head()

In [15]:
#Variable Transformation -> Log Transform
log_data = data['body_mass_g']
log_data['body_mass_log'] = (data['body_mass_g']).transform(np.log)
log_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log_data['body_mass_log'] = (data['body_mass_g']).transform(np.log)


0                                                           3750.0
1                                                           3800.0
2                                                           3250.0
3                                                              NaN
4                                                           3450.0
                                       ...                        
340                                                         3400.0
341                                                         3775.0
342                                                         4100.0
343                                                         3775.0
body_mass_log    0      8.229511
1      8.242756
2      8.08641...
Name: body_mass_g, Length: 345, dtype: object

In [8]:
#data preparation for the upcoming methods
categorical_data = data.drop(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', \
			      'body_mass_g'], axis=1)
categorical_data.head()

Unnamed: 0,species,island,sex,year
0,Adelie,Torgersen,male,2007
1,Adelie,Torgersen,female,2007
2,Adelie,Torgersen,female,2007
3,Adelie,Torgersen,,2007
4,Adelie,Torgersen,female,2007


In [9]:
#Label Encoding
categorical_data["species_cat"] = categorical_data["species"].cat.codes
categorical_data["island_cat"] = categorical_data["island"].cat.codes
categorical_data["sex_cat"] = categorical_data["sex"].cat.codes
categorical_data.head()

Unnamed: 0,species,island,sex,year,species_cat,island_cat,sex_cat
0,Adelie,Torgersen,male,2007,0,2,1
1,Adelie,Torgersen,female,2007,0,2,0
2,Adelie,Torgersen,female,2007,0,2,0
3,Adelie,Torgersen,,2007,0,2,-1
4,Adelie,Torgersen,female,2007,0,2,0


In [10]:
#One-hot Encoding
encoded_spicies = pd.get_dummies(categorical_data['species'])
encoded_island = pd.get_dummies(categorical_data['island'])
encoded_sex = pd.get_dummies(categorical_data['sex'])

categorical_data = categorical_data.join(encoded_spicies)
categorical_data = categorical_data.join(encoded_island)
categorical_data = categorical_data.join(encoded_sex)
categorical_data.head()

Unnamed: 0,species,island,sex,year,species_cat,island_cat,sex_cat,Adelie,Chinstrap,Gentoo,Biscoe,Dream,Torgersen,female,male
0,Adelie,Torgersen,male,2007,0,2,1,1,0,0,0,0,1,0,1
1,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0
2,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0
3,Adelie,Torgersen,,2007,0,2,-1,1,0,0,0,0,1,0,0
4,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0


In [11]:
#Count Encoding
species_count = categorical_data['species'].value_counts()
island_count = categorical_data['island'].value_counts()
sex_count = categorical_data['sex'].value_counts()

categorical_data['species_count_enc'] = categorical_data['species'].map(species_count)
categorical_data['island_count_enc'] = categorical_data['island'].map(island_count)
categorical_data['sex_count_enc'] = categorical_data['sex'].map(sex_count)

categorical_data.head()

Unnamed: 0,species,island,sex,year,species_cat,island_cat,sex_cat,Adelie,Chinstrap,Gentoo,Biscoe,Dream,Torgersen,female,male,species_count_enc,island_count_enc,sex_count_enc
0,Adelie,Torgersen,male,2007,0,2,1,1,0,0,0,0,1,0,1,152,52,168.0
1,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165.0
2,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165.0
3,Adelie,Torgersen,,2007,0,2,-1,1,0,0,0,0,1,0,0,152,52,
4,Adelie,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165.0


In [12]:
#Start Target Encoding
categorical_data["species"] = categorical_data["species"].cat.codes
island_means = categorical_data.groupby('island')['species'].mean()
sex_means = categorical_data.groupby('sex')['species'].mean()

In [13]:
island_means
sex_means

sex
female    0.909091
male      0.928571
Name: species, dtype: float64

In [14]:
categorical_data['island_target_enc'] = categorical_data['island'].map(island_means)
categorical_data['sex_target_enc'] = categorical_data['sex'].map(sex_means)
categorical_data
#end of Target Encoding

Unnamed: 0,species,island,sex,year,species_cat,island_cat,sex_cat,Adelie,Chinstrap,Gentoo,Biscoe,Dream,Torgersen,female,male,species_count_enc,island_count_enc,sex_count_enc,island_target_enc,sex_target_enc
0,0,Torgersen,male,2007,0,2,1,1,0,0,0,0,1,0,1,152,52,168,0.000000,0.928571
1,0,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165,0.000000,0.909091
2,0,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165,0.000000,0.909091
3,0,Torgersen,,2007,0,2,-1,1,0,0,0,0,1,0,0,152,52,,0.000000,
4,0,Torgersen,female,2007,0,2,0,1,0,0,0,0,1,1,0,152,52,165,0.000000,0.909091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,1,Dream,male,2009,1,1,1,0,1,0,0,1,0,0,1,68,124,168,0.548387,0.928571
340,1,Dream,female,2009,1,1,0,0,1,0,0,1,0,1,0,68,124,165,0.548387,0.909091
341,1,Dream,male,2009,1,1,1,0,1,0,0,1,0,0,1,68,124,168,0.548387,0.928571
342,1,Dream,male,2009,1,1,1,0,1,0,0,1,0,0,1,68,124,168,0.548387,0.928571
