In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
warnings.filterwarnings('ignore')

 # Ordinal Encoding - Label Encoding
Ordinal encoding converts categorical data into integer codes ranging from 0 to N-1, where N is the number of unique categories. This method assumes an order or hierarchy among the categories. It's suitable for ordinal data, where the categories have a natural ordered relationship (e.g., ratings like "poor", "good", "excellent").


# One-Hot Encoding
One-hot encoding converts each category value into a new column and assigns a 1 or 0 (True/False) value to the column. This method is suitable for nominal data, where no ordinal relationship exists between the categories. It eliminates the model's assumption about the ordering of categories but increases the data dimensionality.

# Dummy Variable

The dummy variable trap is a scenario in which the independent variables (features) in a regression model are highly correlated.

Methods to Avoid the Dummy Variable Trap

Drop one level: For a categorical variable with 
N categories, use N−1 dummy variables. The dropped category acts as the "reference" or "baseline" category. This approach is automatically handled by setting drop_first=True in pd.get_dummies() in pandas or specifying similar options in other preprocessing tools.

Use regularization: Techniques like Ridge (L2) and Lasso (L1) regression can help manage multicollinearity by penalizing the size of coefficients and, in the case of Lasso, potentially reducing some coefficients to zero.

In [3]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Prepare data
data = titanic[['embarked', 'sex']].dropna()

# Ordinal Encoding for 'Embarked'
ordinal_encoder = OrdinalEncoder()
data['embarked_ordinal'] = ordinal_encoder.fit_transform(data[['embarked']])

# One-Hot Encoding for 'Sex'
onehot_encoder = OneHotEncoder(sparse=False)
data_onehot = onehot_encoder.fit_transform(data[['sex']])
data[['sex_male', 'sex_female']] = data_onehot

print(data.head(10))

  embarked     sex  embarked_ordinal  sex_male  sex_female
0        S    male               2.0       0.0         1.0
1        C  female               0.0       1.0         0.0
2        S  female               2.0       1.0         0.0
3        S  female               2.0       1.0         0.0
4        S    male               2.0       0.0         1.0
5        Q    male               1.0       0.0         1.0
6        S    male               2.0       0.0         1.0
7        S    male               2.0       0.0         1.0
8        S  female               2.0       1.0         0.0
9        C  female               0.0       1.0         0.0


#### Dropping 1 column

In [4]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load Titanic dataset from seaborn
titanic = sns.load_dataset('titanic')

# Prepare data focusing on the 'Sex' column
data = titanic[['sex']].dropna()

# Apply One-Hot Encoding to 'Sex' with dropping the first category to avoid dummy variable trap
onehot_encoder = OneHotEncoder(drop='first', sparse=False)
data_onehot = onehot_encoder.fit_transform(data[['sex']])

# Convert the array returned by OneHotEncoder into a DataFrame
columns = onehot_encoder.get_feature_names_out(['sex'])
data_encoded = pd.DataFrame(data_onehot, columns=columns)

# If the original data index is not continuous, reset index of the original data to align with the encoded DataFrame
data.reset_index(drop=True, inplace=True)

# Concatenate the encoded DataFrame with the original data (optional, for demonstration)
data_concatenated = pd.concat([data, data_encoded], axis=1)

print(data_concatenated.head())

      sex  sex_male
0    male       1.0
1  female       0.0
2  female       0.0
3  female       0.0
4    male       1.0


In [5]:
import seaborn as sns
import pandas as pd

# Load Titanic dataset from seaborn
titanic = sns.load_dataset('titanic')

# Selecting a subset of columns for demonstration
data = titanic[['sex', 'embarked', 'class']].dropna()

# Applying get_dummies to 'sex', 'embarked', and 'class' columns with drop_first=True
data_encoded = pd.get_dummies(data, columns=['sex', 'embarked', 'class'], drop_first=True)

print(data_encoded.head())

   sex_male  embarked_Q  embarked_S  class_Second  class_Third
0      True       False        True         False         True
1     False       False       False         False        False
2     False       False        True         False         True
3     False       False        True         False        False
4      True       False        True         False         True
