# Handling Imbalanced Data

In [31]:
import seaborn as sns
import pandas as pd
from sklearn.utils import resample

# Fetch the Titanic passenger table bundled with seaborn and keep only the
# rows where both 'age' and 'embarked' are present
df = sns.load_dataset('titanic').dropna(subset=['age', 'embarked'])

# Target is the survival flag; features are class, sex, age and port
y = df['survived']
X = df[['pclass', 'sex', 'age', 'embarked']]

# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each category so the dummy columns are not redundant
X_encoded = pd.get_dummies(X, drop_first=True)

# Put the encoded features and the target side by side in a single frame
df_combined = pd.concat([X_encoded, y], axis=1)

In [32]:
# Preview the one-hot encoded feature matrix (long frames are truncated in the display)
X_encoded

Unnamed: 0,pclass,age,sex_male,embarked_Q,embarked_S
0,3,22.0,True,False,True
1,1,38.0,False,False,False
2,3,26.0,False,False,True
3,1,35.0,False,False,True
4,3,35.0,True,False,True
...,...,...,...,...,...
885,3,39.0,False,True,False
886,2,27.0,True,False,True
887,1,19.0,False,False,True
889,1,26.0,True,False,False


In [33]:
# Split the combined frame by outcome: rows labelled 0 (did not survive) are
# the majority class, rows labelled 1 (survived) are the minority class
df_majority = df_combined.loc[df_combined['survived'].eq(0)]
df_minority = df_combined.loc[df_combined['survived'].eq(1)]

In [34]:
# Non-null count per column for the minority class (survived == 1)
df_minority.count()

pclass        288
age           288
sex_male      288
embarked_Q    288
embarked_S    288
survived      288
dtype: int64

In [35]:
# Non-null count per column for the majority class (survived == 0)
df_majority.count()

pclass        424
age           424
sex_male      424
embarked_Q    424
embarked_S    424
survived      424
dtype: int64

In [36]:
# Oversample the minority class: draw rows with replacement until it has as
# many rows as the majority class (seed fixed so the draw is repeatable)
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

# Non-null count per column after upsampling
df_minority_upsampled.count()

pclass        424
age           424
sex_male      424
embarked_Q    424
embarked_S    424
survived      424
dtype: int64

In [37]:
# Stack the untouched majority rows on top of the upsampled minority rows
df_balanced = pd.concat((df_majority, df_minority_upsampled), axis=0)
df_balanced.count()

pclass        848
age           848
sex_male      848
embarked_Q    848
embarked_S    848
survived      848
dtype: int64

In [39]:
# Downsample majority class: keep a random subset of the majority rows, sized
# to match the minority class. Sampling is done WITHOUT replacement so every
# retained row is distinct; drawing with replacement while shrinking would
# duplicate some rows and throw away more unique data than necessary.
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

# Non-null count per column after downsampling
df_majority_downsampled.count()

pclass        288
age           288
sex_male      288
embarked_Q    288
embarked_S    288
survived      288
dtype: int64

In [40]:
# Downsampled majority rows followed by all minority rows. Note that this
# rebinds df_balanced, replacing the upsampled frame assigned earlier.
df_balanced = pd.concat((df_majority_downsampled, df_minority), axis=0)
df_balanced.count()

pclass        576
age           576
sex_male      576
embarked_Q    576
embarked_S    576
survived      576
dtype: int64

## SMOTE

In [41]:
## SMOTE upsampling
!pip install imbalanced-learn



In [42]:
from collections import Counter

from imblearn.over_sampling import SMOTE

# SMOTE with a fixed seed so the resampled set is repeatable
# NOTE(review): X_encoded contains one-hot / boolean columns; plain SMOTE
# interpolates between neighbours, so SMOTENC may be the better fit for the
# categorical features — confirm before relying on these samples.
smote = SMOTE(random_state=42)

# Fit on the encoded features and target, returning the resampled pair
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Class distribution after resampling
print(Counter(y_resampled))

Counter({0: 424, 1: 424})


## ADASYN

In [47]:
from collections import Counter

from imblearn.over_sampling import ADASYN

# ADASYN with a fixed seed so the resampled set is repeatable
adasyn = ADASYN(random_state=42)

# Fit on the encoded features and target; this rebinds X_resampled and
# y_resampled, replacing the values from the SMOTE cell
X_resampled, y_resampled = adasyn.fit_resample(X_encoded, y)

# Class distribution after resampling. ADASYN decides how many samples to
# synthesise adaptively, so the two classes need not end up exactly equal.
print(Counter(y_resampled))

Counter({0: 424, 1: 384})
