In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rfgap import RFGAP
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from rfoversample import RFOversampler

import sys
sys.path.append("SupportFunctions")

from prepare_datasets import DatasetPreprocessor  
from imbalancer import ImbalanceHandler

from imblearn.over_sampling import SMOTE


In [3]:
# Read the Titanic passenger table from the local datasets folder.
titanic = pd.read_csv(filepath_or_buffer='./datasets/titanic.csv')

# Feature columns handed to the oversampler (and to get_dummies) as the
# categorical ones.
titanic_cat_cols = [
    'Pclass',
    'Sex',
    'Embarked',
]

In [4]:
# Feature matrix: every column except the 'Survived' target.
x = titanic.drop('Survived', axis=1)
# Target vector: the 'Survived' column on its own.
y = titanic['Survived']

# One-hot encoded copy of the features; each categorical column is expanded
# into integer (0/1) indicator columns, the remaining columns pass through.
x_encoded = pd.get_dummies(
    data=x,
    columns=titanic_cat_cols,
    dtype=int,
)


In [5]:
# Oversample using the raw (non-encoded) categorical columns: the sampler is
# told the frame contains categoricals, that they are not yet encoded, and
# which columns they are.
original_cat_Oversampler = RFOversampler(
    x_train=x,
    y_train=y,
    contains_categoricals=True,
    encoded=False,
    cat_cols=titanic_cat_cols,
)

# fit() hands back a (features, labels) pair, unpacked here.
(x_new_cat, y_new_cat) = original_cat_Oversampler.fit()

In [6]:
# Show the feature frame as passed to the oversampler, for comparison with
# the frame the oversampler returns.
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
707,3,female,39.0,0,5,29.1250,Q
708,2,male,27.0,0,0,13.0000,S
709,1,female,19.0,0,0,30.0000,S
710,1,male,26.0,0,0,30.0000,C


In [7]:
# Show the feature frame returned by the oversampler that was given the raw
# (non-encoded) categorical columns.
# NOTE(review): rows beyond the original length appear to be the synthesized
# samples — confirm against RFOversampler.
x_new_cat

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.000000,1.000000,0.000000,7.250000,S
1,1,female,38.000000,1.000000,0.000000,71.283300,C
2,3,female,26.000000,0.000000,0.000000,7.925000,S
3,1,female,35.000000,1.000000,0.000000,53.100000,S
4,3,male,35.000000,0.000000,0.000000,8.050000,S
...,...,...,...,...,...,...,...
843,1,female,40.000000,0.666667,0.000000,27.651400,S
844,1,male,35.333333,0.000000,0.000000,21.200000,S
845,1,female,35.333333,0.333333,0.333333,40.477767,S
846,2,female,29.000000,0.333333,0.666667,24.750000,S


In [8]:
# Oversample using the one-hot encoded features: same target and categorical
# column list as the non-encoded run, but flagged as already encoded.
encoded_cat_Oversampler = RFOversampler(
    x_train=x_encoded,
    y_train=y,
    contains_categoricals=True,
    encoded=True,
    cat_cols=titanic_cat_cols,
)

# fit() hands back a (features, labels) pair, unpacked here.
(x_new_encoded, y_new_encoded) = encoded_cat_Oversampler.fit()

In [10]:
# Show the one-hot encoded feature frame as passed to the oversampler.
x_encoded

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.2500,0,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,26.0,0,0,7.9250,0,0,1,1,0,0,0,1
3,35.0,1,0,53.1000,1,0,0,1,0,0,0,1
4,35.0,0,0,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
707,39.0,0,5,29.1250,0,0,1,1,0,0,1,0
708,27.0,0,0,13.0000,0,1,0,0,1,0,0,1
709,19.0,0,0,30.0000,1,0,0,1,0,0,0,1
710,26.0,0,0,30.0000,1,0,0,0,1,1,0,0


In [11]:
# Show the feature frame returned by the oversampler that was given the
# one-hot encoded features.
# NOTE(review): rows beyond the original length appear to be the synthesized
# samples — confirm against RFOversampler.
x_new_encoded

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.000000,1.000000,0.000000,7.250000,0,0,1,0,1,0,0,1
1,38.000000,1.000000,0.000000,71.283300,1,0,0,1,0,1,0,0
2,26.000000,0.000000,0.000000,7.925000,0,0,1,1,0,0,0,1
3,35.000000,1.000000,0.000000,53.100000,1,0,0,1,0,0,0,1
4,35.000000,0.000000,0.000000,8.050000,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
843,12.806667,0.000000,0.333333,8.000000,0,0,1,1,0,1,0,0
844,21.000000,0.666667,1.000000,45.951400,1,0,0,1,0,0,0,1
845,27.666667,1.333333,1.333333,24.012500,0,1,0,1,0,0,0,1
846,33.333333,0.000000,0.333333,89.306933,1,0,0,1,0,1,0,0


In [1]:
%reset