In [62]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from rfgap import RFGAP
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from rfoversample import RFOversampler

import sys
sys.path.append("SupportFunctions")

from prepare_datasets import DatasetPreprocessor  
from imbalancer import ImbalanceHandler

from imblearn.over_sampling import SMOTE


In [63]:
# Names of the categorical Titanic columns; reused below for one-hot encoding
# and passed to the oversampler as `cat_cols`.
titanic_cat_cols = [
    'Pclass',
    'Sex',
    'Embarked',
]

# Read the Titanic dataset from the repository-relative datasets folder.
titanic = pd.read_csv('./datasets/titanic.csv')

In [64]:
# One-hot encode the categorical Titanic columns into integer indicator columns.
encoded_titanic = pd.get_dummies(
    titanic,
    columns=titanic_cat_cols,
    dtype=int,
)

# Split the encoded frame with the project's DatasetPreprocessor (per the
# original author's note it is built on sklearn's train/test split) and pull
# the four resulting pieces off the instance.
encoded_preprocessor = DatasetPreprocessor(encoded_titanic, target_column='Survived')
x_train1 = encoded_preprocessor.x_train
y_train1 = encoded_preprocessor.y_train
x_test1 = encoded_preprocessor.x_test
y_test1 = encoded_preprocessor.y_test

# Introduce class imbalance into the training split only (imbalance_ratio=0.1).
# NOTE(review): the original comment called this a 10:1 ratio, but the recorded
# output reports {0: 80, 1: 20}; confirm what imbalance_ratio actually means.
encoded_Imbalancer = ImbalanceHandler(x_train1, y_train1, imbalance_ratio=0.1)
x_train_imbal1, y_train_imbal1 = encoded_Imbalancer.introduce_imbalance()

# Run RFOversampler on the imbalanced training data, flagged as already encoded.
encoded_TitanicOversampler = RFOversampler(
    x_train_fold=x_train_imbal1,
    y_train_fold=y_train_imbal1,
    contains_categoricals=True,
    encoded=True,
    cat_cols=titanic_cat_cols,
)
RF_upsampled_x_train1, RF_upsampled_y_train1 = encoded_TitanicOversampler.fit()

# Show the oversampled training features as the cell output.
RF_upsampled_x_train1

Original class distribution: {0: 302, 1: 196}
Imbalanced class distribution: {0: 80, 1: 20}


Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,18.000000,0.000000,0.000000,7.795800,0,0,1,0,1,0,0,1
1,26.000000,1.000000,0.000000,7.854200,0,0,1,0,1,0,0,1
2,10.000000,3.000000,2.000000,27.900000,0,0,1,0,1,0,0,1
3,21.000000,1.000000,0.000000,9.825000,0,0,1,1,0,0,0,1
4,40.000000,0.000000,0.000000,15.750000,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
155,15.000000,0.666667,0.333333,19.090267,0,1,0,1,0,0,0,1
156,32.666667,0.333333,0.000000,31.484733,1,0,0,1,0,1,0,0
157,28.666667,0.333333,0.000000,33.705567,1,0,0,1,0,0,0,1
158,24.333333,0.000000,0.000000,15.408333,0,1,0,1,0,0,0,1


In [65]:
# Same pipeline as the encoded run, but fed the Titanic frame without one-hot
# encoding so the oversampler receives the categorical columns as-is.
preprocessor = DatasetPreprocessor(titanic, target_column='Survived')
x_train2 = preprocessor.x_train
y_train2 = preprocessor.y_train
x_test2 = preprocessor.x_test
y_test2 = preprocessor.y_test

# Introduce class imbalance into the training split only (imbalance_ratio=0.1).
# NOTE(review): the original comment called this a 10:1 ratio, but the recorded
# output reports {0: 80, 1: 20}; confirm what imbalance_ratio actually means.
Imbalancer = ImbalanceHandler(x_train2, y_train2, imbalance_ratio=0.1)
x_train_imbal2, y_train_imbal2 = Imbalancer.introduce_imbalance()

# Run RFOversampler on the imbalanced training data; encoded=False is the only
# argument that differs from the one-hot-encoded run.
TitanicOversampler = RFOversampler(
    x_train_fold=x_train_imbal2,
    y_train_fold=y_train_imbal2,
    contains_categoricals=True,
    encoded=False,
    cat_cols=titanic_cat_cols,
)
RF_upsampled_x_train2, RF_upsampled_y_train2 = TitanicOversampler.fit()

# Show the oversampled training features as the cell output.
RF_upsampled_x_train2

Original class distribution: {0: 302, 1: 196}
Imbalanced class distribution: {0: 80, 1: 20}


Unnamed: 0,Age,SibSp,Parch,Fare,Pclass,Sex,Embarked
0,18.000000,0.000000,0.000000,7.795800,3,1,2
1,26.000000,1.000000,0.000000,7.854200,3,1,2
2,10.000000,3.000000,2.000000,27.900000,3,1,2
3,21.000000,1.000000,0.000000,9.825000,3,0,2
4,40.000000,0.000000,0.000000,15.750000,2,0,2
...,...,...,...,...,...,...,...
155,33.000000,0.000000,0.000000,17.516667,2,0,2
156,24.000000,1.000000,0.333333,39.809733,1,0,2
157,26.000000,0.000000,0.000000,7.406967,3,1,2
158,23.333333,0.333333,0.000000,36.516667,1,0,2


In [61]:
%reset

Nothing done.
