In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy import stats
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from numpy import mean
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV



In [43]:
# Read the raw data file from the working directory into a DataFrame.

titanic = pd.read_csv(filepath_or_buffer="dataset.csv")

In [44]:
# Data Cleaning

# Drop incomplete rows BEFORE recoding the label. Recoding first would turn a
# missing Y into class 1 (NaN == 1 is False), after which dropna() can no
# longer remove rows whose label is unknown.
titanic.dropna(inplace=True)

# Flip the label encoding: an original 1 becomes 0, any other value becomes 1.
titanic['Y'] = np.where(titanic['Y'] == 1, 0, 1)

# Store column '44' as strings.
titanic['44'] = titanic['44'].astype(str)






In [45]:
# Outlier Removal

# reset_index() gives the frame a fresh 0..n-1 index and keeps the previous
# index as a new leading 'index' column.
titanic = titanic.reset_index()

# Screen every column except the last three.
columns = titanic.columns[:-3]

for col in columns:
    # Quartiles with midpoint interpolation. Series.quantile is used because
    # the `interpolation` keyword of np.percentile is deprecated in NumPy.
    Q1 = titanic[col].quantile(0.25, interpolation='midpoint')
    Q3 = titanic[col].quantile(0.75, interpolation='midpoint')
    IQR = Q3 - Q1

    # A zero IQR collapses both fences onto the quartile, so the inclusive
    # comparisons below would flag every row; such columns are left alone.
    if IQR > 0:
        # Rows on or beyond 3 * IQR from the quartiles are treated as outliers.
        is_outlier = ((titanic[col] >= Q3 + 3 * IQR)
                      | (titanic[col] <= Q1 - 3 * IQR))
        # One boolean filter removes both tails at once; the index is rebuilt
        # so it stays contiguous for the next column.
        titanic = titanic.loc[~is_outlier].reset_index(drop=True)

    print("New Shape: ", titanic.shape)

    


New Shape:  (59959, 46)
New Shape:  (59959, 46)
New Shape:  (59959, 46)
New Shape:  (59959, 46)
New Shape:  (59937, 46)
New Shape:  (59937, 46)
New Shape:  (59937, 46)
New Shape:  (59936, 46)
New Shape:  (59918, 46)
New Shape:  (59918, 46)
New Shape:  (59918, 46)
New Shape:  (59918, 46)
New Shape:  (59918, 46)
New Shape:  (59911, 46)
New Shape:  (59911, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59903, 46)
New Shape:  (59900, 46)
New Shape:  (59899, 46)
New Shape:  (59899, 46)
New Shape:  (59899, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59898, 46)
New Shape:  (59866, 46)
New Shape:  (59866, 46)
New Shape:  (59850, 46)
New Shape:  (59660, 46)
New Shape:  (596

In [46]:
# One-hot encode the two categorical columns ('44' and '45').

sortbin = pd.get_dummies(data=titanic['44'], prefix='WS_SB')
rad_zone = pd.get_dummies(data=titanic['45'], prefix='ZONE')

# Append both indicator blocks to the frame; the joins align on the index.
titanic = titanic.join(sortbin).join(rad_zone)
print(titanic.head(n=10))

   index         1         2         3         4         5         6  \
0      0  3.988833  2.230952  2.234090  2.224613  2.225998  2.216250   
1      1  3.974519  2.226723  2.250789  2.212954  2.213453  2.222252   
2      2  4.108639  2.411715  2.410257  2.399034  2.419996  2.415526   
3      3  4.461258  2.583675  2.572882  2.574881  2.557647  2.589007   
4      4  4.152431  2.408509  2.400121  2.441874  2.428172  2.454793   
5      5  4.202709  2.389834  2.407209  2.394249  2.399547  2.396865   
6      6  4.115625  2.367207  2.366003  2.354787  2.364106  2.384145   
7      7  4.127285  2.396199  2.371090  2.404679  2.382529  2.376507   
8      8  4.035359  2.311777  2.284966  2.336921  2.326167  2.322340   
9      9  4.197079  2.409826  2.412902  2.384033  2.389567  2.422198   

          7         8         9  ...  Y  WS_SB_1001.0  WS_SB_2002.0  \
0  2.212547  2.211953  2.215073  ...  0             1             0   
1  2.207449  2.228175  2.223283  ...  0             1            

In [47]:
# Remove the 'index' column and the two raw categorical columns, one at a
# time and in place.

for encoded_source in ('index', '44', '45'):
    titanic.drop(columns=[encoded_source], inplace=True)

# Target vector (an independent copy) and the feature frame without it.
y = titanic['Y'].copy()
X = titanic.drop(columns=['Y'])




In [48]:
class NormalDataset():
    """Feature matrix / target pair with optional column scaling.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature rows.
    y : sequence or pandas Series
        Targets, one per row of ``X``. Stored unchanged on ``self.y``.
    train : int, default 0
        1 fits the transformer on ``X`` and transforms it; 0 only transforms
        ``X`` with an already fitted transformer.
    scale_data : bool, default True
        When False, ``X`` is used without scaling.
    transformer : object, optional
        Object exposing ``fit_transform`` / ``transform``. When omitted, the
        module-level ``ct`` is used.

    Attributes
    ----------
    X : numpy.ndarray
        The (optionally scaled) features cast to float64.
    y : same type as the ``y`` argument

    Raises
    ------
    ValueError
        If scaling is requested and ``train`` is neither 0 nor 1.
    """

    def __init__(self, X, y, train=0, scale_data=True, transformer=None):
        if scale_data:
            if transformer is None:
                # Looked up at call time, so `ct` only has to exist by the
                # time an instance is created.
                transformer = ct

            if train == 1:
                X_scale = transformer.fit_transform(X)
            elif train == 0:
                X_scale = transformer.transform(X)
            else:
                raise ValueError("train must be 0 or 1, got %r" % (train,))
        else:
            # No scaling requested: fall back to the raw values as an array.
            X_scale = np.asarray(X)

        self.X = X_scale.astype('float64')
        self.y = y

    def __len__(self):
        """Number of rows in the feature matrix."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair at position ``i``."""
        targets = self.y
        # self.X is indexed by position. pandas objects index by *label* with
        # [], which points at a different row once the index is shuffled, so
        # positional access is used whenever it is available.
        if hasattr(targets, 'iloc'):
            return self.X[i], targets.iloc[i]
        return self.X[i], targets[i]

In [49]:
# Class used for tuning threshold values for multiclass models

class proxyModel():
    """Wrap a classifier and rescale its class probabilities by thresholds.

    Each class probability from the wrapped model is divided by that class's
    threshold and every row is then renormalised by its L1 norm. ``predict``
    returns the column index with the largest rescaled probability.

    Parameters
    ----------
    origin_model : object
        Any object exposing ``predict_proba(x)`` that returns a 2-D
        ``(n_samples, n_classes)`` array-like.
    """

    def __init__(self, origin_model):
        self.origin_model = origin_model

    def predict_proba(self, x, threshold_list=None):
        """Return threshold-rescaled, row-normalised probabilities.

        Parameters
        ----------
        x : passed straight through to ``origin_model.predict_proba``.
        threshold_list : scalar or sequence, optional
            Divisor for each class column. A scalar applies to every class.
            Defaults to 1 for every class, which leaves the probabilities
            unchanged apart from the renormalisation.

        Returns
        -------
        numpy.ndarray with the same shape as the wrapped model's output.
        """
        ori_proba = np.asarray(self.origin_model.predict_proba(x))

        if threshold_list is None:
            # Sized from the column count so an empty batch also works.
            threshold_list = np.ones(ori_proba.shape[1])

        # Element-wise divide each class column by its threshold.
        new_proba = np.divide(ori_proba, threshold_list)

        # Row-wise L1 norm; keepdims keeps an (n_samples, 1) shape that
        # broadcasts against new_proba whatever the shape of threshold_list.
        norm = np.linalg.norm(new_proba, ord=1, axis=1, keepdims=True)

        return np.divide(new_proba, norm)

    def predict(self, x, threshold_list=None):
        """Return, per row, the index of the largest rescaled probability."""
        return np.argmax(self.predict_proba(x, threshold_list), axis=1)

In [50]:
# Prepare test and train dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

# Columns to standardise: everything except the one-hot indicator columns.
# They are recognised by the prefixes given to get_dummies instead of by a
# fixed count, so the selection stays correct if the number of categories
# in the data changes.
dummy_prefixes = ('WS_SB_', 'ZONE_')
columns = [name for name in X.columns.tolist()
           if not str(name).startswith(dummy_prefixes)]

# Scale the selected columns and pass the remaining ones through untouched.
sc = StandardScaler()
ct = ColumnTransformer([
                     ('somename', sc, columns)
                     ], remainder='passthrough')

# The scaler is fitted on the training split only (train=1) and merely
# applied to the test split (train defaults to 0).
dataset = NormalDataset(X_train, y_train, 1)
test = NormalDataset(X_test, y_test)



In [51]:
# Balance the training classes with SMOTE.

oversample = SMOTE(random_state=0)
X_os, y_os = oversample.fit_resample(dataset.X, dataset.y)

# Resampled features with the target appended as the final column.
New_sample = np.column_stack((X_os, y_os.values))


In [None]:
# Perform grid search on various hyper-parameters

# Parameter names carry the 'model__' prefix because they address the
# classifier step of the pipeline built below.
grid = {
    'model__n_estimators': [50, 100],
    'model__max_features': ['sqrt', 'log2'],
    'model__max_depth': [6, 8],
    'model__criterion': ['friedman_mse', 'squared_error'],
    'model__random_state': [18],
}
model = GradientBoostingClassifier()

# SMOTE runs inside the pipeline, so each CV split oversamples only its own
# training folds. Searching on data that was oversampled beforehand lets
# synthetic points derived from validation rows into the training folds and
# inflates the ROC-AUC estimate.
search_pipeline = Pipeline([
    ('smote', SMOTE(random_state=0)),
    ('model', model),
])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
rf_cv = GridSearchCV(estimator=search_pipeline, param_grid=grid, scoring='roc_auc', cv=cv)

# Fit on the scaled but NOT resampled training data; the pipeline resamples.
rf_cv.fit(dataset.X, dataset.y)


rf_cv.best_params_


In [18]:
# Redefine the model with the chosen hyper-parameters.

# random_state is fixed because max_features='sqrt' makes the trees consider
# a random subset of features at each split; without a seed, every run of
# this cell fits a different model.
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=10,
                                   max_features='sqrt', random_state=18)
model.fit(X_os, y_os)

In [32]:

# Save the scaled test features.
pd.DataFrame(data=test.X).to_csv('input.csv')

# Label a row 1 when the probability in column 1 of predict_proba exceeds
# 0.02, otherwise 0.
positive_proba = model.predict_proba(test.X)[:, 1]
y_pred = pd.Series(np.where(positive_proba > 0.02, 1, 0))

# Put the true labels on a fresh 0..n-1 index so they line up with y_pred.
y_test = y_test.reset_index(drop=True)

# Side-by-side table of true and predicted labels, written to disk.
z = pd.concat([y_test, y_pred], axis=1, keys=['True', 'Prediction'])
z.to_csv('output.csv')



In [33]:
# Report overall accuracy and the recall of each class (average=None).

accuracy = metrics.accuracy_score(y_true=test.y, y_pred=y_pred)
per_class_recall = metrics.recall_score(y_true=test.y, y_pred=y_pred, average=None)

print("Accuracy:", accuracy)

print("Recall:", per_class_recall)


Recall CCX: [0.08476815 0.97826087]
