In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE
import pandas as pd

## NA Filled

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE

class ModelEvaluation:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.scaler = None
        
    def normalize_data(self):
        self.scaler = StandardScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def feature_scaling(self):
        self.scaler = MinMaxScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def remove_outliers(self):
        # Your outlier removal code here (using IQR range)
        num_cols = [col for col in self.X_train.columns if col.startswith('N')]
        for col in num_cols:
            Q1 = self.X_train[col].quantile(0.25)
            Q3 = self.X_train[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            self.X_train = self.X_train[(self.X_train[col] >= lower_bound) & (self.X_train[col] <= upper_bound)]
            self.X_test = self.X_test[(self.X_test[col] >= lower_bound) & (self.X_test[col] <= upper_bound)]
            self.y_train = self.y_train.loc[self.X_train.index]
            self.y_test = self.y_test.loc[self.X_test.index]
    
    def handle_imbalance(self):
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
    
    def build_neural_network(self, input_dim):
        model = Sequential()
        model.add(Dense(128, activation='relu', input_dim=input_dim))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def train_neural_network(self, model):
        model.fit(self.X_train, self.y_train, epochs=20, batch_size=64, validation_split=0.2, verbose=0)
        return model
    
    def evaluate_model(self, model):
        y_pred = model.predict(self.X_test)[:, 0]
        fpr, tpr, thresholds = roc_curve(self.y_test, y_pred)
        auc_score = auc(fpr, tpr)
        return auc_score

    def run_evaluation(self):
        self.handle_imbalance()  # Handle imbalance first
        self.remove_outliers()  # Then remove outliers
        self.normalize_data()   # Normalize the data
        input_dim = self.X_train.shape[1]
        model = self.build_neural_network(input_dim)
        trained_model = self.train_neural_network(model)
        auc_score = self.evaluate_model(trained_model)
        return auc_score



# Usage example:
if __name__ == "__main__":
    # Assuming you have X_train, X_test, y_train, y_test datasets available
    # Perform data preprocessing steps

    df = pd.read_csv('../data_preprocessing/80%_null_drop_rest_filled.csv')
    bool_map = {True : 1, False:0}
    df['C6'] = df['C6'].map(bool_map)
    df['C8'] = df['C8'].map(bool_map)

    X = df.drop(['Unique_ID', 'Dependent_Variable'], axis = 1)
    y = df['Dependent_Variable']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_eval = ModelEvaluation(X_train, X_test, y_train, y_test)
    auc_score = model_eval.run_evaluation()
    print("Neural Network AUC:", auc_score)


Neural Network AUC: 0.6991192699490663


# Imputed NA

In [8]:
df = pd.read_csv('../data_preprocessing/80%_null_drop_rest_impute_rf.csv')
df.isnull().sum()
df.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,N1,N2,...,N20,N21,N22,N23,N24,N33,N34,N35,Dependent_Variable,Unique_ID
0,0,0,3,25,0,0,0,1,23.75,81.34271,...,21.764142,0.857199,0.906422,27.816,1750.0,58.0,113.39,12.0,1,Candidate_5926
1,0,15,12,63,2,0,1,1,11.05,22.0,...,17.0,0.88,1.0,40.0,10833.33333,160.0,262.1,17.0,0,Candidate_48134
2,0,0,11,12,0,0,0,1,29.0,81.34271,...,21.764142,0.857199,0.906422,20.0,6250.0,24.0,50.29,18.0,1,Candidate_51717
3,0,1,8,42,1,0,4,1,17.99,1.0,...,6.0,1.0,0.0,26.0,2413.666667,70.0,126.52,27.0,0,Candidate_26401
4,0,1,5,1,1,1,6,1,27.5,206.0,...,31.0,0.96,0.0,44.0,7666.666667,100.0,205.47,21.0,0,Candidate_34872
