In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
import numpy as np
import pandas as pd

# NA Filled data

In [16]:
class ModelEvaluation:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.scaler = None
        
    def normalize_data(self):
        self.scaler = StandardScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def feature_scaling(self):
        self.scaler = MinMaxScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def remove_outliers(self):
        # Function to remove outliers using IQR
        def remove_outliers_iqr(df):
            numerical_cols = df.select_dtypes(include=['number']).columns
            parts = []
            limit = 1000  # Limit of output rows
            for col in df.columns:
                if col.startswith('N'):
                    q1 = df[col].quantile(0.25)
                    q3 = df[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    part = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
                    if len(part) > limit:
                        parts.append(part[:limit])
                    else:
                        parts.append(part)
            return pd.concat(parts)

        self.X_train = remove_outliers_iqr(self.X_train)
        self.y_train = self.y_train.loc[self.X_train.index]
    
    def handle_imbalance(self):
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
    
    def logistic_regression(self):
        model = LogisticRegression(max_iter=1000)
        params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
        grid_search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
        grid_search.fit(self.X_train, self.y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict_proba(self.X_test)[:, 1]
        auc_score = roc_auc_score(self.y_test, y_pred)
        return best_model, auc_score
    
    def svm(self):
        model = SVC(probability=True)
        params = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
        grid_search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
        grid_search.fit(self.X_train, self.y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict_proba(self.X_test)[:, 1]
        auc_score = roc_auc_score(self.y_test, y_pred)
        return best_model, auc_score

# Usage example:
if __name__ == "__main__":
    # Assuming you have X_train, X_test, y_train, y_test datasets available
    # Perform data preprocessing steps

    df = pd.read_csv('80%_null_drop_rest_filled.csv')
    bool_map = {True : 1, False:0}
    df['C6'] = df['C6'].map(bool_map)
    df['C8'] = df['C8'].map(bool_map)

    X = df.drop(['Unique_ID', 'Dependent_Variable'], axis = 1)
    y = df['Dependent_Variable']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_eval = ModelEvaluation(X_train, X_test, y_train, y_test)
    model_eval.remove_outliers()
    model_eval.handle_imbalance()
    model_eval.normalize_data()  

    # Logistic Regression
    logistic_model, logistic_auc = model_eval.logistic_regression()
    print("Logistic Regression AUC:", logistic_auc)
    print("Best parameters for Logistic Regression:", logistic_model.get_params())

    # SVM
    svm_model, svm_auc = model_eval.svm()
    print("SVM AUC:", svm_auc)
    print("Best parameters for SVM:", svm_model.get_params())


Logistic Regression AUC: 0.7234614169571523
Best parameters for Logistic Regression: {'C': 100, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


# Imputed NA

In [None]:
class ModelEvaluation:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.scaler = None
        
    def normalize_data(self):
        self.scaler = StandardScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def feature_scaling(self):
        self.scaler = MinMaxScaler().fit(self.X_train)
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
    
    def remove_outliers(self):
        # Function to remove outliers using IQR
        def remove_outliers_iqr(df):
            numerical_cols = df.select_dtypes(include=['number']).columns
            parts = []
            limit = 1000  # Limit of output rows
            for col in df.columns:
                if col.startswith('N'):
                    q1 = df[col].quantile(0.25)
                    q3 = df[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    part = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
                    if len(part) > limit:
                        parts.append(part[:limit])
                    else:
                        parts.append(part)
            return pd.concat(parts)

        self.X_train = remove_outliers_iqr(self.X_train)
        self.y_train = self.y_train.loc[self.X_train.index]
    
    def handle_imbalance(self):
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
    
    def logistic_regression(self):
        model = LogisticRegression(max_iter=1000)
        params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
        grid_search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
        grid_search.fit(self.X_train, self.y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict_proba(self.X_test)[:, 1]
        auc_score = roc_auc_score(self.y_test, y_pred)
        return best_model, auc_score
    
    def svm(self):
        model = SVC(probability=True)
        params = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
        grid_search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
        grid_search.fit(self.X_train, self.y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict_proba(self.X_test)[:, 1]
        auc_score = roc_auc_score(self.y_test, y_pred)
        return best_model, auc_score

# Usage example:
if __name__ == "__main__":
    # Assuming you have X_train, X_test, y_train, y_test datasets available
    # Perform data preprocessing steps

    df = pd.read_csv('80%_null_drop_rest_impute_rf.csv')
    bool_map = {True : 1, False:0}
    df['C6'] = df['C6'].map(bool_map)
    df['C8'] = df['C8'].map(bool_map)

    X = df.drop(['Unique_ID', 'Dependent_Variable'], axis = 1)
    y = df['Dependent_Variable']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_eval = ModelEvaluation(X_train, X_test, y_train, y_test)
    model_eval.remove_outliers()
    model_eval.handle_imbalance()
    model_eval.normalize_data()  

    # Logistic Regression
    logistic_model, logistic_auc = model_eval.logistic_regression()
    print("Logistic Regression AUC:", logistic_auc)
    print("Best parameters for Logistic Regression:", logistic_model.get_params())

    # SVM
    svm_model, svm_auc = model_eval.svm()
    print("SVM AUC:", svm_auc)
    print("Best parameters for SVM:", svm_model.get_params())
