In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def missing_percent(df):
    """Return the percentage of null entries in ``df``.

    ``df`` may be a DataFrame (result: one percentage per column) or a
    Series (result: a single number), which is how ``DataFrame.agg`` calls
    it when it is passed in a list of aggregations.
    """
    null_count = df.isnull().sum()
    total_rows = len(df)
    return null_count * 100 / total_rows

def imputer_with_indicator(df):
    """Impute missing values and add one ``<col>_missing`` flag per gappy column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by one
        ``<col>_missing`` column for every column that contained nulls.
        It is rebuilt from the imputer's output array, so the original
        column dtypes are not preserved. Row index and column labels are
        carried over from the input.

    Raises
    ------
    ValueError
        If a column has no observed value at all.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # A column without a single observed value cannot be imputed from its own
    # data; fail early with a readable message instead of a shape mismatch
    # when the result is re-labelled below.
    # NOTE(review): SimpleImputer is documented to drop such columns by
    # default - confirm against the installed scikit-learn version.
    all_missing = [col for col in df.columns if df[col].isnull().all()]
    if all_missing:
        raise ValueError(
            "Cannot impute columns without any observed value: "
            + ", ".join(str(col) for col in all_missing)
        )

    # Materialise the list first: the loop below adds columns to df.
    cols_with_missing = [col for col in df.columns if df[col].isnull().any()]
    # Flag where each value was missing before it gets filled in.
    for col in cols_with_missing:
        df[col + '_missing'] = df[col].isnull()

    # Impute with SimpleImputer's default settings.
    my_imputer = SimpleImputer()
    imputed_values = my_imputer.fit_transform(df.values)

    # Re-attach both the column labels and the row index of the input.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [None]:
# Load the training data.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
data = pd.read_csv(filepath_or_buffer="C:/Users/VineetJ/Downloads/PHY_TRAIN.csv")
# Display the loaded frame as the cell output
data

In [None]:
# Lift the row limit so the summary table below is shown in full.
# The fully qualified key is used on purpose: set_option requires the pattern
# to match exactly one option, and the bare pattern "max_rows" becomes
# ambiguous (OptionError) on pandas versions that also define
# "styler.render.max_rows".
pd.set_option("display.max_rows", None)
# Summary statistics for all variables; transposed so each variable is a row
data.agg([missing_percent, 'mean', 'std', 'min', 'max', 'skew']).T

In [None]:
# Restore the default row limit for displayed frames. The full key resets only
# this option; a bare "max_rows" pattern resets every option whose name
# matches it.
pd.reset_option("display.max_rows")

In [None]:
# Names of the columns that contain at least one missing value
data.isna().any().loc[lambda has_gap: has_gap].index.tolist()

In [None]:
# Impute a copy of the raw data (adding a *_missing flag per gappy column)
# so that `data` itself stays untouched.
data_cpy = imputer_with_indicator(data.copy())

In [None]:
# Columns of the imputed frame that still contain missing values
# (expected to be an empty list if the imputation filled every gap).
data_cpy.isna().any().loc[lambda has_gap: has_gap].index.tolist()

In [None]:
# Descriptive statistics of the imputed frame, including any *_missing
# indicator columns added by imputer_with_indicator.
data_cpy.describe()

In [None]:
# Response variable
y = data_cpy['target']
# Feature matrix: every remaining column of the imputed frame, i.e. all
# original non-target columns plus any *_missing indicator columns.
X = data_cpy.drop(columns='target')

In [None]:
# Finding important features by backward elimination:
# fit an OLS model on the current feature set, drop the feature with the
# largest p-value, and repeat until every remaining p-value is at or below
# the threshold.
# NOTE(review): the fit uses all rows of X and y, i.e. also the rows that the
# later train/test split holds out, and it is an OLS fit on `target` -
# confirm both are intended.
P_VALUE_THRESHOLD = 0.05

cols = list(X.columns)
# Stays empty (but correctly labelled) if every feature gets eliminated.
featureImportance = pd.DataFrame(columns=['feature', 'pValue'])
while cols:
    X_1 = sm.add_constant(X[cols])
    model = sm.OLS(y, X_1).fit()
    # Pick the feature p-values by label rather than by position: if
    # add_constant did not add a 'const' column, slicing off the first entry
    # would discard a real feature and misalign the labels.
    p = model.pvalues.drop('const', errors='ignore')
    # Series.max skips NaN, consistent with idxmax below.
    pmax = p.max()
    # Compare p-value with threshold value to distinguish important features
    if pmax > P_VALUE_THRESHOLD:
        feature_with_p_max = p.idxmax()
        cols.remove(feature_with_p_max)
    else:
        featureImportance = p.rename_axis('feature').reset_index(name='pValue')
        break

# List of important features
selected_features_BE = cols
print(selected_features_BE)
# Most significant features (smallest p-value) first
featureImportance = featureImportance.sort_values('pValue').reset_index(drop=True)
featureImportance

In [None]:
# Bar chart of the p-value of each retained feature
featureImportance.plot.bar(x='feature', y='pValue')

In [None]:
# Feature matrix restricted to the features kept by backward elimination
XL1 = X.loc[:, selected_features_BE]

In [None]:
# Hold out 20% of the rows of the selected-feature matrix, with a fixed seed
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    XL1,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Model 1: logistic regression on the selected features
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
# Show the fitted estimator as the cell output
logreg

In [None]:
# Class predictions on the held-out rows, followed by their accuracy
y_pred = logreg.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Held-out performance of the current predictions (y_pred vs. y_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=y_pred))
# Heading and table on separate lines
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve and AUC of the logistic regression on the held-out rows
# Column 1 is used as the positive-class score
# (assumes logreg.classes_ is ordered [negative, positive] - TODO confirm).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
# Thresholds get a real name instead of `_`, which IPython uses for the last output.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="model 1, auc="+str(auc))
# Label the figure so it can be read without the surrounding code
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve - logistic regression",
)
ax.legend(loc=4)
plt.show()

In [None]:
# Set-up for the interaction search:
# a fresh, unfitted classifier that the search refits for every candidate ...
regression = LogisticRegression(max_iter=200)
# ... the candidate features to combine ...
features = selected_features_BE
# ... and the held-out accuracy of the current logistic regression, which a
# candidate has to beat.
baseline = metrics.accuracy_score(y_test, logreg.predict(X_test))

In [None]:
# Selecting important interaction terms:
# for every unordered pair of selected features, add their product as one
# extra column, refit the logistic regression and keep the pair if the
# held-out accuracy beats the baseline.
# NOTE(review): candidates are compared on X_test / y_test, so the test split
# takes part in feature selection; later scores on the same split are
# optimistic. Consider a separate validation split.
interactions = []
for f_A in features:
    for f_B in features:
        # The string comparison keeps exactly one ordering of each pair and
        # skips a feature paired with itself.
        if f_A <= f_B:
            continue
        # Build augmented copies instead of writing into X_train / X_test, so
        # both frames keep exactly the columns the earlier model was fit on.
        train_with_term = X_train.assign(interaction=X_train[f_A] * X_train[f_B])
        test_with_term = X_test.assign(interaction=X_test[f_A] * X_test[f_B])
        regression.fit(train_with_term, y_train)
        # Local name: the global y_pred of the previous model is left alone.
        candidate_pred = regression.predict(test_with_term)
        acc = metrics.accuracy_score(y_test, candidate_pred)
        # Compare accuracy with baseline value to select important interaction terms
        if acc > baseline:
            interactions.append((f_A, f_B, round(acc, 4)))

In [None]:
# Sort interaction terms by accuracy (ascending: the best pairs are at the bottom).
# Explicit column names keep the table readable and make the sort work even
# when no pair beat the baseline and `interactions` is empty.
d = pd.DataFrame(interactions, columns=['feature_a', 'feature_b', 'accuracy'])
d.sort_values(by='accuracy')

In [None]:
# Extend the selected-feature matrix with four product (interaction) columns.
# The pairs are hard-coded - presumably read off the accuracy ranking of the
# interaction search; verify them after any change to the data or selection.
XL2 = XL1.assign(
    newcol1=XL1['feat75'] * XL1['feat12'],
    newcol2=XL1['feat4'] * XL1['feat12'],
    newcol3=XL1['feat42'] * XL1['feat31'],
    newcol4=XL1['feat66'] * XL1['feat12'],
)

In [None]:
# Hold out 20% of the rows of the matrix with interaction columns, fixed seed
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    XL2,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Model 2: logistic regression on the selected features plus interaction terms
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
# Show the fitted estimator as the cell output
logreg

In [None]:
# Class predictions of the refitted model on the held-out rows, then their accuracy
y_pred = logreg.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Held-out performance of the current predictions (y_pred vs. y_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=y_pred))
# Heading and table on separate lines
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve and AUC of the logistic regression with interaction terms
# Column 1 is used as the positive-class score
# (assumes logreg.classes_ is ordered [negative, positive] - TODO confirm).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
# Thresholds get a real name instead of `_`, which IPython uses for the last output.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="model 2, auc="+str(auc))
# Label the figure so it can be read without the surrounding code
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve - logistic regression with interaction terms",
)
ax.legend(loc=4)
plt.show()

In [None]:
# Hold out 20% of the rows of the full feature matrix, with a fixed seed
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Model 3: random forest on the full feature matrix.
# (The variable is called `regressor`, but it holds a classifier.)
regressor = RandomForestClassifier(n_estimators=500, random_state=123).fit(X_train, y_train)
# Class predictions on the held-out rows, then their accuracy
y_pred = regressor.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Held-out performance of the current predictions (y_pred vs. y_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=y_pred))
# Heading and table on separate lines
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve and AUC of the random forest on the held-out rows
# Column 1 is used as the positive-class score
# (assumes regressor.classes_ is ordered [negative, positive] - TODO confirm).
y_pred_proba = regressor.predict_proba(X_test)[:, 1]
# Thresholds get a real name instead of `_`, which IPython uses for the last output.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="model 3, auc="+str(auc))
# Label the figure so it can be read without the surrounding code
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve - random forest",
)
ax.legend(loc=4)
plt.show()

In [None]:
# Re-create the 20% hold-out split of the full feature matrix
# (same arguments as the split used for the random forest).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Model 4: gradient boosting on the full feature matrix
gb_clf2 = GradientBoostingClassifier(n_estimators=1000, random_state=123).fit(X_train, y_train)
# Class predictions on the held-out rows, then their accuracy
y_pred = gb_clf2.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Held-out performance of the current predictions (y_pred vs. y_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=y_pred))
# Heading and table on separate lines
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

In [None]:
# ROC curve and AUC of the gradient boosting model on the held-out rows
# Column 1 is used as the positive-class score
# (assumes gb_clf2.classes_ is ordered [negative, positive] - TODO confirm).
y_pred_proba = gb_clf2.predict_proba(X_test)[:, 1]
# Thresholds get a real name instead of `_`, which IPython uses for the last output.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="model 4, auc="+str(auc))
# Label the figure so it can be read without the surrounding code
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve - gradient boosting",
)
ax.legend(loc=4)
plt.show()