# End to End Linear Regression 

## Install required libraries

In [None]:
pip install -r requirements.txt

## Load libraries

In [4]:
import scipy.stats as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.stats import chi2
import ipywidgets as widgets
from matplotlib.gridspec import GridSpec
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.feature_selection import SelectFromModel

import warnings
warnings.filterwarnings('ignore') 

## Exploratory Data Analysis

### Set Display Max

In [5]:
# Lift the row limit so that displayed DataFrames/Series are never truncated.
pd.set_option('display.max_rows', None)

### Import Data

In [37]:
# Load the dataset from the Excel workbook.
original_data = pd.read_excel("Healthcare_cat_dataset.xlsx")
# Work on a copy so that `original_data` keeps the values exactly as loaded.
data = original_data.copy()

### Data Description

In [38]:
# Print each column's name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   group               1177 non-null   int64  
 1   ID                  1177 non-null   int64  
 2   outcome             1176 non-null   float64
 3   age                 1177 non-null   int64  
 4   gender              1177 non-null   int64  
 5   BMI_cat             1177 non-null   int64  
 6   hypertensive        1177 non-null   int64  
 7   atrialfibrillation  1177 non-null   int64  
 8   CHD with no MI      1177 non-null   int64  
 9   diabetes            1177 non-null   int64  
 10  deficiencyanemias   1177 non-null   int64  
 11  depression          1177 non-null   int64  
 12  Hyperlipemia        1177 non-null   int64  
 13  Renal failure       1177 non-null   int64  
 14  COPD                1177 non-null   int64  
 15  heart rate at       1177 non-null   int64  
 16  Pulse 

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

Unnamed: 0,group,ID,outcome,age,gender,BMI_cat,hypertensive,atrialfibrillation,CHD with no MI,diabetes,...,cal_cat,chloride_cat,anion_cat,Mag_cat,ph_cat,Biccarbon_cat,metcat,lactic_cat,pco2_cat,ef_cat
count,1177.0,1177.0,1176.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,...,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0,1177.0
mean,1.299065,150778.120646,0.135204,74.055225,1.525064,0.4452,0.717927,0.451147,0.085811,0.42141,...,0.486831,0.651657,0.80034,0.672048,0.73407,0.53271,0.029737,0.213254,0.283772,0.570093
std,0.458043,29034.669513,0.342087,13.434061,0.499584,0.497199,0.4502,0.497819,0.280204,0.493995,...,0.500039,0.476648,0.399915,0.469667,0.442015,0.499141,0.169932,0.40978,0.451019,0.495273
min,1.0,100213.0,0.0,19.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,125603.0,0.0,65.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,151901.0,0.0,77.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
75%,2.0,176048.0,0.0,85.0,2.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,2.0,199952.0,1.0,99.0,2.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Constants

In [40]:
# Name of the target column.
dependent_variable = 'outcome'
# Column of the hypothesis-test result table that holds the p-values used for sorting.
hypothesis_testing_metric = 'p_Value_Chi'
# NOTE(review): the name is misspelled ("hypithesis") and it is not referenced by any
# visible cell; presumably the header of the feature-name column in the result table.
hypithesis_feature_header = 'Feature'

### Dropping Non Significant Features

In [41]:
# Remove the 'ID' and 'group' columns from `data` in place.
data.drop(['ID', 'group'], axis=1, inplace=True)
# Preview the first rows of the remaining columns.
data.head()

Unnamed: 0,outcome,age,gender,BMI_cat,hypertensive,atrialfibrillation,CHD with no MI,diabetes,deficiencyanemias,depression,...,cal_cat,chloride_cat,anion_cat,Mag_cat,ph_cat,Biccarbon_cat,metcat,lactic_cat,pco2_cat,ef_cat
0,0.0,72,1,0,0,0,0,1,1,0,...,0,0,1,0,1,0,0,0,1,1
1,0.0,75,2,0,0,0,0,0,1,0,...,0,1,1,1,1,0,0,0,0,1
2,0.0,83,2,1,0,0,0,0,1,0,...,0,1,1,1,1,0,0,0,0,0
3,0.0,43,2,0,0,0,0,0,0,0,...,1,0,1,1,1,0,0,0,0,1
4,0.0,75,2,0,1,0,0,0,1,0,...,1,1,1,0,1,0,0,0,0,1


### Checking for Null Values

In [63]:
# Count missing values per column, then keep only the columns that have any.
null_counts = data.isnull().sum()
has_nulls = null_counts != 0
df1 = null_counts[has_nulls]
# Names of the columns that need imputation.
impute_columns = df1.index.to_list()
impute_columns

['outcome', 'Pulse rate cat']

### Imputing with Mode as value is binary

In [64]:
def fill_null_with_mode(df, columns):
    """Replace missing values in the given columns with each column's mode.

    The frame is modified in place; nothing is returned.

    Args:
        df: DataFrame to impute.
        columns: Iterable of column labels in ``df`` to process.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
    """
    for column in columns:
        modes = df[column].mode()
        if modes.empty:
            # The column has no non-null value, so there is nothing to impute with.
            continue
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # the `df[column]` selection: the chained in-place form is deprecated and
        # does not update `df` when pandas copy-on-write is enabled.
        # When several values tie, the first (smallest) mode is used.
        df[column] = df[column].fillna(modes.iloc[0])

In [65]:
# Impute, in place, every column of `data` that was found to contain missing values.
fill_null_with_mode(data, impute_columns)

### Data Visualization w.r.t Dependent Variable

In [None]:
# Project-local helper module (its source is not part of this notebook).
import utils as utl

# Features offered for plotting: every column except the target and 'age'.
df = data.drop([dependent_variable,'age'], axis=1)
features = df.columns
# presumably get_plot_viz() returns a callable that accepts a `feature` argument —
# TODO confirm against utils
plot_viz = utl.get_plot_viz()
# Build an interactive control that calls `plot_viz` with the selected column name.
widgets.interact(plot_viz, feature=features)

### Feature Engineering

#### Derive New feature 

In [None]:
# Add derived indicator columns for anemia and inflammation.
# derivedAnemia is 1 only when both 'deficiencyanemias' and 'RBC_Cat' equal 1.
data['derivedAnemia'] = np.where((data['deficiencyanemias'] == 1) & (data['RBC_Cat'] == 1), 1, 0)
# derivedInflammation is 1 only when both 'neutriphil_cat' and 'Lympho_cat' equal 1.
data['derivedInflammation'] = np.where((data['neutriphil_cat'] == 1) & (data['Lympho_cat'] == 1), 1, 0)
# The derived columns together with their source columns.
# NOTE(review): this rebinds `features` and the list is not used by any visible cell.
features = ['derivedAnemia', 'deficiencyanemias', 'RBC_Cat', 'derivedInflammation', 'neutriphil_cat', 'Lympho_cat']

In [None]:
## Plot p-Values
# Test every column except the target. The target is excluded from the column
# list only: it is NOT dropped from `data`, because the resampling cell further
# down still reads data[dependent_variable].
feature_columns = data.columns.drop(dependent_variable)
eval_result = utl.perform_hypothesis_test(dependent_variable, feature_columns).sort_values([hypothesis_testing_metric], ascending=True)

plt.rcParams['figure.figsize'] = [8,10]
# Red bars mark p-values above the 0.05 cut-off, green bars the remaining ones.
p_values = eval_result[hypothesis_testing_metric]
colors = ["red" if i > 0.05 else "#40A944" for i in p_values]
plt.barh(eval_result.Feature, p_values, color = colors)
# setting label of y-axis
plt.ylabel("Features")
# setting label of x-axis
plt.xlabel("p-Values")
plt.title("Horizontal bar graph")
plt.show()

In [None]:
# Display the hypothesis-test results, sorted by ascending p-value.
eval_result

### Treating Imbalanced Data

# Class counts of the target in the data as loaded (before imputation).
original_data[dependent_variable].value_counts()

In [None]:
# Balance the classes: SMOTE over-sampling followed by random under-sampling,
# both with a fixed seed so the result is reproducible.
# NOTE(review): with sampling_strategy=1 on both steps the classes should already be
# balanced after SMOTE, which would leave the under-sampler nothing to remove — confirm.
# NOTE(review): the whole dataset is resampled before any train/test split, so synthetic
# rows may be interpolated from rows that later land in the test set.
over = SMOTE(sampling_strategy=1, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X, y = pipeline.fit_resample(data.drop(dependent_variable, axis=1), data[dependent_variable])
# Recombine target and features into one frame, with the target as the first column.
data_upsampled = pd.concat([pd.DataFrame(y), pd.DataFrame(X)], axis=1)

### Modeling with different Algos

#### Logistic Regression

In [None]:
from sklearn.datasets import make_classification 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.feature_selection import SelectFromModel

In [None]:
# To be able to detect overfitting, separate the data into train and test sets.
# `data_upsampled` is the class-balanced frame produced by the resampling cell.

y = data_upsampled[dependent_variable]
X = data_upsampled.drop(columns=[dependent_variable])

# 70/30 split with a fixed seed so every model is scored on the same rows.
X_train,X_test,y_train,y_test = train_test_split(X, y,test_size = 0.3,random_state = 9)

In [None]:
# Fit a logistic regression with default hyper-parameters on the training split.
model_logReg = LogisticRegression()
# NOTE(review): `res` and `pred` are not referenced by any other visible cell.
res = model_logReg.fit(X_train, y_train)
# Hard class predictions for the test split.
pred= model_logReg.predict(X_test)
# Per-class probabilities for the test split; column 1 is used for the ROC curve.
pred_logi = model_logReg.predict_proba(X_test)

In [None]:
# Decision cut-off applied to the predicted probability of class 1.
THRESHOLD = 0.5
positive_class_proba = model_logReg.predict_proba(X_test)[:, 1]
# Label a row 1 when its probability is strictly above the cut-off, otherwise 0.
above_cutoff = positive_class_proba > THRESHOLD
y_pred_logi = np.where(above_cutoff, 1, 0)

In [None]:
# Test-set scores for the logistic regression model.
# Accuracy, recall, precision and F1 are computed from the thresholded labels.
# ROC AUC is a ranking metric, so it is computed from the class-1 probabilities
# rather than from hard 0/1 labels.
logiDF = pd.DataFrame(data=[accuracy_score(y_test, y_pred_logi), recall_score(y_test, y_pred_logi),
                   precision_score(y_test, y_pred_logi), f1_score(y_test, y_pred_logi, average='binary'),
                   roc_auc_score(y_test, pred_logi[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

logiDF

#### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a 3-nearest-neighbours classifier and score it on the test split.
knn= KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
# Hard labels for the label-based metrics; probabilities for ROC AUC / ROC curve.
y_pred_knn= knn.predict(X_test)
pred_knn = knn.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities rather than from hard 0/1 labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_knn), recall_score(y_test, y_pred_knn),
                   precision_score(y_test, y_pred_knn),  f1_score(y_test, y_pred_knn, average='binary'),
                   roc_auc_score(y_test, pred_knn[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])


#### Decision Tree

In [68]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree with default hyper-parameters and score it on the test split.
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

# Hard labels for the label-based metrics; probabilities for ROC AUC / ROC curve.
y_pred_dtree = dtree.predict(X_test)
pred_dtree = dtree.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities rather than from hard 0/1 labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_dtree), recall_score(y_test, y_pred_dtree),
                   precision_score(y_test, y_pred_dtree),  f1_score(y_test, y_pred_dtree, average='binary'),
                   roc_auc_score(y_test, pred_dtree[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

#### SVM ( Support Vector Machine)

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear-kernel SVM; probability=True is required so that predict_proba is
# available for the ROC evaluation in later cells.
svclassifier = SVC(kernel='linear', probability=True)
svclassifier.fit(X_train, y_train)

In [None]:
# Hard labels for the label-based metrics; probabilities for ROC AUC / ROC curve.
y_pred_svm = svclassifier.predict(X_test)
pred_svm = svclassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities rather than from hard 0/1 labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_svm), recall_score(y_test, y_pred_svm),
                   precision_score(y_test, y_pred_svm),  f1_score(y_test, y_pred_svm, average='binary'),
                   roc_auc_score(y_test, pred_svm[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees, fitted on the training split.
# No random_state is set, so results can differ between runs.
rforestClassifier = RandomForestClassifier(n_estimators = 100)
rforestClassifier.fit(X_train, y_train)

In [None]:
# Hard labels for the label-based metrics; probabilities for ROC AUC / ROC curve.
y_pred_rf = rforestClassifier.predict(X_test)
pred_rf = rforestClassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities rather than from hard 0/1 labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_rf), recall_score(y_test, y_pred_rf),
                   precision_score(y_test, y_pred_rf),  f1_score(y_test, y_pred_rf, average='binary'),
                   roc_auc_score(y_test, pred_rf[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

#### XG Boost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters, fitted on the training split.
xgbClassifier = XGBClassifier()
xgbClassifier.fit(X_train, y_train)

In [None]:
# Hard labels for the label-based metrics; probabilities for ROC AUC / ROC curve.
y_pred_xgb = xgbClassifier.predict(X_test)
pred_xgb = xgbClassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities rather than from hard 0/1 labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_xgb),
                   precision_score(y_test, y_pred_xgb),  f1_score(y_test, y_pred_xgb, average='binary'),
                   roc_auc_score(y_test, pred_xgb[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

#### Models Evaluation using ROC Curve Matrix

In [None]:
from sklearn.metrics import roc_curve

# ROC points (false-positive rate, true-positive rate, thresholds) for each model,
# computed from the class-1 column of its predict_proba output. The order is:
# logistic regression, KNN, decision tree, SVM, random forest, XGBoost.
(
    (fpr1, tpr1, thresh1),
    (fpr2, tpr2, thresh2),
    (fpr3, tpr3, thresh3),
    (fpr4, tpr4, thresh4),
    (fpr5, tpr5, thresh5),
    (fpr6, tpr6, thresh6),
) = (
    roc_curve(y_test, proba[:, 1], pos_label=1)
    for proba in (pred_logi, pred_knn, pred_dtree, pred_svm, pred_rf, pred_xgb)
)

# Reference curve (tpr = fpr) obtained from a constant score of 0 for every sample.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# matplotlib
import matplotlib.pyplot as plt
# The bare 'seaborn' style sheet was renamed 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was removed afterwards, so pick whichever this installation provides.
roc_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(roc_style)

# plot roc curves, one line per model
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
plt.plot(fpr2, tpr2, linestyle='solid',color='green', label='KNN')
plt.plot(fpr3, tpr3, linestyle='dashed',color='red', label='DTree')
plt.plot(fpr4, tpr4, linestyle='solid',color='brown', label='SVM')
plt.plot(fpr5, tpr5, linestyle='dashdot',color='black', label='RF')
plt.plot(fpr6, tpr6, linestyle='-.',color='blue', label='XGB')
# reference line (tpr = fpr); drawn without a legend entry
plt.plot(p_fpr, p_tpr, linestyle='-', color='pink')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Save before show(): the figure is saved to a file named 'ROC' with the default extension.
plt.savefig('ROC',dpi=300)
plt.show();

### Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from matplotlib import pyplot 

#### Chi2 Scores

In [None]:
from scipy.stats import chi2

In [None]:
# Run the hypothesis test for every training feature against the target and keep
# the 15 rows with the smallest p-values. This uses the same `utl` helper and
# constants as the p-value plot earlier in the notebook; no function named
# `PerformHypothesisTest` is defined in the visible cells.
evaluationResult = utl.perform_hypothesis_test(dependent_variable, X_train.columns)
ns_df_sorted = evaluationResult.sort_values([hypothesis_testing_metric], ascending = True).head(15)
ns_df_sorted

#### K Best Features

In [None]:
# Keep the 15 features with the highest mutual information with the target.
sel_significant_columns = SelectKBest(mutual_info_classif,k= 15)
sel_significant_columns.fit(X_train,y_train)

# Boolean mask of the selected columns, reused for both names and scores.
selected_mask = sel_significant_columns.get_support()
names = X_train.columns.values[selected_mask]
scores = sel_significant_columns.scores_[selected_mask]
names_scores = list(zip(names, scores))
# NOTE: despite its name, the 'F_Scores' column holds mutual-information scores;
# the column name is kept unchanged for compatibility.
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_Scores'])
#Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'], ascending = [False, True])

# DataFrame.plot opens its own figure, so the size is passed here (a preceding
# plt.figure() would only leave an empty figure behind). The bars are labelled
# with the feature names instead of the row index.
ns_df_sorted.plot(kind='bar', x='Feat_names', y='F_Scores', figsize=(20, 8))

#### ExtraTree Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# NOTE(review): unlike the other selection cells, this one fits on the full X, y
# rather than on the training split — confirm that is intended.
extra_tree_forest = ExtraTreesClassifier(n_estimators = 100, criterion ='gini', max_features = 15)
extra_tree_forest.fit(X, y)

# Forest-level importance of each feature (aggregated over all trees).
feature_importance = extra_tree_forest.feature_importances_

# Spread of each feature's importance across the individual trees. This measures
# how stable the importance is, not how large it is, so it is kept for reference
# only and is not used for ranking.
feature_importance_std = np.std([tree.feature_importances_
                                 for tree in extra_tree_forest.estimators_],
                                axis = 0)

# Rank features by the importance itself and keep the 15 largest.
feature_importance_normalized = feature_importance
features = pd.Series(feature_importance_normalized, index=X.columns).nlargest(15)

etf_features = pd.DataFrame(features)

etf_features

### Recursive Feature selection 

#### Using Logistic Regression

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a logistic regression, keeping 15 features.
logreg = LogisticRegression()
logreg_rfe_model = RFE(estimator=logreg, n_features_to_select=15)
logreg_model_fit = logreg_rfe_model.fit(X_train, y_train)

# Map each training column to its RFE rank, then keep the columns ranked 1.
logreg_feat_index = pd.Series(logreg_model_fit.ranking_, index=X_train.columns)
rank_is_one = logreg_feat_index == 1
logreg_feat_rfe = logreg_feat_index[rank_is_one].index

# One-column table listing the retained feature names.
logreg_selected_features = pd.DataFrame(logreg_feat_rfe)
logreg_selected_features

#### Using SVM

# Recursive feature elimination driven by a linear-kernel SVM, keeping 15 features.
svm_lin = SVC(kernel='linear')
svm_rfe_model = RFE(estimator=svm_lin, n_features_to_select=15)
svm_rfe_model_fit = svm_rfe_model.fit(X_train, y_train)

# Map each training column to its RFE rank, then keep the columns ranked 1.
feat_index = pd.Series(svm_rfe_model_fit.ranking_, index=X_train.columns)
rank_is_one = feat_index == 1
signi_feat_rfe = feat_index[rank_is_one].index

# One-column table listing the retained feature names.
svm_selected_features = pd.DataFrame(signi_feat_rfe)
svm_selected_features

#### Using Random Forest

In [None]:
# Recursive feature elimination driven by a 100-tree random forest, keeping 15 features.
clf = RandomForestClassifier(n_estimators=100)
clf_rfe_model = RFE(estimator=clf, n_features_to_select=15)
clf_model_fit = clf_rfe_model.fit(X_train, y_train)

# Map each training column to its RFE rank, then keep the columns ranked 1.
# (`feat_index` and `signi_feat_rfe` are rebound here, as in the SVM cell.)
feat_index = pd.Series(clf_model_fit.ranking_, index=X_train.columns)
rank_is_one = feat_index == 1
signi_feat_rfe = feat_index[rank_is_one].index

# One-column table listing the retained feature names.
rf_selected_features = pd.DataFrame(signi_feat_rfe)
rf_selected_features