# This is from the BUCC Data Science Workshop I attended in 2019.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
#visualizations
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

#algorithms
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

#score metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# Data Preprocessing

In [None]:

# Location of the Acute Liver Failure (ALF) CSV inside the Kaggle input directory.
alf_path = '/kaggle/input/d/suhailsh7/acute-liver-failure/ALF_Data.csv'
df = pd.read_csv(alf_path)
# Keep an independent snapshot of the raw data. A plain `copy_df = df` only
# binds a second name to the same object, so any in-place edit made through
# `df` would silently change the "copy" as well.
copy_df = df.copy()
df.head(10)

In [None]:
# Dimensions of the raw dataset as (rows, columns).
df.shape

In [None]:
# Summary of the columns: names, dtypes and non-null counts.
df.info()

In [None]:
# Discard the samples that have no value for the ALF label.
df = df.dropna(subset=['ALF'])

In [None]:
# Count the missing entries in every column.
total_missingvalues = df.isna().sum()
total_missingvalues

In [None]:
# Work with a sample of the features for easier understanding; 'ALF' is kept
# because it is the label that is split off afterwards.
selected_columns = [
    'Age', 'Gender', 'Region', 'Weight', 'Height',
    'Body Mass Index', 'Obesity', 'Waist',
    'Maximum Blood Pressure', 'Minimum Blood Pressure',
    'Good Cholesterol', 'Bad Cholesterol', 'Total Cholesterol',
    'Dyslipidemia', 'PVD', 'ALF',
]
df = df[selected_columns]

df.head()

In [None]:
# Refer to the slides for the heat map.
# Compute the correlation matrix on the numeric columns only. The frame still
# holds text columns here ('Gender', 'Region'); pandas >= 2.0 raises on them
# instead of silently skipping them, so filter explicitly. Older pandas
# versions produce the same matrix either way.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Plot the heat map, labelling both axes with the feature names.
sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
# Pairwise correlation table. Restricted to numeric columns because
# pandas >= 2.0 raises when DataFrame.corr() meets text columns such as
# 'Gender' or 'Region'.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Separate the target from the inputs: `y` holds the ALF label and `df`
# keeps only the feature columns.
y = df['ALF']
df = df.drop(columns=['ALF'])
df.head()

In [None]:
# Re-check the per-column count of missing values, now on the features only.
total_missingvalues = df.isna().sum()
total_missingvalues

In [None]:
# Taking care of missing values by filling them with the column median.
from sklearn.impute import SimpleImputer

# Column positions 3..12 are the ones selected as containing missing values.
missing_value_slice = slice(3, 13)
# missing_values defaults to np.nan, so only the strategy needs to be given.
imputer = SimpleImputer(strategy='median')
imputer = imputer.fit(df.iloc[:, missing_value_slice])
df.iloc[:, missing_value_slice] = imputer.transform(df.iloc[:, missing_value_slice])

In [None]:
# Re-inspect dtypes and non-null counts after the imputation step.
df.info()

In [None]:
# Preview the first rows of the feature frame.
df.head()

In [None]:
# Check which distinct classes the categorical 'Region' feature contains.
df['Region'].unique()

In [None]:
# Distinct values present in the 'Gender' feature.
df['Gender'].unique()

In [None]:
# Encode the categorical features as numbers.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 'Gender' (the object-typed column at position 1) is replaced by integer codes.
# Assign by column label rather than `df.iloc[:, 1] = ...`: on pandas >= 2.0 the
# positional form writes the codes into the existing object column in place,
# so the dtype stays `object` and numeric-only estimators such as XGBoost
# refuse the frame.
labelencoder_X = LabelEncoder()
df['Gender'] = labelencoder_X.fit_transform(df['Gender'])

# 'Region' is expanded into one indicator column per region (Region_<value>).
df = pd.get_dummies(df, columns=["Region"], prefix=["Region"])

In [None]:
# Preview the frame after encoding the categorical columns.
df.head()

In [None]:
# Region_west is redundant: its value can be inferred from the other three
# region indicator columns, so it is removed.
df = df.drop(columns='Region_west')
df.head()

In [None]:
# Split the dataset into a training set and a test set, stratified on the
# label so both parts keep the same class proportions.
X = df
# Seed the split so that it, and every score computed from it, is reproducible.
# The seed matches the random_state=10 used for the models. train_test_split
# is already imported at the top of the file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)

In [None]:
# XGBoost classifier with a fixed random seed.
xgb = XGBClassifier(random_state=10)

In [None]:
# Train the XGBoost model on the training split.
xgb.fit(X_train,y_train)

In [None]:
# Predict the labels of the held-out test split.
pred = xgb.predict(X_test)

In [None]:
# Number of test samples whose predicted label differs from the true label.
count = sum(1 for predicted, actual in zip(pred, y_test) if predicted != actual)

In [None]:
# Convert the misclassification count into error and accuracy rates, then
# report both as percentages.
error = count / len(pred)
accuracy = 1 - error
print("Error for XGBoost= %f " % (error * 100) + '%')
print("Accuracy for XGBoost = %f " % (accuracy * 100) + '%')

In [None]:
# Baseline random forest: train on the training split, predict the test
# split, and report the misclassification rate and accuracy as percentages.
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Count the test samples that were predicted incorrectly.
count = sum(1 for predicted, actual in zip(pred_rf, y_test) if predicted != actual)
error = count / len(pred_rf)
accuracy = 1 - error
print("Error for RF = %f " % (error * 100) + '%')
print("Accuracy for RF = %f " % (accuracy * 100) + '%')

In [None]:
# 4-fold cross-validation of the random forest on the full feature frame,
# scored with negative log loss; n_jobs=-1 runs the folds on all cores.
cv_results = cross_val_score(
    rf,
    X,
    y,
    scoring='neg_log_loss',
    cv=4,
    n_jobs=-1,
)
cv_results

In [None]:
# The scorer returns the negated log loss, so flip the sign of the mean to
# report the conventional lower-is-better value.
mean_log_loss = -cv_results.mean()
prt_string = "Log Loss: %f " % mean_log_loss

print(prt_string)

In [None]:
# Hyper-parameter grid explored by the random-forest grid search.
param_grid = {
    'n_estimators': [200, 500],
    # 'auto' was removed in scikit-learn 1.3 and, for classifiers, was only an
    # alias of 'sqrt', so listing both fitted the same model twice. 'log2' is
    # a genuinely different candidate that keeps the grid the same size.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Exhaustive search over param_grid, scoring each candidate with 2-fold
# cross-validation on the training split.
from sklearn.model_selection import GridSearchCV

CV_rfc = GridSearchCV(rf, param_grid, cv=2)
CV_rfc.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the grid search.
best_param = CV_rfc.best_params_

In [None]:
# Rebuild the forest with the best hyper-parameters from the grid search.
# random_state=10 is the seed of the estimator that was tuned; without it the
# refit model differs from the one the search scored and changes on every run.
# A seed present in best_param takes precedence.
rf = RandomForestClassifier(**{'random_state': 10, **best_param})

In [None]:
# Train the tuned forest and measure it on the held-out test split.
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Misclassified test samples, then the error and accuracy rates in percent.
count = sum(1 for predicted, actual in zip(pred_rf, y_test) if predicted != actual)
error = count / len(pred_rf)
accuracy = 1 - error
print("Error for RF = %f " % (error * 100) + '%')
print("Accuracy for RF = %f " % (accuracy * 100) + '%')

In [None]:
# Start again from the data kept in copy_df, this time with all of its columns.
print(copy_df.shape)
# Discard the samples that have no value for the ALF label.
copy_df = copy_df.dropna(subset=['ALF'])

# Separate the label from the features; `df` and `X` name the same frame.
y = copy_df['ALF']
X = df = copy_df.drop(columns='ALF')

In [None]:
# Remove three columns before imputing. They are presumably the non-numeric
# ones, which a median imputer cannot process; TODO confirm for 'Source of Care'.
X = X.drop(columns=['Gender', 'Region', 'Source of Care'])
X.head()

In [None]:
# Median-impute every remaining column of X, writing the result back in place.
# missing_values defaults to np.nan.
imputer = SimpleImputer(strategy='median').fit(X)
X.iloc[:, :] = imputer.transform(X)

In [None]:
# Fit the tuned forest on all features to obtain per-feature importances.
# The forest is seeded with the same random_state=10 as the tuned estimator.
# The importances feed a fixed cut-off in the next step, so an unseeded
# forest would select a different feature set on every run.
rf = RandomForestClassifier(**{'random_state': 10, **best_param})
rf.fit(X, y)
print(rf.feature_importances_)

In [None]:
# Keep the features whose importance in the fitted forest is at least 0.03.
# `variable` collects their column positions; `name` collects the matching
# importance scores (despite its name, not the column names).
importances = rf.feature_importances_
variable = [idx for idx, score in enumerate(importances) if score >= 0.03]
name = [score for score in importances if score >= 0.03]
print(variable)
print(name)

print(len(variable))

In [None]:
# Keep only the columns at the positions collected in `variable`.
X=X.iloc[:,variable]

In [None]:
# Preview the reduced feature matrix.
X.head()

In [None]:
# Evaluate the tuned forest on the reduced feature set.
# Both the split and the forest are seeded with 10, the seed used for the
# other models. Unseeded, the reported accuracy changes from run to run and
# cannot be compared with the earlier results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)
rf = RandomForestClassifier(**{'random_state': 10, **best_param})
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# Misclassified test samples, then the error and accuracy rates in percent.
count = sum(1 for predicted, actual in zip(pred_rf, y_test) if predicted != actual)
error = count / len(pred_rf)
print("Error for RF = %f " % (error * 100) + '%')
accuracy = 1 - error
print("Accuracy for RF = %f " % (accuracy * 100) + '%')

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Univariate feature selection: score every column of X against the label
# with f_classif and keep the best ones.
# X has already been cut down by the importance filter and may have fewer
# than 6 columns, which SelectKBest rejects or warns about depending on the
# scikit-learn version. Cap k at the number of columns available.
test = SelectKBest(score_func=f_classif, k=min(6, X.shape[1]))
kbestfit = test.fit(X, y)
# Boolean mask over the columns of X: True where the feature was selected.
kbestbool = kbestfit.get_support()
# Column positions of the selected features. The variable name says "chi",
# but the scores come from f_classif.
kbestchi_feature = [idx for idx, selected in enumerate(kbestbool) if selected]
print(len(kbestchi_feature))
print(kbestchi_feature)