# Cardio Vascular disease analysis prediction significance

In [3]:

import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")





In [4]:
os.getcwd()


'C:\\Users\\HP'

In [7]:
df=pd.read_csv("Cardio_vascular.csv")

In [9]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
df.shape

#### The dataset used in this article is the Cleveland Heart Disease dataset. There are 14 columns in the dataset, 
which are described below.
1. Age, in years
2. Sex, 1 = male; 0 = female
3. cp: chest pain type
– Value 0: typical angina
– Value 1: atypical angina
– Value 2: non-anginal pain
– Value 3: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dAttribute Information:
6. fbs: (fasting blood sugar > 120 mg/dl) 1 = true; 0 =false
7. restecg: resting electrocardiographic results
– Value 0: normal
– Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of >0.05 mV)
– Value 2: showing probable or definite left ventricu lar hypertrophy by Estes’ criteria
8. thalach: maximum heart rate achieved during stress TEST
9. exang: exercise induced angina, 1 = yes; 0 = no
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
– Value 0: upsloping
– Value 1: flat
– Value 2: downsloping
12. ca: number of major vessels (0-4) colored by flourosopy
13. thal: thalassmia, 0 = null; 1=normal; 2 = fixed defect; 3 = reversible defect
14. condition (target) : 0 = no disease, 1 = disease


In [None]:
df.rename(columns={"sex":"gender","cp":"rest_cp","trestbps":"rest_bp","chol":"cholesterol","fbs":"fast_bloodsugar",
                   "restecg":"rest_ecg","thalach":"stress_HR","exang":"Exercise_cp","oldpeak":"STdepression_Exerc",
                   "slope":"STpeak_exerc","ca":"coloured_vessels","thal":"thalassmia","target":"heart_disease"},inplace=True)

In [None]:
df.info()


In [None]:
df.describe()

In [None]:
df.nunique()

In [None]:
skewness1=df.skew()


In [None]:
skewness1

In [None]:
kurtosis1=df.kurt()
kurtosis1

In [None]:
df.heart_disease.value_counts()

In [None]:
sns.countplot(x="heart_disease", data=df)

In [None]:
df["rest_cp"].value_counts()

In [None]:
df["fast_bloodsugar"].value_counts()

In [None]:
df["rest_ecg"].value_counts()

In [None]:
df["Exercise_cp"].value_counts()

In [None]:
df["STpeak_exerc"].value_counts()

In [None]:
df["coloured_vessels"].value_counts()

In [None]:
df["thalassmia"].value_counts()

In [None]:
# DELETING THE ROWS WHERE thalassmia==0

df = df[(df['thalassmia'] > 0)]

In [None]:
df["thalassmia"].value_counts()

In [None]:
df.describe()

In [None]:
df["cholesterol"].value_counts()

In [None]:
gender_count=sns.countplot(x="gender", data=df,hue="heart_disease")
gender_count.set(xlabel="gender of patients")

In [None]:
ax = sns.barplot(x="gender", y="heart_disease", data=df,hue="heart_disease", estimator=lambda x: len(x) / len(df) * 100)
ax.set(ylabel="Percent")

In [None]:
sns.countplot(x="rest_cp", data=df)

In [None]:
ax = sns.barplot(x="heart_disease", y="rest_cp", data=df,hue="rest_cp", estimator=lambda x: len(x) / len(df) * 100)
ax.set(ylabel="Percent")

In [None]:
ax = sns.barplot(x="gender", y="rest_cp", data=df,hue="rest_cp", estimator=lambda x: len(x) / len(df) * 100)
ax.set(ylabel="Percent")

In [None]:
df["gender"].value_counts()

In [None]:
sns.histplot(data=df,x="age",hue="heart_disease",bins=20,kde=True)
plt.show()

In [None]:
sns.countplot(data= df, x='heart_disease',hue='thalassmia')
plt.title('thal Type v/s target\n')

In [None]:
# condition (target) : 0 = no disease, 1 = disease


In [None]:
sns.countplot(data= df, x='gender',hue='thalassmia')
plt.title('Gender v/s Thalassemia\n')
print('Thalassemia (thal-uh-SEE-me-uh) is an inherited blood disorder that causes your body to have less hemoglobin than normal. Hemoglobin enables red blood cells to carry oxygen')

In [None]:
sns.boxplot(data=df,x='heart_disease',y='age')

In [None]:
df.isnull().sum()

In [None]:
df.heart_disease.value_counts()


In [None]:
df.nunique()

In [None]:
df.info()

# NEXT STEP TO DO THE PREPROCESSING EDA PART 

# OUTLIER TRETMENT  

In [None]:
# checking outliners in boxplot

plt.figure(figsize=(15,12)) # Set plot dimensions
sns.boxplot(data=df)

In [None]:
plt.figure(figsize=(5,5)) # Set plot dimensions
sns.boxplot(x="rest_bp",data=df)

In [None]:
plt.figure(figsize=(5,5)) # Set plot dimensions
sns.boxplot(x="cholesterol",data=df)

In [None]:
plt.figure(figsize=(5,5)) # Set plot dimensions
sns.boxplot(x="stress_HR",data=df)

# Flooring Capping option - adjustment of outlier- no delete, IQR method

In [None]:
Q1_rest_bp=df["rest_bp"].quantile(0.25)

Q3_rest_bp=df["rest_bp"].quantile(0.75)
IQR1=Q3_rest_bp-Q1_rest_bp
upper_limit_rest_bp=(Q3_rest_bp+1.5*IQR1)
print("upper_limit_rest_bp",upper_limit_rest_bp)
lower_limit_rest_bp=(Q1_rest_bp-1.5*IQR1)
print("lower_limit_rest_bp",lower_limit_rest_bp)

In [None]:
Q1_cholesterol=df["cholesterol"].quantile(0.25)

Q3_cholesterol=df["cholesterol"].quantile(0.75)
IQR2=Q3_cholesterol-Q1_cholesterol
upper_limit_cholesterol=(Q3_cholesterol+1.5*IQR2)
print("upper_limit_cholesterol",upper_limit_cholesterol)
lower_limit_cholesterol=(Q1_cholesterol-1.5*IQR2)
print("lower_limit_cholesterol",lower_limit_cholesterol)

In [None]:
Q1_stress_HR=df["stress_HR"].quantile(0.25)

Q3_stress_HR=df["stress_HR"].quantile(0.75)
IQR3=Q3_stress_HR-Q1_stress_HR
upper_limit_stress_HR=(Q3_stress_HR+1.5*IQR3)
print("upper_limit_stress_HR",upper_limit_stress_HR)
lower_limit_stress_HR=(Q1_stress_HR-1.5*IQR3)
print("lower_limit_stress_HR",lower_limit_stress_HR)

In [None]:
df['rest_bp']=np.where(df['rest_bp']>upper_limit_rest_bp,upper_limit_rest_bp,np.where(df['rest_bp']<lower_limit_rest_bp,lower_limit_rest_bp,df['rest_bp']))

In [None]:
df['cholesterol']=np.where(df['cholesterol']>upper_limit_cholesterol,upper_limit_cholesterol,np.where(df['cholesterol']<lower_limit_cholesterol,lower_limit_cholesterol,df['cholesterol']))

In [None]:
df['stress_HR']=np.where(df['stress_HR']>upper_limit_stress_HR,upper_limit_stress_HR,np.where(df['stress_HR']<lower_limit_stress_HR,lower_limit_stress_HR,df['stress_HR']))

In [None]:
plt.figure(figsize=(15,12)) # Set plot dimensions
sns.boxplot(data=df)

In [None]:
df.info()

In [None]:
# No ROW DELETED , adjusted the outliers.

In [None]:
X=df.drop(["heart_disease"],axis=1)
Y=df["heart_disease"]

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
#apply SelectKBest class to extract top best features
bestfeatures = SelectKBest(score_func=chi2, k=13)
fit = bestfeatures.fit(X,Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score'] #naming the dataframe columns
print(featureScores.nlargest(12,'Score')) #print best features


In [None]:
X.describe()

# multicollinearity problems

In [None]:
X.corr() # it how independent variables are collinear to each other

In [None]:
df.corr()

In [None]:
#Same data we can plot in heatmap for more better understanding.
plt.figure(figsize=(15,12))
sns.heatmap(df.corr(),annot=True)

In [None]:
# although there is no Multicolliniearilty ,if a value more than 0.8 ( specific to Medical and clinical sector data it consider as multicollinearity).

In [None]:
#SPLITTING THE DATA IN TRAINING AND TEST

from sklearn.model_selection import train_test_split


In [None]:
print(X.shape)
print(Y.shape)

In [None]:
#splitting data in traing and test
X_train1, X_test1,Y_train1,Y_test1=train_test_split(X,Y,train_size=0.70,random_state=101)
print(X_train1.shape)
print(X_test1.shape)
print(Y_train1.shape)
print(Y_test1.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scaljob=StandardScaler()
X_train1=scaljob.fit_transform(X_train1)
X_test1=scaljob.fit_transform(X_test1)
X_train1
X_test1

# To study this data set we implemented various models, applying the models

# Buidling KNN MODEL

In [None]:


from sklearn .metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
#WORKING WITH SCALLED DATASET

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_S_5=KNeighborsClassifier(n_neighbors=5) 
knn_S_5.fit(X_train1,Y_train1)


Y_pred_KNN_train1=knn_S_5.predict(X_train1)
Y_pred_KNN_test1=knn_S_5.predict(X_test1)

In [None]:
print("KNN training accuracy", accuracy_score(Y_train1,Y_pred_KNN_train1))
print("######"*5)
print("KNN testing accuracy", accuracy_score(Y_test1,Y_pred_KNN_test1))

# overfitting underfitting

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# find the more effective value of n_neighbors parameter:
accuracy_K = []
for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train1, Y_train1)
    Y_pred = knn.predict(X_test1)
    accuracy =accuracy_score(Y_test1, Y_pred)
    accuracy_K.append(accuracy)
   
plt.figure(figsize=(12,8))
plt.xlabel("k values")
plt.ylabel("ACCURACY")
plt.plot(range(1,50),accuracy_K, marker='o', markersize=9)

In [None]:
# we can try different k values 5 or 7
knn_S_25=KNeighborsClassifier(n_neighbors=5) 
knn_S_25.fit(X_train1,Y_train1)


Y_pred_KNN_train1=knn_S_25.predict(X_train1)
Y_pred_KNN_test1=knn_S_25.predict(X_test1)

In [None]:
print("KNN training accuracy", accuracy_score(Y_train1,Y_pred_KNN_train1))
print("######"*5)
print("KNN testing accuracy", accuracy_score(Y_test1,Y_pred_KNN_test1))

# CROSS VALIDATION CV

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
accuracy_training=cross_val_score(knn_S_25,X_train1,Y_train1,cv=20)
accuracy_testing=cross_val_score(knn_S_25,X_test1,Y_test1,cv=15)
print(accuracy_training)
#print(accuracy_testing)

In [None]:
print("accuracy training",accuracy_training[1] )
print("accuracy testing",accuracy_testing[0] )

In [None]:



from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
knn_conf_mat=confusion_matrix(Y_test1,Y_pred_KNN_test1)

print(confusion_matrix(Y_test1,Y_pred_KNN_test1))






In [None]:
print(knn_conf_mat)

In [None]:
TP = knn_conf_mat[0][0]
FP = knn_conf_mat[0][1]
FN = knn_conf_mat[1][0]
TN = knn_conf_mat[1][1]
                  
print("True positive",TP)
print("False positive",FP)
print("False Negative",FN)                  
print("True Negative",TN)

In [None]:
#Visualization Confusion Matrix
plt.subplots(figsize=(7,7))
sns.heatmap(knn_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X)
plt.xlabel("ACUTAL Values")
plt.ylabel("PREDICTED Values")
plt.show()

In [None]:
print("## CLASSIFICATION REPORT KNN TRAINING MODEL ######")
print(classification_report(Y_train1,Y_pred_KNN_train1))
print("## CLASSIFICATION REPORT KNN TESTING MODEL  ######")
print(classification_report(Y_test1,Y_pred_KNN_test1))

In [None]:
print(accuracy_score(Y_test1,Y_pred_KNN_test1))

# KNN ROC AUC PREDICTION

In [None]:
Specificity=TN/(TN+FP)
Recall = TP/(TP+FN)
Balanced_ACCURACY=(Recall +Specificity)/2
print("Balanced Accuracy",Balanced_ACCURACY)

In [None]:
from sklearn.metrics import roc_auc_score



# predict probabilities
probability_prediction_positive = knn.predict_proba(X_test1)[:,1]

# auc scores
auc_score1 = roc_auc_score(Y_test1, probability_prediction_positive)


from sklearn.metrics import roc_curve

# roc curve for models
fpr1, tpr1, thresh1 = roc_curve(Y_test1, probability_prediction_positive, pos_label=1)




print("AUC SCORE:",auc_score1)
#print("#######"*5)
#print("value of FPR", fpr1)
#print("#######"*5)
#print("Values of TPR",tpr1)
#print("#######"*5)
#print("values of Threshold", thresh1)

In [None]:
#roc_df = pd.DataFrame(zip(fpr1, tpr1, thresh1),columns = ["FPR","TPR","Threshold"])
#roc_df


In [None]:
# matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='KNN')

# plot no skill roc curve
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

# title
plt.title('ROC curve')

# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')

plt.savefig('ROC', dpi=300)
plt.show()

In [None]:
#Running knn.predict_proba(X_test)[:,1] get the predicted probabilities of the positive label only, which yield as below:-

#probability2= knn.predict_proba(X_test1)[:,1]
#probability2

# SVM Support Vector Machine Algorithm

In [None]:

  
from sklearn.svm import SVC
  
# Building a Support Vector Machine on train data
svm = SVC(C= .1, kernel='linear', gamma= 1,probability=True)
svm.fit(X_train1, Y_train1)
Y_pred_SVM_train = svm .predict(X_train1)
Y_pred_SVM_test = svm .predict(X_test1)
# check the accuracy on the training set
print("traing accuracy",svm.score(X_train1, Y_train1))
print("testing accuracy",svm.score(X_test1, Y_test1))

In [None]:
# confusion matrix
SVM_conf_mat=confusion_matrix(Y_test1,Y_pred_SVM_test)
print(confusion_matrix(Y_test1,Y_pred_SVM_test))

#Visualization Confusion Matrix
plt.subplots(figsize=(5,5))
sns.heatmap(SVM_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X_test1)
plt.xlabel("ACUTAL Values")
plt.ylabel("PREDICTED Values")
plt.show()

In [None]:
print("## CLASSIFICATION REPORT SVM TRAINING MODEL ######")
print(classification_report(Y_train1,Y_pred_SVM_train))
print("## CLASSIFICATION REPORT SVM TESTING MODEL  ######")
print(classification_report(Y_test1,Y_pred_SVM_test))

In [None]:
from sklearn.metrics import roc_auc_score



# predict probabilities
probability_prediction_positive_svm = svm.predict_proba(X_test1)[:,1]

# auc scores
auc_score_svm = roc_auc_score(Y_test1, probability_prediction_positive_svm)


from sklearn.metrics import roc_curve

# roc curve for models
fpr2, tpr2, thresh2 = roc_curve(Y_test1, probability_prediction_positive_svm, pos_label=1)




print("AUC SCORE:",auc_score_svm)
#print("#######"*5)
#print("value of FPR", fpr1)
#print("#######"*5)
#print("Values of TPR",tpr1)
#print("#######"*5)
#print("values of Threshold", thresh1)

In [None]:
# matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# plot roc curves
plt.plot(fpr2, tpr2, linestyle='--',color='orange', label='SVM')

# plot no skill roc curve
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

# title
plt.title('ROC curve')

# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')

plt.savefig('ROC', dpi=300)
plt.show()

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(max_iter=200)# by default max_iter=100. there is no much difference in accuracy changing it.
lr.fit(X_train1,Y_train1)

In [None]:
Y_pred_lr_train=lr.predict(X_train1)
Y_pred_lr_test=lr.predict(X_test1)

In [None]:
print("LOGISTIC REGRESSION TRAINING ACCURACY ",accuracy_score(Y_train1,Y_pred_lr_train))
print("######"*20)
print("LOGISTIC REGRESSION TESTING ACCURACY ",accuracy_score(Y_test1,Y_pred_lr_test))

# CONFUSION METRIX

In [None]:
lr_conf_mat=confusion_matrix(Y_test1,Y_pred_lr_test)
print(confusion_matrix(Y_test1,Y_pred_lr_test))

#Visualization Confusion Matrix
plt.subplots(figsize=(5,5))
sns.heatmap(lr_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X_test1)
plt.xlabel("ACUTAL Values")
plt.ylabel("PREDICTED Values")
plt.show()

In [None]:
print("## CLASSIFICATION REPORT DT TRAINING MODEL ######")
print(classification_report(Y_train1,Y_pred_lr_train))
print("## CLASSIFICATION REPORT DT TESTING MODEL  ######")
print(classification_report(Y_test1,Y_pred_lr_test))


# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(criterion="entropy")

dt.fit(X_train1,Y_train1)

In [None]:
Y_pred_dt_train=dt.predict(X_train1)
Y_pred_dt_test=dt.predict(X_test1)

In [None]:
print("DECISION TREE CLASSIFIER TRAINING ACCURACY ",accuracy_score(Y_train1,Y_pred_dt_train))
print("######"*20)
print("DECISION TREE CLASSIFIER TESTING ACCURACY ",accuracy_score(Y_test1,Y_pred_dt_test))

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(20,15),dpi=150)
plot_tree(dt,filled=True,feature_names=X.columns)
plt.show()

In [None]:
# WE HAVE TO CREATE A USER DEFINED FUNCTION

In [None]:
def report_model(model):
    model_preds=model.predict(X_test1)
    print(classification_report(Y_test1,model_preds))
    print("\n")
    plt.figure(figsize=(12,8),dpi=150)
    plot_tree(model,filled=True,feature_names=X.columns)

In [None]:
#Hyperparameter in decision Tree

In [None]:
pruned_tree=DecisionTreeClassifier(max_depth=1)
pruned_tree.fit(X_train1,Y_train1)

In [None]:
# prunned model decision tree classifier with max branch =3 has created

In [None]:
report_model(pruned_tree)

In [None]:
pruned_tree5=DecisionTreeClassifier(max_depth=5)
pruned_tree5.fit(X_train1,Y_train1)

In [None]:
# prunned model decision tree classifier with max branch =5 has created

In [None]:
report_model(pruned_tree5)

In [None]:
# confusion matrix
dt_conf_mat=confusion_matrix(Y_test1,Y_pred_dt_test)
print(confusion_matrix(Y_test1,Y_pred_dt_test))

#Visualization Confusion Matrix
plt.subplots(figsize=(5,5))
sns.heatmap(dt_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X_test1)
plt.xlabel("ACUTAL Values")
plt.ylabel("PREDICTED Values")
plt.show()

In [None]:
print("## CLASSIFICATION REPORT DT TRAINING MODEL ######")
print(classification_report(Y_train1,Y_pred_dt_train))
print("## CLASSIFICATION REPORT DT TESTING MODEL  ######")
print(classification_report(Y_test1,Y_pred_dt_test))

In [None]:
from sklearn.metrics import roc_auc_score



# predict probabilities
probability_prediction_positive_dt = dt.predict_proba(X_test1)[:,1]

# auc scores
auc_score1_dt = roc_auc_score(Y_test1, probability_prediction_positive_dt)


from sklearn.metrics import roc_curve

# roc curve for models
fpr1_dt, tpr1_dt, thresh1_dt = roc_curve(Y_test1, probability_prediction_positive_dt, pos_label=1)




print("AUC SCORE:",auc_score1_dt)
#print("#######"*5)
#print("value of FPR", fpr1)
#print("#######"*5)
#print("Values of TPR",tpr1)
#print("#######"*5)
#print("values of Threshold", thresh1)

In [None]:
# matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# plot roc curves
plt.plot(fpr1_dt, tpr1_dt, linestyle='--',color='orange', label='decision tree')

# plot no skill roc curve
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

# title
plt.title('ROC curve')

# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')

plt.savefig('ROC', dpi=300)
plt.show()

# FEATURE IMPORTANCE

In [None]:
dt.feature_importances_

In [None]:
pd.DataFrame(index=X.columns,data=dt.feature_importances_,columns=["Feature Importance"])

# Bagging model

In [None]:



from sklearn.ensemble import BaggingClassifier
# as it is a classification problem so we are importing classifier
#In case of regression problem  we use to import BaggingRegressor()

bagging=BaggingClassifier()
bagging.fit(X_train1,Y_train1)

In [None]:
Y_pred_BM_train=bagging.predict(X_train1)
Y_pred_BM_test=bagging.predict(X_test1)



In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
print("Bagging Model training accuracy", accuracy_score(Y_train1,Y_pred_BM_train))
print("######" *5)
print("Bagging Model testing accuracy", accuracy_score(Y_test1,Y_pred_BM_test))

In [None]:
print("CLASSIFICATION REPORT ON TRAINING  \n " , classification_report(Y_train1,Y_pred_BM_train))
print("######"*20)
print("CLASSIFICATION REPORT ON TEST  \n ",classification_report(Y_test1,Y_pred_BM_test))

In [None]:
BM_conf_mat=confusion_matrix(Y_test1,Y_pred_BM_test)
print(confusion_matrix(Y_train1,Y_pred_BM_train))
print("######"*20)
print(confusion_matrix(Y_test1,Y_pred_BM_test))

In [None]:
plt.subplots(figsize=(5,5))
sns.heatmap(BM_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X_test1)
plt.xlabel("ACUTAL Values")
plt.ylabel("PREDICTED Values")
plt.show()

In [None]:
print("BAGGING METHOD TRAINING ACCURACY",accuracy_score(Y_train1,Y_pred_BM_train))
print("######"*20)
print("BAGGING METHOD TESTING ACCURACY",accuracy_score(Y_test1,Y_pred_BM_test))

# RANDOM FOREST METHOD ENTROPY

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=50,criterion="entropy")


In [None]:
rf.fit(X_train1,Y_train1)

In [None]:
# RANDOM FOREST MODEL BUILDING COMPLETED.

In [None]:
Y_pred_rf_train=rf.predict(X_train1)
Y_pred_rf_test=rf.predict(X_test1)

In [None]:
print("RANDOM FOREST METHOD ENTROPY TRAINING ACCURACY ",accuracy_score(Y_train1,Y_pred_rf_train))
print("######"*20)
print("RANDOM FOREST METHOD ENTROPY TESTING ACCURACY ",accuracy_score(Y_test1,Y_pred_rf_test))

In [None]:
RF_conf_mat=confusion_matrix(Y_test1,Y_pred_rf_test)
print(confusion_matrix(Y_train1,Y_pred_rf_train))
print("######"*20)
print(confusion_matrix(Y_test1,Y_pred_rf_test))

plt.subplots(figsize=(5,5))
sns.heatmap(RF_conf_mat,annot=True,linewidths=0.5,linecolor="red",fmt=".0f",data=X_test1)
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.show()

In [None]:
print("## CLASSIFICATION REPORT RANDOM FOREST TRAINING MODEL ######")
print(classification_report(Y_train1,Y_pred_rf_train))
print("## CLASSIFICATION REPORT RANDOM FOREST TESTING MODEL  ######")
print(classification_report(Y_test1,Y_pred_rf_test))

In [None]:
# get Feature importance
importance = rf.feature_importances_


# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
    print(importance)

In [None]:
print('Feature: %0d, Score: %.5f' % (i,v))

In [None]:
# almost all features are significance some are more some are less.

In [None]:
from matplotlib import pyplot

In [None]:
#plot feature importance
#pyplot.bar([x for x in range(len(importance))], importance)
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X,Y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(13).plot(kind='barh')
plt.show()

In [None]:
#The XGBoost library provides a built-in function to plot features ordered by their importance. 
#The function is called plot_importance ()

In [None]:
# plot feature importance using built-in function
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_importance

# fit model no training data

model = XGBClassifier(n_estimators=100)
model.fit(X, Y)
# plot feature importance
plot_importance(model,max_num_features=10) # top 10 most important features
pyplot.show()

# RANDOM FOREST WITH CRITERION GINI


In [None]:
rf1=RandomForestClassifier(n_estimators=200,criterion="gini")
rf1.fit(X_train1,Y_train1)

In [None]:
Y_pred_rf_ginni_train=rf1.predict(X_train1)
Y_pred_rf_ginni_test=rf1.predict(X_test1)

In [None]:
print("RANDOM FOREST METHOD GINI TRAINING ACCURACY ",accuracy_score(Y_train1,Y_pred_rf_ginni_train))
print("######"*20)
print("RANDOM FOREST METHOD ENTROPY TESTING ACCURACY ",accuracy_score(Y_test1,Y_pred_rf_ginni_test))

# Gaussian Naive Bayes model

In [None]:
# used in Medical diagnosis, Face Recognization,as a classifier, to identify faces and other features, weather prediction..
# training the model on training set
# probability approached based algorithm.
# widely used in text classification. 
#In real world is for normal industries not much in use in classification becoz we have many other models RF,DT,XG BOOST .

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train1, Y_train1)
Y_pred_gnb_train=gnb.predict(X_train1) 
# making predictions on the testing set
Y_pred_gnb_test = gnb.predict(X_test1)
 
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(Y_test1, Y_pred_gnb_test)*100)

In [None]:
print("GAUSSIAN NAIVE BAYES METHOD TRAINING ACCURACY ",accuracy_score(Y_train1,Y_pred_gnb_train))
print("######"*20)
print("GAUSSIAN NAIVE BAYES METHOD TESTING ACCURACY ",accuracy_score(Y_test1,Y_pred_gnb_test))

# ADABOOST Adaptive Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ab=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),n_estimators=100)


ab.fit(X_train1,Y_train1)

In [None]:
ab.score(X_train1,Y_train1)

In [None]:
ab.score(X_test1,Y_test1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
ab1=AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=20,max_depth=2),n_estimators=100)

In [None]:
ab1.fit(X_train1,Y_train1)

In [None]:
ab1.score(X_train1,Y_train1)

In [None]:
ab1.score(X_test1,Y_test1)

# GRADIENT BOOSTING

In [None]:
#GradientBoostingRegressor. A similar algorithm is used for classification known as GradientBoostingClassifier.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Instantiate Gradient Boosting classifier
SEED = 2 # Setting SEED for reproducibility
gbt = GradientBoostingClassifier(n_estimators = 25, max_depth = 1, random_state = SEED)
 
# Fit to training set
gbt.fit(X_train1, Y_train1)

# Predict on train set
Y_pred_train_gbt = gbt.predict(X_train1)

# Predict on test set
Y_pred_test_gbt = gbt.predict(X_test1)
  
# test set RMSE
test_rmse = MSE(Y_test1, Y_pred_test_gbt) ** (1 / 2)
  
# Print rmse
print('RMSE test set: {:.2f}'.format(test_rmse))

In [None]:
gbt.score(X_test1,Y_test1)

In [None]:
gbt.score(X_train1,Y_train1)

In [None]:
#LEAST SQUARE LOSS

In [None]:
test_score=np.zeros(30,dtype=np.float64)
train_score=np.zeros(30,dtype=np.float64)
for i,Y_pred in enumerate(gbt.staged_predict(X_train1)):
    train_score[i]=gbt.loss_(Y_train1,Y_pred_train_gbt)
for i,Y_pred in enumerate(gbt.staged_predict(X_test1)):
    test_score[i]=gbt.loss_(Y_test1,Y_pred_test_gbt)

In [None]:
plt.plot(test_score)
plt.plot(train_score)
plt.xlabel("Iterators")
plt.ylabel("least square Loss")

In [None]:
def log_loss_prime(Y_test, Y_pred_test_gbt):
    return ((1 - Y_test1) / (1 - Y_pred_test_gbt) - Y_test1 / Y_pred_test_gbt) / np.size(Y_test1)

In [None]:
print(log_loss_prime(Y_test1, Y_pred_test_gbt))

In [None]:
from sklearn.metrics import log_loss
LogLoss = log_loss(Y_test1, Y_pred_test_gbt, eps = 1e-15,
normalize = True, sample_weight = None, labels = None)

In [None]:
LogLoss

In [None]:
#Log Loss is hard to interpret. A Log Loss of 0.69 may be good in a multiclass problem, but NOT GOOD in a binary biased case.

In [None]:
from sklearn.metrics import mean_squared_error
  
MSE = mean_squared_error(Y_test1, Y_pred_test_gbt)

In [None]:
MSE

# XG BOOST

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(XGBClassifier(objective="reg:squarederror"),X_test1,Y_test1)

In [None]:
scores

In [None]:
# rmse
rmse_score=(scores)**0.5
rmse_score.mean()