In [1]:
#Import all necessary modules 
import numpy as  np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
#load dataset
df=pd.read_csv('patient.csv')

In [3]:
df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


# Pre-processing part

In [4]:
#check for null values
df.isnull().sum()

HAEMATOCRIT     0
HAEMOGLOBINS    0
ERYTHROCYTE     0
LEUCOCYTE       0
THROMBOCYTE     0
MCH             0
MCHC            0
MCV             0
AGE             0
SEX             0
SOURCE          0
dtype: int64

In [5]:
#check for duplicate values
df.duplicated().sum()

0

In [6]:
#check datatypes
df.dtypes

HAEMATOCRIT     float64
HAEMOGLOBINS    float64
ERYTHROCYTE     float64
LEUCOCYTE       float64
THROMBOCYTE       int64
MCH             float64
MCHC            float64
MCV             float64
AGE               int64
SEX              object
SOURCE           object
dtype: object

In [7]:
#we need to apply Label encoder to 'sex','source' column to convert them from object type to integer type

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le=LabelEncoder()

In [10]:
df['SEX']=le.fit_transform(df['SEX'])

In [11]:
df['SOURCE']=le.fit_transform(df['SOURCE'])

In [12]:
#cross check
df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,0,1
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,0,1
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,0,1
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,0,1
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,1,1


In [13]:
#select input and output variable
x=df.drop('SOURCE',axis=1) #input variable
y=df['SOURCE'] #output variable

In [14]:
#split data into 2 parts, i.e. training (70%) and testing (30%)
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [16]:
#apply scaling on input data
#import standard scaler
from sklearn.preprocessing import StandardScaler

In [17]:
ss=StandardScaler()
# the scaler is fitted on the training split only; the test split is then
# transformed with those same training statistics
x_train=ss.fit_transform(x_train)#converting into numpy array
x_test=ss.transform(x_test)

In [58]:
# check whether the data is balanced or not
# first check y_train (0 and 1): how many samples of 0's and 1's
y_train.value_counts()

1    1834
0    1254
Name: SOURCE, dtype: int64

In [59]:
#class counts are 1834 vs 1254 (about 59% / 41%), so the imbalance is mild
#hence no need to apply sampling technique

# Create model function

In [20]:
#import classification_report,confusion_matrix
from sklearn.metrics import classification_report,confusion_matrix

#create model to perform ML algorithms
def create_model(model):
    """Fit ``model`` on the training split and print its test-split metrics.

    Relies on the notebook-level variables ``x_train``/``y_train`` (used for
    fitting) and ``x_test``/``y_test`` (used for evaluation).

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    object
        The same ``model`` instance, after fitting.

    Notes
    -----
    The classification report and confusion matrix are computed on the test
    split. Choosing hyperparameters by comparing these printouts therefore
    tunes the model on the test data.
    """
    model.fit(x_train,y_train)# train model with 70% data
    y_pred=model.predict(x_test)#test with 30% data
    print(classification_report(y_test,y_pred))
    print(confusion_matrix(y_test,y_pred))
    return model

# Using Logistic Regression

In [22]:
#import logistic regression module
from sklearn.linear_model import LogisticRegression

In [23]:
lr=LogisticRegression(random_state=1)
lr=create_model(lr)

              precision    recall  f1-score   support

           0       0.68      0.53      0.60       530
           1       0.73      0.83      0.78       794

    accuracy                           0.71      1324
   macro avg       0.70      0.68      0.69      1324
weighted avg       0.71      0.71      0.70      1324

[[283 247]
 [135 659]]


In [24]:
#with Logistic Regression we got f1 score 0=60% 1=78%

# Using Decision Tree Classifier

# Gini-index method

In [27]:
#create model using DecisionTree Classifier
#import decision tree classifier module
from sklearn.tree import DecisionTreeClassifier

In [28]:
dt=DecisionTreeClassifier(random_state=1)
dt=create_model(dt)

              precision    recall  f1-score   support

           0       0.57      0.59      0.58       530
           1       0.72      0.70      0.71       794

    accuracy                           0.66      1324
   macro avg       0.65      0.65      0.65      1324
weighted avg       0.66      0.66      0.66      1324

[[315 215]
 [238 556]]


In [29]:
#with decision tree we got f1 score of 0=58% & 1=71%

In [30]:
#check feature importance of the fitted decision tree
#(feature_importances_ is the impurity-based importance; the column is still labelled 'IG')
importance_data={'Input':x.columns,'IG':dt.feature_importances_}  # renamed from `dict` to stop shadowing the builtin
df1=pd.DataFrame(importance_data)
#sorting in descending order
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,THROMBOCYTE,0.243162
1,HAEMATOCRIT,0.151713
2,LEUCOCYTE,0.135892
3,AGE,0.106988
4,ERYTHROCYTE,0.086722
5,MCH,0.07401
6,MCHC,0.071259
7,MCV,0.068496
8,HAEMOGLOBINS,0.046866
9,SEX,0.014891


In [31]:
#to improve our f1 score we apply pruning techniques
#1)max_depth technique
#2)min_samples_leaf technique

In [34]:
# using max_depth technique
'''for i in range(1,9):
    dt1=DecisionTreeClassifier(random_state=1,max_depth=i)
    print('max_depth:',i)
    dt1=create_model(dt1)
#heat and trial method'''

"for i in range(1,9):\n    dt1=DecisionTreeClassifier(random_state=1,max_depth=i)\n    print('max_depth:',i)\n    dt1=create_model(dt1)\n#heat and trial method"

In [35]:
dt=DecisionTreeClassifier(random_state=1,max_depth=8)
dt=create_model(dt)

              precision    recall  f1-score   support

           0       0.64      0.58      0.61       530
           1       0.74      0.78      0.76       794

    accuracy                           0.70      1324
   macro avg       0.69      0.68      0.68      1324
weighted avg       0.70      0.70      0.70      1324

[[310 220]
 [177 617]]


In [36]:
#with max depth technique we got f1 score 0=61% 1=76%

In [40]:
#second pruning technique: min_samples_leaf
#min_samples_leaf >=45 and<=100
'''for i in range(45,101,1): #i=45 i=47,.....
 dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i)
 #by default gini index
 print('min_samples_leaf :',i)
 #call function
 dt2=create_model(dt2)'''

"for i in range(45,101,1): #i=45 i=47,.....\n dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i)\n #by default gini index\n print('min_samples_leaf :',i)\n #call function\n dt2=create_model(dt2)"

In [41]:
dt2=DecisionTreeClassifier(random_state=1,min_samples_leaf=98)
dt2=create_model(dt2)

              precision    recall  f1-score   support

           0       0.67      0.56      0.61       530
           1       0.74      0.82      0.78       794

    accuracy                           0.72      1324
   macro avg       0.70      0.69      0.69      1324
weighted avg       0.71      0.72      0.71      1324

[[296 234]
 [143 651]]


In [42]:
#with min_samples_leaf technique we got f1 score 0=61% 1=78%

# Entropy method

In [43]:
dte=DecisionTreeClassifier(random_state=1,criterion='entropy')
dte=create_model(dte)

              precision    recall  f1-score   support

           0       0.58      0.62      0.60       530
           1       0.73      0.69      0.71       794

    accuracy                           0.67      1324
   macro avg       0.65      0.66      0.66      1324
weighted avg       0.67      0.67      0.67      1324

[[330 200]
 [243 551]]


In [44]:
#we got f1 score of 0=60% & 1=71%

In [45]:
#to improve our f1 score we apply pruning techniques
#1)max_depth technique
#2)min_samples_leaf technique

In [53]:
'''for i in range(1,9):
    dte1=DecisionTreeClassifier(random_state=1,max_depth=i,criterion='entropy')
    print('max_depth:',i)
    dte1=create_model(dte1)'''

"for i in range(1,9):\n    dte1=DecisionTreeClassifier(random_state=1,max_depth=i,criterion='entropy')\n    print('max_depth:',i)\n    dte1=create_model(dte1)"

In [49]:
dte1=DecisionTreeClassifier(random_state=1,max_depth=5,criterion='entropy')
dte1=create_model(dte1)

              precision    recall  f1-score   support

           0       0.69      0.54      0.61       530
           1       0.73      0.83      0.78       794

    accuracy                           0.72      1324
   macro avg       0.71      0.69      0.69      1324
weighted avg       0.71      0.72      0.71      1324

[[288 242]
 [132 662]]


In [50]:
#with max depth technique we got an improvement in f1 score (from 0=60% 1=71%) to 0=61% 1=78%

In [55]:
#second pruning technique: min_samples_leaf
#min_samples_leaf >=45 and<=100
'''for i in range(45,101,1): #i=45 i=47,.....
 dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i,criterion='entropy')
 #by default gini index
 print('min_samples_leaf :',i)
 #call function
 dte2=create_model(dte2)'''

"for i in range(45,101,1): #i=45 i=47,.....\n dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=i,criterion='entropy')\n #by default gini index\n print('min_samples_leaf :',i)\n #call function\n dte2=create_model(dte2)"

In [54]:
dte2=DecisionTreeClassifier(random_state=1,min_samples_leaf=45,criterion='entropy')
dte2=create_model(dte2)

              precision    recall  f1-score   support

           0       0.65      0.60      0.63       530
           1       0.75      0.79      0.77       794

    accuracy                           0.71      1324
   macro avg       0.70      0.69      0.70      1324
weighted avg       0.71      0.71      0.71      1324

[[320 210]
 [170 624]]


# Using RandomForest

In [57]:
#create model using Random Forest
from sklearn.ensemble import RandomForestClassifier

In [62]:
''''for i in range(10,101):
    rfc=RandomForestClassifier(n_estimators=i,random_state=1)
    print('no. of decision tree:',i)
    rfc=create_model(rfc)'''

"'for i in range(10,101):\n    rfc=RandomForestClassifier(n_estimators=i,random_state=1)\n    print('no. of decision tree:',i)\n    rfc=create_model(rfc)"

In [61]:
#we choose no_decisiontree=25(n_estimator)
rfc=RandomForestClassifier(random_state=1,n_estimators=25)
rfc=create_model(rfc)

              precision    recall  f1-score   support

           0       0.70      0.63      0.66       530
           1       0.77      0.82      0.79       794

    accuracy                           0.74      1324
   macro avg       0.73      0.72      0.73      1324
weighted avg       0.74      0.74      0.74      1324

[[333 197]
 [144 650]]


In [63]:
#with Random Forest we got f1 score of 0=66% and 1=79%

In [64]:
#check feature importance of the fitted random forest
#(feature_importances_ is the impurity-based importance; the column is still labelled 'IG')
importance_data={'Input':x.columns,'IG':rfc.feature_importances_}  # renamed from `dict` to stop shadowing the builtin
df1=pd.DataFrame(importance_data)
#sorting in descending order
df1.sort_values('IG',ascending=False ,ignore_index=True)

Unnamed: 0,Input,IG
0,THROMBOCYTE,0.2231
1,LEUCOCYTE,0.120298
2,HAEMATOCRIT,0.117978
3,ERYTHROCYTE,0.108363
4,AGE,0.094434
5,HAEMOGLOBINS,0.0886
6,MCV,0.080516
7,MCHC,0.076897
8,MCH,0.072695
9,SEX,0.01712


In [71]:
#apply pruning technique 1:max_depth method
'''for i in range (1,9):
     rfc1=RandomForestClassifier(n_estimators=25,random_state=1,max_depth=i)
     print('max depth',i)
     rfc1=create_model(rfc1)'''

"for i in range (1,9):\n     rfc1=RandomForestClassifier(n_estimators=25,random_state=1,max_depth=i)\n     print('max depth',i)\n     rfc1=create_model(rfc1)"

In [67]:
rfc1=RandomForestClassifier(n_estimators=25,random_state=1,max_depth=8)
rfc1=create_model(rfc1)

              precision    recall  f1-score   support

           0       0.69      0.59      0.63       530
           1       0.75      0.82      0.78       794

    accuracy                           0.73      1324
   macro avg       0.72      0.71      0.71      1324
weighted avg       0.72      0.73      0.72      1324

[[313 217]
 [143 651]]


In [68]:
#we got f1 score of 0=63% and 1=78%

In [74]:
#apply second pruning technique ie. min_sample_leaf(>=45 and<=100)
'''for i in range (45,100):
     rfc2=RandomForestClassifier(n_estimators=25,random_state=1,min_samples_leaf=i)
     print('minsampeles_leaf',i)
     rfc2=create_model(rfc2)'''

"for i in range (45,100):\n     rfc2=RandomForestClassifier(n_estimators=25,random_state=1,min_samples_leaf=i)\n     print('minsampeles_leaf',i)\n     rfc2=create_model(rfc2)"

In [73]:
rfc2=RandomForestClassifier(n_estimators=25,random_state=1,min_samples_leaf=49)
rfc2=create_model(rfc2)

              precision    recall  f1-score   support

           0       0.66      0.60      0.63       530
           1       0.75      0.79      0.77       794

    accuracy                           0.71      1324
   macro avg       0.70      0.70      0.70      1324
weighted avg       0.71      0.71      0.71      1324

[[318 212]
 [166 628]]


In [75]:
#we got f1 score of 0=63% and 1=77%

# Using Adaboost Technique

In [76]:
# create model using Adaboosting technique
from sklearn.ensemble import AdaBoostClassifier

In [112]:
'''for i in range(1,11):#as we have 10 input varaiables
    ada=AdaBoostClassifier(n_estimators=i,random_state=1)
    print('no. of decions stump',i)
    ada=create_model(ada)'''

"for i in range(1,11):#as we have 10 input varaiables\n    ada=AdaBoostClassifier(n_estimators=i,random_state=1)\n    print('no. of decions stump',i)\n    ada=create_model(ada)"

In [80]:
ada=AdaBoostClassifier(n_estimators=10,random_state=1)
ada=create_model(ada)

              precision    recall  f1-score   support

           0       0.69      0.58      0.63       530
           1       0.75      0.83      0.78       794

    accuracy                           0.73      1324
   macro avg       0.72      0.70      0.71      1324
weighted avg       0.72      0.73      0.72      1324

[[307 223]
 [138 656]]


In [82]:
#with Adaboost we got f1 score of  0=63% and 1=78%

# Using Gradient Boost

In [84]:
from sklearn.ensemble import  GradientBoostingClassifier

In [87]:
'''for i in range(10,101):
    #create the object of gradientboostclassier class
    gbc= GradientBoostingClassifier(n_estimators=i,random_state=1)
    print('No of estimators :',i)
    #call function
    gbc=create_model(gbc)'''

"for i in range(10,101):\n    #create the object of gradientboostclassier class\n    gbc= GradientBoostingClassifier(n_estimators=i,random_state=1)\n    print('No of estimators :',i)\n    #call function\n    gbc=create_model(gbc)"

In [86]:
gbc= GradientBoostingClassifier(n_estimators=65,random_state=1)
gbc=create_model(gbc)

              precision    recall  f1-score   support

           0       0.70      0.62      0.66       530
           1       0.76      0.82      0.79       794

    accuracy                           0.74      1324
   macro avg       0.73      0.72      0.72      1324
weighted avg       0.74      0.74      0.74      1324

[[326 204]
 [139 655]]


In [88]:
#with gradient boosting we got f1 score of 0=66% and 1=79%

# Using Extreme Gradient Boost (XGBoost)

In [111]:
from xgboost import XGBClassifier
# disabled search over n_estimators; kept as a string literal so it does not run.
# The loop must evaluate the XGBoost model it just built (xgc), not the
# gradient-boosting model (gbc).
'''for i in range(10,101):
 #create the object of XGBclassier class
 xgc= XGBClassifier(n_estimators=i,random_state=1,reg_alpha=1)
 print('No of estimators :',i)
 #call function
 xgc=create_model(xgc)'''

"for i in range(10,101):\n #create the object of XGBclassier class\n xgc= XGBClassifier(n_estimators=i,random_state=1,reg_alpha=1)\n print('No of estimators :',i)\n #call function\n xgc=create_model(gbc)"

In [91]:
xgc= XGBClassifier(n_estimators=10,random_state=1,reg_alpha=1)
xgc=create_model(xgc)



              precision    recall  f1-score   support

           0       0.70      0.60      0.65       530
           1       0.76      0.82      0.79       794

    accuracy                           0.74      1324
   macro avg       0.73      0.71      0.72      1324
weighted avg       0.73      0.74      0.73      1324

[[320 210]
 [139 655]]


In [92]:
# using extreme gradient boosting (XGBoost) we got f1 score of 0=65% and 1=79%

# Using KNN method

In [93]:
from sklearn.neighbors import KNeighborsClassifier
#create object of KNeighborsClassifier
knc=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
#p=2 means Euclidean distance method

In [94]:
#call function
knc=create_model(knc)

              precision    recall  f1-score   support

           0       0.67      0.62      0.64       530
           1       0.76      0.80      0.78       794

    accuracy                           0.73      1324
   macro avg       0.71      0.71      0.71      1324
weighted avg       0.72      0.73      0.72      1324

[[328 202]
 [161 633]]


In [95]:
# using KNN we got f1 score of 0=64% and 1=78%

# Using SVM algorithm

In [96]:
from sklearn.svm import LinearSVC
# Create object of linearSVM class
svc= LinearSVC(random_state=1)
# we have not passed C, so LinearSVC uses its default C=1.0 (default regularization; this is the baseline linear boundary, not a true hard margin)
# we call the function:
svc = create_model(svc)

              precision    recall  f1-score   support

           0       0.67      0.51      0.58       530
           1       0.72      0.84      0.77       794

    accuracy                           0.70      1324
   macro avg       0.70      0.67      0.67      1324
weighted avg       0.70      0.70      0.69      1324

[[268 262]
 [130 664]]




In [103]:
#suppose outliers in our dataset cause the model to overfit
#then we reduce the overfitting by allowing some error (margin violations) at training time
#i.e. create the LinearSVC object again and pass the parameter 'C', which controls the penalty on that error
# C must be positive; values below the default of 1 give stronger regularization (a softer margin)
svc1=LinearSVC(random_state=1,C=0.9)# soft margine
svc1=create_model(svc1)

              precision    recall  f1-score   support

           0       0.67      0.51      0.58       530
           1       0.72      0.84      0.77       794

    accuracy                           0.70      1324
   macro avg       0.70      0.67      0.67      1324
weighted avg       0.70      0.70      0.69      1324

[[268 262]
 [130 664]]




In [104]:
# even after changing the value of 'C' (1.0 -> 0.9) we got the same output
#this suggests the data may not be linearly separable, so we try non-linear kernels next

In [105]:
#polynomial kernal function
from sklearn.svm import SVC
poly_svc=SVC(random_state=1,kernel='poly')
poly_svc=create_model(poly_svc)

              precision    recall  f1-score   support

           0       0.70      0.49      0.58       530
           1       0.72      0.86      0.78       794

    accuracy                           0.71      1324
   macro avg       0.71      0.67      0.68      1324
weighted avg       0.71      0.71      0.70      1324

[[258 272]
 [109 685]]


In [107]:
#using radial function
radial_svc=SVC(random_state=1,kernel='rbf')
radial_svc=create_model(radial_svc)

              precision    recall  f1-score   support

           0       0.73      0.58      0.65       530
           1       0.75      0.86      0.80       794

    accuracy                           0.75      1324
   macro avg       0.74      0.72      0.72      1324
weighted avg       0.74      0.75      0.74      1324

[[307 223]
 [113 681]]


In [108]:
# using SVM(radial svc) we got f1 score of 0=65% and 1=80%

# Using Naive Bayes Theorem

In [109]:
from sklearn.naive_bayes import GaussianNB
# Creating the object of Gaussian Class
gnb = GaussianNB()
# Calling the function
gnb = create_model(gnb)

              precision    recall  f1-score   support

           0       0.63      0.51      0.56       530
           1       0.71      0.80      0.75       794

    accuracy                           0.68      1324
   macro avg       0.67      0.65      0.66      1324
weighted avg       0.68      0.68      0.68      1324

[[269 261]
 [157 637]]


In [110]:
#with naive bayes we got f1 score of 0=56% and 1=75%

# Conclusion:
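We found that for this dataset the SVM (radial/RBF kernel SVC) algorithm gives the best accuracy (75%) and the best class-1 f1 score, i.e. 0=65% 1=80%; Random Forest and Gradient Boosting are marginally better on class 0 (f1 0=66%, 1=79%).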