In [30]:
# import basic modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
# NOTE(review): with no category/module arguments this filter hides every
# warning for the rest of the session, including library deprecation
# notices -- consider narrowing it once the notebook is stable.

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# read the dataset CSV into a DataFrame
# NOTE(review): absolute, machine-specific path; the file name may be a
# misspelling of "bloodtransfusion" -- confirm against the file on disk.
csv_path = 'D:/Study/DataScience/Data/bloodtrabsfusion.csv'
df = pd.read_csv(csv_path)

# preview the first rows as the cell output
df.head()

In [16]:
from sklearn.model_selection import train_test_split

# the last column of df is the target; every other column is a feature
y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values

# hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [46]:
# Baseline comparison: fit each candidate classifier inside the same
# scale -> impute pipeline and print its score on the held-out test set.
# GridSearchCV is only imported in this cell; no hyper-parameter search
# happens in the loop below.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Candidate models. The random forest is seeded so its score is repeatable
# from run to run instead of changing with every execution of the cell.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="rbf", C=0.025, probability=True),
    RandomForestClassifier(random_state=0),
    XGBClassifier()
    ]

for classifier in classifiers:
    # NOTE(review): scaling runs before median imputation here; the more
    # common order is impute-then-scale. Order kept unchanged on purpose.
    pipe = Pipeline([('scaler', StandardScaler()),('imputer', SimpleImputer(strategy = 'median')),
                     ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    # score is computed on the test split, never on the training data
    print("model score: %.3f" % pipe.score(X_test, y_test))
    print('\n')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
model score: 0.742


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
model score: 0.751


SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
model score: 0.729


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
      

In [33]:
# Tune XGBoost: exhaustive grid search with 5-fold cross-validation over
# tree depth, learning rate and number of estimators (3 * 3 * 4 = 36
# combinations), fitted on the training split only.

clf = XGBClassifier()

param_grid = {'max_depth' : [2,3,4],
             'learning_rate' : [0.01, 0.1, 1],
             'n_estimators' : [50, 75, 100, 125]}

# Pass clf as the base estimator instead of constructing a second, identical
# XGBClassifier inline. GridSearchCV works on clones of the estimator it is
# given, so clf itself is left unfitted and the search result is unchanged.
grid = GridSearchCV(clf, param_grid = param_grid, cv = 5)

# last expression of the cell: runs the search and displays the fitted object
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 4], 'learning_rate': [0.01, 0.1, 1], 'n_estimators': [50, 75, 100, 125]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
# parameter combination selected by the fitted grid search
grid.best_params_

{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 50}

In [35]:
# score of the fitted search object on the held-out test split
# (the search was built without a custom scoring argument)
grid.score(X_test, y_test)

0.7911111111111111

In [37]:
# Final model: scaler -> median imputer -> XGBoost configured with the
# grid-search winner. The hyper-parameters are read from grid.best_params_
# rather than re-typed by hand, so this pipeline cannot drift from the search
# result if the grid or the data changes. Requires `grid` to be fitted first.
pipe1 = Pipeline([('scaler', StandardScaler()),('imputer', SimpleImputer(strategy = 'median')),
                     ('classifier', XGBClassifier(**grid.best_params_))])

# last expression of the cell: fits the pipeline on the training split
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('classifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1,...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [38]:
# score of the fitted pipeline on the held-out test split
pipe1.score(X_test, y_test)

0.7911111111111111

In [43]:
from sklearn.metrics import classification_report

# predict the held-out rows with the fitted pipeline, then print the
# per-class precision / recall / F1 table
y_pred = pipe1.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.82      0.91      0.86       164
           2       0.67      0.46      0.54        61

   micro avg       0.79      0.79      0.79       225
   macro avg       0.74      0.69      0.70       225
weighted avg       0.78      0.79      0.78       225

