In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import svm

In [2]:
df = pd.read_csv('data-ori.csv')

df_subset = df[df['SOURCE'] == 'out'].sample(1000)
df = df.drop(df_subset.index)

df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
5,34.3,11.6,4.53,6.6,185,25.6,33.8,75.7,1,M,out
6,31.1,8.7,5.06,11.1,416,17.2,28.0,61.5,1,F,out
7,40.3,13.3,4.73,8.1,257,28.1,33.0,85.2,1,F,out
11,54.0,16.6,7.61,10.0,88,21.8,30.7,71.0,1,F,in


In [3]:
df['SOURCE'].value_counts()

SOURCE
in     1784
out    1628
Name: count, dtype: int64

In [4]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
sex_encoding = pd.get_dummies(df['SEX'], dtype=float)
df = pd.concat([df, sex_encoding], axis=1)
df = df.drop('SEX', axis=1)
df

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SOURCE,F,M
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,out,1.0,0.0
5,34.3,11.6,4.53,6.6,185,25.6,33.8,75.7,1,out,0.0,1.0
6,31.1,8.7,5.06,11.1,416,17.2,28.0,61.5,1,out,1.0,0.0
7,40.3,13.3,4.73,8.1,257,28.1,33.0,85.2,1,out,1.0,0.0
11,54.0,16.6,7.61,10.0,88,21.8,30.7,71.0,1,in,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4406,33.7,10.4,3.48,11.4,112,29.9,30.9,96.8,92,in,1.0,0.0
4407,32.8,10.4,3.49,8.1,72,29.8,31.7,94.0,92,in,1.0,0.0
4408,33.7,10.8,3.67,6.7,70,29.4,32.0,91.8,92,in,1.0,0.0
4410,31.5,10.4,3.15,9.1,187,33.0,33.0,100.0,98,in,1.0,0.0


In [14]:
y = df['SOURCE']
X = df.drop('SOURCE', axis=1)

sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

y_test

3790    out
3182     in
3343    out
3276     in
1127    out
       ... 
3698     in
1333    out
2080    out
3643    out
2318     in
Name: SOURCE, Length: 683, dtype: object

In [15]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_train

array([1, 1, 1, ..., 0, 1, 1])

In [16]:
# Define classifiers
classifiers = {
    'LR': LogisticRegression(random_state=42),
    'DT': DecisionTreeClassifier(random_state=42),
    'RF': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': svm.SVC(random_state=42),
    'XGB': XGBClassifier(random_state=42)
}

# Define pipelines using a dictionary comprehension
pipelines = {name: Pipeline([('scl', StandardScaler()), (name, clf)]) for name, clf in classifiers.items()}

# Define parameter grids
param_range = [1, 2, 3, 4, 5, 6]
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = [50, 100, 150]
learning_rates = [0.1, 0.2, 0.3]

param_grids = {
    'LR': [{'LR__penalty': ['l1', 'l2'],
            'LR__C': param_range_fl,
            'LR__solver': ['liblinear']}],
    
    'DT': [{'DT__criterion': ['gini', 'entropy'],
            'DT__min_samples_leaf': param_range,
            'DT__max_depth': param_range,
            'DT__min_samples_split': param_range[1:]}],
    
    'RF': [{'RF__min_samples_leaf': param_range,
            'RF__max_depth': param_range,
            'RF__min_samples_split': param_range[1:]}],
    
    'KNN': [{'KNN__n_neighbors': param_range,
             'KNN__weights': ['uniform', 'distance'],
             'KNN__metric': ['euclidean', 'manhattan']}],
    
    'SVM': [{'SVM__kernel': ['linear', 'rbf'],
             'SVM__C': param_range}],
    
    'XGB': [{'XGB__learning_rate': learning_rates,
             'XGB__max_depth': [3, 4, 5],
             'XGB__min_child_weight': [1, 5, 10],
             'XGB__subsample': [0.6, 0.8, 1.0],
             'XGB__n_estimators': n_estimators}]
}

In [17]:
grids = []

# Example of looping over classifiers for grid search
for name, pipeline in pipelines.items():
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grids[name],
                               scoring='accuracy', # or any other scoring
                               cv=10)
    # Add your fit, scoring, etc. here
    grids.append(grid_search)

In [18]:
for pipe in grids:
    pipe.fit(X_train,y_train)

In [32]:
from sklearn.metrics import classification_report

for model in grids:
    print(model.estimator[1])
    y_pred = model.predict(X_test)
    print(model.best_params_)
    print(classification_report(y_test, y_pred))
    print()

LogisticRegression(random_state=42)
{'LR__C': 1.0, 'LR__penalty': 'l2', 'LR__solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       358
           1       0.69      0.65      0.67       325

    accuracy                           0.69       683
   macro avg       0.69      0.69      0.69       683
weighted avg       0.69      0.69      0.69       683


DecisionTreeClassifier(random_state=42)
{'DT__criterion': 'gini', 'DT__max_depth': 4, 'DT__min_samples_leaf': 6, 'DT__min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.75      0.71      0.73       358
           1       0.70      0.74      0.72       325

    accuracy                           0.73       683
   macro avg       0.73      0.73      0.73       683
weighted avg       0.73      0.73      0.73       683


RandomForestClassifier(random_state=42)
{'RF__max_depth': 6, 'RF__min_samples_leaf': 2, 'RF__min_samples