In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer # tri druhy skalovani
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold # pouziti cross validace
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score
import pickle

In [6]:
data = pd.read_csv("../data/final_dataset2.csv")
y = data['art_annotation']

X = data.drop('art_annotation', axis=1).drop('icp_annotation', axis=1).drop('StartTime', axis=1).drop('EndTime', axis=1)

X_train=X.iloc[:6000]
X_test=X.iloc[6000:]
y_train=y.iloc[:6000]
y_test=y.iloc[6000:]

In [7]:
pipe_mm_knn5 = Pipeline([('scaler', MinMaxScaler()),('classifier', KNeighborsClassifier(n_neighbors=5))])
pipe_mm_xcb = Pipeline([('scaler', MinMaxScaler()),('classifier', xgb.XGBClassifier())]) 
pipes = {"pipe_mm_XGB":pipe_mm_xcb,"pipe_mm_knn5":pipe_mm_knn5}

In [9]:
# pro kazdou rouru si budeme drzet vysledky
results = { pipe_name: [] for pipe_name in pipes.keys()}
# trenovaci mnozinu budeme delit dale na 5 podmnozin
kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(X_train, y_train): # vraci dvojici poli testovacich a trenovacich indexu
  # rozdel si data na trenovaci a na data, na kterych bude ohodnocen klasifikator
  X_fold_tr = X_train.values[train_index]
  y_fold_tr = y_train[train_index]
  X_fold_test = X_train.values[test_index]
  y_fold_test = y_train[test_index]
  for k, pipe in pipes.items(): # pro kazdou pipe, delej
    pipe.fit(X_fold_tr, y_fold_tr) # nauc
    results[k].append(pipe.score(X_fold_test,y_fold_test)) # uloz si accuracy
    print(train_index,k)
#udelej si dataframe pro zhodnoceni
results = pd.DataFrame(data = results)
print(results) # tiskni za jednoltive foldy uspesnost klasifikatoru
print(results.mean()) # tiskni prumer

f1_results = {}

# Add the classification report and F1-score
for k, pipe in pipes.items():
    pipe.fit(X_train, y_train)
    y_pred_test = pipe.predict(X_test)
    f1 = f1_score(y_test, y_pred_test)  # You can change 'weighted' based on your requirement
    f1_results[k] = f1  # Store the F1-score
    
    print(f"Classification Report for {k}")
    print(classification_report(y_test, y_pred_test))
    print(f"F1-Score for {k}: {f1}")

# Convert F1-score results to DataFrame for better visualization
f1_results_df = pd.DataFrame(list(f1_results.items()), columns=['Pipeline', 'F1-Score'])
print(f1_results_df)

  if is_sparse(data):


[   0    1    3 ... 5996 5998 5999] pipe_mm_XGB
[   0    1    3 ... 5996 5998 5999] pipe_mm_knn5


  if is_sparse(data):


[   0    1    2 ... 5995 5996 5997] pipe_mm_XGB
[   0    1    2 ... 5995 5996 5997] pipe_mm_knn5


  if is_sparse(data):


[   0    2    5 ... 5997 5998 5999] pipe_mm_XGB
[   0    2    5 ... 5997 5998 5999] pipe_mm_knn5


  if is_sparse(data):


[   0    1    2 ... 5997 5998 5999] pipe_mm_XGB
[   0    1    2 ... 5997 5998 5999] pipe_mm_knn5


  if is_sparse(data):


[   1    2    3 ... 5997 5998 5999] pipe_mm_XGB
[   1    2    3 ... 5997 5998 5999] pipe_mm_knn5
   pipe_mm_XGB  pipe_mm_knn5
0     0.973333      0.971667
1     0.964167      0.961667
2     0.979167      0.983333
3     0.966667      0.971667
4     0.971667      0.971667
pipe_mm_XGB     0.971
pipe_mm_knn5    0.972
dtype: float64


  if is_sparse(data):


Classification Report for pipe_mm_XGB
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      1525
           1       0.70      0.57      0.63       487

    accuracy                           0.84      2012
   macro avg       0.78      0.75      0.76      2012
weighted avg       0.83      0.84      0.83      2012

F1-Score for pipe_mm_XGB: 0.6274065685164214
Classification Report for pipe_mm_knn5
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1525
           1       0.90      0.59      0.71       487

    accuracy                           0.88      2012
   macro avg       0.89      0.78      0.82      2012
weighted avg       0.89      0.88      0.87      2012

F1-Score for pipe_mm_knn5: 0.7098381070983809
       Pipeline  F1-Score
0   pipe_mm_XGB  0.627407
1  pipe_mm_knn5  0.709838


In [13]:
for k, pipe in pipes.items():
    filename = f'finalized_model_{k}.sav'
    pickle.dump(pipe, open(filename, 'wb'))