In [23]:
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier 
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from scipy.stats import randint

np.random.seed(42)

# Data

In [5]:
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
# print(mnist.DESCR)

X = mnist["data"]
y = mnist["target"].astype(np.uint8)


In [None]:
# Dela upp datan
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, random_state=42)

# Skapa och träna StandardScaler på träningsdatan
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Skala testdatan med samma StandardScaler-objekt
X_test = scaler.transform(X_test)

# Modelling

In [14]:
rf = RandomForestClassifier(random_state=42)

param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=3, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

best_rf = grid_search_rf.best_estimator_

best_score = grid_search_rf.best_score_
print(f"Bästa score: {best_score}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Bästa score: 0.9643200487724316


In [13]:
# Definiera parameterutrymme
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 10)
}

# Skapa modell
rf = RandomForestClassifier(random_state=42)

# Skapa RandomizedSearchCV
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50, cv=3, random_state=42, n_jobs=-1)

# Träna modellen
random_search.fit(X_train, y_train)

# Hämta bästa estimator
best_rf = random_search.best_estimator_

# Hämta bästa score
best_score = random_search.best_score_

# Printa ut bästa score
print(f"Bästa score: {best_score}")

Bästa score: 0.9638000459719595


In [17]:
et = ExtraTreesClassifier(random_state=42)

param_grid_et = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search_et = GridSearchCV(et, param_grid_et, cv=3, n_jobs=-1, verbose=1)
grid_search_et.fit(X_train, y_train)

best_et = grid_search_et.best_estimator_

best_score = grid_search_et.best_score_
print(f"Bästa score: {best_score}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Bästa score: 0.9660800507738795


In [18]:
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

param_grid_xgb = {
    'n_estimators': [50, 100],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1]
}

grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=3, n_jobs=-1, verbose=1)
grid_search_xgb.fit(X_train, y_train)

best_xgb = grid_search_xgb.best_estimator_

best_score = grid_search_xgb.best_score_
print(f"Bästa score: {best_score}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.



Bästa score: 0.9643000455723515


In [19]:
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('et', best_et),
    ('xgb', best_xgb)
], voting='soft')

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

print("Voting Classifier Results:")
print(classification_report(y_test, y_pred_voting))
print(confusion_matrix(y_test, y_pred_voting))

Parameters: { "use_label_encoder" } are not used.



Voting Classifier Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       983
           1       0.98      0.99      0.98      1152
           2       0.96      0.98      0.97       967
           3       0.97      0.96      0.96      1034
           4       0.97      0.97      0.97       906
           5       0.98      0.97      0.97       937
           6       0.98      0.98      0.98       961
           7       0.97      0.97      0.97      1055
           8       0.96      0.95      0.96       969
           9       0.96      0.95      0.95      1036

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

[[ 969    0    2    0    1    1    2    1    6    1]
 [   0 1136    5    5    1    0    0    2    2    1]
 [   1    2  945    1    3    0    3    5    6    1]
 [   1    0   16  988    0    4    1    7    7   10]
 [

In [22]:
# # Välj den bästa modellen
# best_model = best_et

# # Träna om modellen på all träningsdata
# best_model.fit(X_train, y_train)

# # Prediktera på testdatan
# y_pred_final = best_model.predict(X_test)

# # Utvärdera den slutliga modellen
# print("Final Model Results:")
# print(classification_report(y_test, y_pred_final))
# print(confusion_matrix(y_test, y_pred_final))

# Använd VotingClassifier-resultaten
y_pred_final = y_pred_voting

# Utvärdera VotingClassifier-modellen
print("Final Voting Classifier Results:")
print(classification_report(y_test, y_pred_final))
print(confusion_matrix(y_test, y_pred_final))

Final Voting Classifier Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       983
           1       0.98      0.99      0.98      1152
           2       0.96      0.98      0.97       967
           3       0.97      0.96      0.96      1034
           4       0.97      0.97      0.97       906
           5       0.98      0.97      0.97       937
           6       0.98      0.98      0.98       961
           7       0.97      0.97      0.97      1055
           8       0.96      0.95      0.96       969
           9       0.96      0.95      0.95      1036

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

[[ 969    0    2    0    1    1    2    1    6    1]
 [   0 1136    5    5    1    0    0    2    2    1]
 [   1    2  945    1    3    0    3    5    6    1]
 [   1    0   16  988    0    4    1    7    7   

In [25]:
# Spara VotingClassifier-modellen
joblib.dump(voting_clf, 'mnist_final_voting_classifier_v1.joblib')

['mnist_final_voting_classifier_v1.joblib']