In [50]:
import pandas as pd
import os

from sklearn.metrics import accuracy_score
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Single seed shared by the train/test split and every estimator below,
# so that repeated runs of the notebook give the same results.
RANDOM_STATE: int = 42

# Loading data

In [51]:
# Read the preprocessed dataset from disk.
df = pd.read_csv("../data/processed/data.csv")

# PATHOLOGY is the label column; every other column is used as a feature.
y = df['PATHOLOGY']
X = df.drop(columns=['PATHOLOGY'])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Logistic regression

In [64]:
# Candidate values of the regularization parameter C to compare.
parameters = {'C': [0.01, 0.1, 1, 10]}

# Wrap the base classifier in a grid search (default cross-validation and
# scoring settings) and fit it on the training split only.
logreg_model = GridSearchCV(
    estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    param_grid=parameters,
)
logreg_model.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate;
# it is reported here under the label "Accuracy".
print("Best param :", logreg_model.best_params_)
print("Accuracy :", logreg_model.best_score_)

# Author's note: 25 minutes (presumably the wall-clock time of this search).

Best param : {'C': 1}
Accuracy : 0.988988601633593


In [67]:
# Train the final logistic regression on the training split with C=1,
# the value the grid search above reported as best.
logreg_model = LogisticRegression(
    C=1,
    max_iter=1000,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Evaluate on the held-out test split; the bare last expression is what the
# notebook displays as this cell's output.
pred = logreg_model.predict(X_test)
accuracy_score(y_test, pred)

0.9888633973807476

In [68]:
# Serialize the fitted logistic regression to disk with joblib.
# The trailing semicolon keeps the notebook from echoing dump()'s return value.
dump(
    logreg_model,
    "../models/logreg_model.joblib",
);

In [76]:
# Report the on-disk size of the saved logistic regression:
# bytes are divided by 1024 * 1024 and rounded to three decimals.
size = os.path.getsize("../models/logreg_model.joblib") / (1024 * 1024)
print("File size:", round(size, 3), "mb")

File size: 0.083 mb


# Decision tree

In [53]:
# Coarse sweep over the maximum tree depth, from 20 to 140 in steps of 20.
parameters = {'max_depth': [20, 40, 60, 80, 100, 120, 140]}

# Grid search (default cross-validation and scoring settings) over a seeded
# decision tree, fitted on the training split only.
tree_model = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid=parameters,
)
tree_model.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("Best param :", tree_model.best_params_)
print("Accuracy :", tree_model.best_score_)

Best param : {'max_depth': 100}
Accuracy : 0.9792079932022191


In [54]:
# Finer sweep in steps of 5 around max_depth=100, the best value reported
# by the coarse search.
parameters = {'max_depth': [85, 90, 95, 100, 105, 110]}

# Same setup as the coarse search: a seeded decision tree inside a grid
# search, fitted on the training split only.
tree_model = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid=parameters,
)
tree_model.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("Best param :", tree_model.best_params_)
print("Accuracy :", tree_model.best_score_)

Best param : {'max_depth': 85}
Accuracy : 0.9792677371395623


In [55]:
# Unit-step sweep over depths 96..99.
# NOTE(review): the previous search reported max_depth=85 as its best value,
# and 85 is not part of this grid - confirm that this range is intended.
parameters = {'max_depth': [96, 97, 98, 99]}

# Seeded decision tree inside a grid search, fitted on the training split only.
tree_model = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid=parameters,
)
tree_model.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate.
print("Best param :", tree_model.best_params_)
print("Accuracy :", tree_model.best_score_)

Best param : {'max_depth': 96}
Accuracy : 0.9792079932022191


In [63]:
# Train the final decision tree on the training split with max_depth=96,
# the value selected by the last grid search above.
tree_model = DecisionTreeClassifier(
    max_depth=96,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Evaluate on the held-out test split; the bare last expression is what the
# notebook displays as this cell's output.
pred = tree_model.predict(X_test)
accuracy_score(y_test, pred)

0.9780135742280852

In [58]:
# Serialize the fitted decision tree to disk with joblib.
# The trailing semicolon keeps the notebook from echoing dump()'s return value.
dump(
    tree_model,
    "../models/tree_model.joblib",
);

In [75]:
# Report the on-disk size of the saved decision tree:
# bytes are divided by 1024 * 1024 and rounded to three decimals.
size = os.path.getsize("../models/tree_model.joblib") / (1024 * 1024)
print("File size:", round(size, 3), "mb")

File size: 3.04 mb


# Random Forest

In [None]:
# Hyperparameter grid for the random forest: every combination of tree count
# (n_estimators) and maximum tree depth (max_depth) listed here is evaluated.
parameters = {
    'n_estimators': [70, 80, 90, 100],
    'max_depth': [60, 70, 80, 90],
}

# Grid search (default cross-validation and scoring settings) over a seeded
# random forest, fitted on the training split only.
forest_model = RandomForestClassifier(random_state=RANDOM_STATE)
forest_model = GridSearchCV(forest_model, parameters)
forest_model.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate;
# it is reported here under the label "Accuracy".
print("Best param :", forest_model.best_params_)
print("Accuracy :", forest_model.best_score_)

Best param : {'max_depth': 80, 'n_estimators': 80}
Accuracy : 0.9836942059103213


In [None]:
# Train the final random forest on the training split with the settings the
# grid search above reported as best (max_depth=80, n_estimators=80).
forest_model = RandomForestClassifier(
    n_estimators=80,
    max_depth=80,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Evaluate on the held-out test split; the bare last expression is what the
# notebook displays as this cell's output.
pred = forest_model.predict(X_test)
accuracy_score(y_test, pred)

0.9827926837040342

In [None]:
# Serialize the fitted random forest to disk with joblib.
# The trailing semicolon keeps the notebook from echoing dump()'s return value.
dump(
    forest_model,
    "../models/forest_model.joblib",
);

In [74]:
# Report the on-disk size of the saved random forest:
# bytes are divided by 1024 * 1024 and rounded to three decimals.
size = os.path.getsize("../models/forest_model.joblib") / (1024 * 1024)
print("File size:", round(size, 3), "mb")

File size: 320.455 mb


# Итог

Логистическая регрессия: точность 98%, вес < 1 мегабайт

Решающие деревья: точность 97%, вес 3 мегабайта

Случайный лес: точность 98%, вес ≈ 320 мегабайт

Мой выбор — логистическая регрессия: у неё наибольшая точность на тестовой выборке и наименьший размер файла модели.