# Classification

TODO: riguardare i testi indicati con il tag TEMP<br>
TODO: rimuovere i testi con il tag REM

In [None]:
import pandas as pd
import numpy as np

# Metrics and model evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

# Classifiers
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
# Route DataFrame/Series `.plot()` calls through plotly and use the seaborn
# template as the default theme for plotly figures.
pd.set_option('plotting.backend', 'plotly')
pio.templates.default = "seaborn"

# Show every column when a DataFrame is displayed (no column limit).
pd.set_option('display.max_columns', None)

## Utility functions
- STANDARDIZZARE 
  - classification report
  - ROC CURVE
  - LEARNING CURVE
  - MODELLI
  - METHOD EVALUATION WITH OVERSAMPLING

In [None]:
def report_scores(test_label, test_pred, target_names=None):
  """Print the per-class precision, recall, F1 and support table.

  Args:
    test_label: ground-truth labels.
    test_pred: labels predicted by a classifier for the same rows as
      `test_label`.
    target_names: display names of the classes, in label order. When omitted,
      the notebook-level `classes` list is used, which is what this function
      always did before the parameter existed.
  """
  if target_names is None:
    # Backward-compatible fallback. `classes` is a global defined in a later
    # cell, so it only has to exist by the time this function is called.
    target_names = classes
  # zero_division=0 reports undefined metrics as 0 instead of emitting a warning.
  print(classification_report(test_label, test_pred, target_names=target_names, zero_division=0))

def plot_confusion_matrix(cm, classes, normalize=False):
  """Show a confusion matrix as an annotated plotly heatmap.

  Args:
    cm: square matrix with the real classes on the rows and the predicted
      classes on the columns, both ordered like `classes`.
    classes: display names of the classes, in the same order as the rows and
      columns of `cm`.
    normalize: when True, every row is divided by its total so that each cell
      is the fraction of that real class; rows with no samples stay at 0.
  """
  cm = np.asarray(cm)
  if normalize:
    # Divide row-wise; skip empty rows so they show 0 instead of NaN (0/0).
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    print("Normalized confusion matrix")
  else:
    print('Confusion matrix, without normalization')

  # The heatmap draws the first row of `z` at the bottom of the y axis, so the
  # rows are flipped to get the first class at the top. The y labels must be
  # flipped together with the rows, otherwise every row is shown under the
  # wrong class name. Reversing (instead of indexing with [1, 0]) also works
  # for any number of classes.
  class_names = list(classes)
  fig = ff.create_annotated_heatmap(cm[::-1], x=class_names, y=class_names[::-1], colorscale='Blues', showscale=True)
  fig.update_layout(xaxis = dict(title='Predicted value'), yaxis = dict(title='Real value'))
  fig.show()

# Registry of fitted classifiers keyed by display name: each model cell adds its
# estimator here and the ROC comparison iterates over the entries.
models = {}

## Pre-processing

In [None]:
# Load the players dataset; the first CSV column is used as the DataFrame index.
df_players_complete = pd.read_csv("./datasets/players.csv", index_col=0)

### Features choice

In [None]:
# For now, the same attributes that were used for clustering (REM).
# Caveat: mean_rank_points, used below to derive the label, was already among
# the features used for clustering.
# .copy() makes df_players an independent frame: the following cells add and
# drop columns on it, which on a bare column slice of df_players_complete
# triggers pandas' chained-assignment (SettingWithCopy) warnings.
df_players = df_players_complete[['max_tourney_revenue', 'lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points']].copy()

### Label computation

In [None]:
# Binary label: with the median as the only bin edge, np.digitize returns 0 for
# players whose mean_rank_points is below the median and 1 for those at or above it.
df_players['is_high_ranked'] = np.digitize(df_players['mean_rank_points'], bins=[df_players['mean_rank_points'].median()])

In [None]:
# Show the source attribute next to the label derived from it.
df_players[['mean_rank_points', 'is_high_ranked']]

In [None]:
# The label is computed from mean_rank_points, so that attribute is removed
# from the frame before it is used as the feature set.
df_players = df_players.drop(columns='mean_rank_points')
df_players

In [None]:
# Bar chart of how many players fall into each of the two label values.
df_players.is_high_ranked.value_counts().plot(kind='bar')

### Splitting

In [None]:
# Separate features and label before scaling, so the 0/1 label is never passed
# through the scaler. The index is reset to a plain RangeIndex, matching the
# frames this cell used to build from the scaler's ndarray output.
features = df_players.drop(columns=['is_high_ranked']).reset_index(drop=True)
labels = df_players['is_high_ranked'].reset_index(drop=True)

# Split first: the scaler must learn min/max from the training rows only,
# otherwise information about the test rows leaks into the training data.
raw_train, raw_test, train_label, test_label = train_test_split(
    features, labels, stratify=labels, test_size=0.3, random_state=0)

scaler = MinMaxScaler().fit(raw_train)
# Test values can fall slightly outside [0, 1]: they are scaled with the
# training range, exactly as unseen data would be.
train_set = pd.DataFrame(scaler.transform(raw_train), columns=features.columns, index=raw_train.index)
test_set = pd.DataFrame(scaler.transform(raw_test), columns=features.columns, index=raw_test.index)

# Whole dataset scaled with the training-fitted scaler; these three names are
# kept so that any other cell referring to them still finds them.
df_players_norm = pd.DataFrame(scaler.transform(features), columns=features.columns)
df_players_norm['is_high_ranked'] = labels
X = df_players_norm.drop(columns=['is_high_ranked'])
Y = df_players_norm['is_high_ranked']

## Classification

TODO: REM da questo punto in poi si tratta di copia e incolla adattati (dal progetto di Pasquali ecc.)

### Decision Tree

In [None]:
# Exhaustive search with 5-fold cross-validation over the split criterion and
# the maximum depth of the tree.
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4],
}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=5)
clf.fit(train_set, train_label)

# Every parameter combination with its mean cross-validated score, best first.
pd.DataFrame(clf.cv_results_).sort_values('mean_test_score', ascending=False)[['params', 'mean_test_score']]

#TODO: REM clf.best_score_, clf.best_params_

In [None]:
# Fit one tree on the whole training split using the criterion and depth picked
# by the grid search. The two min_samples_* limits are fixed by hand here and
# were not part of the searched grid.
best_params = clf.best_params_
dt = tree.DecisionTreeClassifier(
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    splitter='best',
    min_samples_split=3,
    min_samples_leaf=8,
).fit(train_set, train_label)
models['Decision Tree'] = dt

In [None]:
from IPython.display import Image
import pydotplus

# Display names for the two label values, in label order (0, then 1).
classes = ['low_ranked', 'high_ranked']

# Export the fitted tree as Graphviz DOT text (out_file=None returns it as a
# string), then render it to PNG and show it inline.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=train_set.columns.tolist(),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

#TODO: problem, what is the black rectangle?

In [None]:
# Score the decision tree on the held-out test split.
test_pred_dt = dt.predict(test_set)
report_scores(test_label, test_pred_dt)

# Rows and columns follow the label order learned by the tree (dt.classes_).
cm = confusion_matrix(y_true=test_label, y_pred=test_pred_dt, labels=dt.classes_)
plot_confusion_matrix(cm, classes)

### KNN

In [None]:
# 5-nearest-neighbours classifier (ball-tree index, Minkowski distance) fitted
# on the training split and registered for the ROC comparison.
knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='minkowski').fit(train_set, train_label)
models['KNN'] = knn

test_pred_knn = knn.predict(test_set)
report_scores(test_label, test_pred_knn)
# Use the label order learned by this model; the cell previously borrowed
# dt.classes_ from the decision tree, tying the two cells together by accident.
cm = confusion_matrix(test_label, test_pred_knn, labels=knn.classes_)
plot_confusion_matrix(cm, classes=classes)

In [None]:
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

# Fit on the training split only. Fitting on the test split and then scoring on
# the same rows would report an optimistic, meaningless AUC.
model = LogisticRegression()
model.fit(train_set, train_label)
# Add the model to the shared registry instead of resetting `models` to a list:
# the reset discarded the models registered so far and broke the
# `models.items()` loop of the ROC comparison.
models['Logistic Regression'] = model

# Probability assigned to the positive class (second column of predict_proba).
y_score = model.predict_proba(test_set)[:, 1]

fpr, tpr, thresholds = roc_curve(test_label, y_score)

fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
# Dashed diagonal: the curve of a classifier that guesses at random.
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

# Same scale on both axes so the diagonal is drawn at 45 degrees.
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

## Comparison (ROC)

In [None]:
# One ROC curve per registered model, all drawn on the same axes.
fig = go.Figure()
# Dashed diagonal: the curve of a classifier that guesses at random.
fig.add_shape(
    type="line",
    x0=0, y0=0, x1=1, y1=1,
    line=dict(color="RoyalBlue", width=3, dash="dash"),
)

for model_name, model in models.items():
    # Probability assigned to the positive class (second column of predict_proba).
    y_score = model.predict_proba(test_set)[:, 1]
    fpr, tpr, thresholds = roc_curve(test_label, y_score)
    auc_score = roc_auc_score(test_label, y_score)
    name = f"{model_name} (AUC={auc_score:.3f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

# Equal scale on both axes so the diagonal is drawn at 45 degrees.
fig.update_layout(
    xaxis=dict(title='False Positive Rate', constrain='domain'),
    yaxis=dict(title='True Positive Rate', scaleanchor="x", scaleratio=1),
    width=700,
    height=500,
)
fig.show()