# Classification

TODO: review the texts marked with the TEMP tag<br>
TODO: remove the texts marked with the REM tag

In [None]:
import pandas as pd
import numpy as np

# Metrics and model evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

# Classifiers
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Decision tree visualization
from IPython.display import Image 
import pydotplus

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
# Plotting defaults: pandas `.plot` calls are rendered by plotly, and every
# plotly figure uses the "seaborn" template.
pio.templates.default = "seaborn"
pd.set_option("plotting.backend", "plotly")

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## Utility functions
- TODO: standardize
  - classification report
  - LEARNING CURVE
  - MODELLI
  - METHOD EVALUATION WITH OVERSAMPLING

In [None]:
# Registry of fitted classifiers: every model cell appends a
# (display name, estimator) tuple, and the ROC comparison iterates over it.
models: list = []

def report_scores(y_test, y_pred):
  """Print the per-class metrics table for predictions made on the test set.

  Parameters
  ----------
  y_test : array-like
    True labels of the test samples.
  y_pred : array-like
    Labels predicted for the same samples.

  The rows of the table are named after the notebook-level ``classes`` list;
  ``zero_division=0`` makes undefined metrics show up as 0.
  """
  summary = classification_report(
    y_test,
    y_pred,
    target_names=classes,
    zero_division=0,
    sample_weight=None,
  )
  print(summary)

def plot_confusion_matrix(cm, classes, normalize=False):
  """Show a confusion matrix as an annotated plotly heatmap.

  Parameters
  ----------
  cm : array-like of shape (n_classes, n_classes)
    Confusion matrix with real classes on the rows and predicted classes on
    the columns, both ordered like ``classes``.
  classes : sequence of str
    Display name of each class, used as tick labels on both axes.
  normalize : bool, default False
    When True every row is divided by its total, so each cell holds the
    fraction of the real class that received that prediction.
  """
  cm = np.asarray(cm)
  if normalize:
    row_totals = cm.sum(axis=1, keepdims=True)
    # A class with no real samples has an all-zero row: keep it at 0 instead
    # of producing NaN through a division by zero.
    cm = np.divide(
      cm.astype('float'),
      row_totals,
      out=np.zeros(cm.shape, dtype=float),
      where=row_totals != 0,
    )
    print("Normalized confusion matrix")
  else:
    print('Confusion matrix, without normalization')

  labels = list(classes)
  # Row i of the matrix is drawn at y = labels[i]. The rows are passed in
  # their original order so each one keeps its own label, and the y axis is
  # reversed so the first class appears at the top, whatever the class count.
  fig = ff.create_annotated_heatmap(cm, x=labels, y=labels, colorscale='Blues', showscale=True)
  fig.update_layout(
    xaxis=dict(title='Predicted value'),
    yaxis=dict(title='Real value', autorange='reversed'),
  )
  fig.show()

def report(y_test, y_pred, model_classes):
  """Display the confusion-matrix heatmap, then print the metrics table.

  Parameters
  ----------
  y_test : array-like
    True labels of the test samples.
  y_pred : array-like
    Labels predicted for the same samples.
  model_classes : array-like
    Label values fixing the row/column order of the confusion matrix
    (callers pass the estimator's ``classes_``).
  """
  matrix = confusion_matrix(y_test, y_pred, labels=model_classes)
  # Axis tick names come from the notebook-level `classes` list.
  plot_confusion_matrix(matrix, classes=classes)
  report_scores(y_test, y_pred)

## Pre-processing

In [None]:
# Load the players dataset; the first CSV column becomes the DataFrame index.
players_csv_path = "./datasets/players.csv"
df_players_complete = pd.read_csv(players_csv_path, index_col=0)

### Features choice

In [None]:
# For now, the same attributes that were used for clustering (REM).
# Issue: mean_rank_points, which is used to build the label, was already
# among the features used for clustering.
feature_columns = ['max_tourney_revenue', 'lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points']
# Take an explicit copy: this frame is modified afterwards, and modifying a
# column selection of df_players_complete raises SettingWithCopyWarning.
df_players = df_players_complete[feature_columns].copy()

### Label computation

In [None]:
classes = ['low_ranked', 'high_ranked']

# Binary label from the median of mean_rank_points: 0 (low_ranked) below the
# median, 1 (high_ranked) at or above it. The source column is then dropped so
# it cannot be used as a feature.
rank_threshold = df_players['mean_rank_points'].median()
# Build a new frame instead of assigning/dropping in place, so nothing is
# written into a selection of another DataFrame (no SettingWithCopyWarning).
df_players = (
  df_players
  .assign(is_high_ranked=np.digitize(df_players['mean_rank_points'], bins=[rank_threshold]))
  .drop(columns=['mean_rank_points'])
)
df_players

In [None]:
# Bar chart with the number of players in each class.
label_counts = df_players['is_high_ranked'].value_counts()
label_counts.plot(kind='bar')

### Splitting

In [None]:
label_column = 'is_high_ranked'
raw_features = df_players.drop(columns=[label_column])
# The label is kept out of the scaler: it is a class id, not a feature.
Y = df_players[label_column]

# Split first (stratified 70/30, fixed seed), then fit the scaler on the
# training rows only, so no statistic of the test set leaks into training.
raw_train, raw_test, y_train, y_test = train_test_split(
  raw_features, Y, stratify=Y, test_size=0.3, random_state=0
)
scaler = MinMaxScaler().fit(raw_train)


def _min_max_scaled(frame):
  """Return `frame` scaled with the training-set min/max, keeping its index and columns."""
  return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


x_train = _min_max_scaled(raw_train)
x_test = _min_max_scaled(raw_test)

# Full scaled dataset, kept under the names this cell has always defined.
X = _min_max_scaled(raw_features)
df_players_norm = X.assign(**{label_column: Y})

## Classification

### Decision Tree

In [None]:
# Hyper-parameter search: 5-fold cross-validation over split criterion and
# maximum depth. The split/leaf size limits are fixed on the searched
# estimator, so the configuration being scored is the one actually kept.
parameters = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4]}
clf = GridSearchCV(
  DecisionTreeClassifier(splitter='best', min_samples_split=3, min_samples_leaf=8),
  parameters,
  cv=5,
)
clf.fit(x_train, y_train)

# Ranking of the tried configurations, best mean validation score first.
cv_ranking = (
  pd.DataFrame(clf.cv_results_)[['params', 'mean_test_score']]
  .sort_values('mean_test_score', ascending=False)
)
print(cv_ranking)

# TODO: REM clf.best_score_, clf.best_params_

# GridSearchCV refits the best configuration on the whole training set by
# default, so the winning tree is already trained.
dt = clf.best_estimator_
y_pred_dt = dt.predict(x_test)

models.append(("Decision Tree", dt))
report(y_test, y_pred_dt, dt.classes_)

#### Explanation

In [None]:
# Export the fitted tree to Graphviz DOT source, with nodes coloured by class.
cdot_data = tree.export_graphviz(dt, out_file=None,
                         feature_names=list(x_train.columns),
                         class_names=classes,
                         filled=True, rounded=True)
# Known work-around for the stray black rectangle in the rendered image:
# pydotplus appears to turn the line breaks of the DOT source into an extra
# node. Removing the real newline characters leaves the graph unchanged (node
# labels use the two-character escape "\n", which is not affected).
# NOTE(review): confirm that the rectangle is gone after this change.
graph = pydotplus.graph_from_dot_data(cdot_data.replace('\n', ''))
Image(graph.create_png())

### Random Forest

In [None]:
# Random forest: 100 trees of depth at most 3, trained on the training split.
rf = RandomForestClassifier(
  n_estimators=100,
  max_depth=3,
  min_samples_split=3,
  min_samples_leaf=8,
).fit(x_train, y_train)
models.append(("Random Forest", rf))

# Evaluate on the held-out test split.
y_pred_rf = rf.predict(x_test)
report(y_test, y_pred_rf, rf.classes_)

### KNN

In [None]:
# k-nearest neighbours: k=5, Minkowski distance, ball-tree neighbour search.
knn = KNeighborsClassifier(
  n_neighbors=5,
  algorithm='ball_tree',
  metric='minkowski',
).fit(x_train, y_train)
models.append(("KNN", knn))

# Evaluate on the held-out test split.
y_pred_knn = knn.predict(x_test)
report(y_test, y_pred_knn, knn.classes_)

### Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings, trained on the training split.
gnb = GaussianNB().fit(x_train, y_train)
models.append(("Gaussian Naive Bayes", gnb))

# Evaluate on the held-out test split.
y_pred_gnb = gnb.predict(x_test)
report(y_test, y_pred_gnb, gnb.classes_)

### Support Vector Machine

In [None]:
# Support vector classifier with a sigmoid kernel. probability=True enables
# predict_proba, which the ROC comparison calls on every registered model.
svc = SVC(kernel='sigmoid', C=0.8, gamma='scale', probability=True)
svc.fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)

models.append(("SVM", svc))
# Report the SVM's own predictions (the naive Bayes ones were passed before).
report(y_test, y_pred_svc, svc.classes_)

### Neural Networks

## Comparison (ROC)

In [None]:
fig = go.Figure()
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1, line=dict(color="RoyalBlue", width=3, dash="dash"))

# Score every registered model once: probability of the second class
# (column 1 of predict_proba), its ROC curve and its AUC.
# `*_previous` absorbs an AUC stored by an earlier run of this cell.
roc_entries = []
for model_name, model, *_previous in models:
    y_score = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)
    roc_entries.append((model_name, model, auc_score, fpr, tpr))

# Sort according to AUC score, best first.
roc_entries.sort(key=lambda entry: entry[2], reverse=True)

# Keep `models` as sorted (name, estimator, AUC) tuples. The list is rebuilt
# in place, so re-running this cell does not keep appending AUC values.
models[:] = [entry[:3] for entry in roc_entries]

for model_name, _model, auc_score, fpr, tpr in roc_entries:
    name = f"{model_name} - AUC={auc_score:.3f}"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500
)
fig.show()