# Classification

In [None]:
import pandas as pd
import numpy as np

# Metrics and model evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, learning_curve

# Classifiers
import wittgenstein as lw
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Decision tree visualization
from IPython.display import Image 
import pydotplus

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import display
# Route pandas .plot() calls through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"
# Use the "seaborn" template as the default look of every plotly figure.
pio.templates.default = "seaborn"

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

## Utility functions
- TO STANDARDIZE
  - MODELS (neural networks)
  - METHOD EVALUATION WITH OVERSAMPLING

In [None]:
# Registry of (display name, fitted best estimator) tuples, filled by each
# classifier cell and consumed by the final ROC comparison.
models = []

def get_grid_search(estimator, parameters):
  """Build an exhaustive 5-fold grid search around `estimator`.

  Args:
    estimator: unfitted scikit-learn compatible estimator.
    parameters: parameter grid forwarded as `param_grid`.

  Returns:
    An unfitted `GridSearchCV` that runs on all available cores
    (`n_jobs=-1`), refits the best configuration (`refit=True`) and also
    records the training scores (`return_train_score=True`).
  """
  search_options = dict(cv=5, n_jobs=-1, refit=True, return_train_score=True)
  return GridSearchCV(estimator, param_grid=parameters, **search_options)

# Pretty printing of the metrics computed on the test set
def report_scores(y_test, y_pred):
  """Print the per-class classification report for `y_pred` against `y_test`.

  Class display names are read from the module-level `classes` list.
  Metrics that would divide by zero are reported as 0 (`zero_division=0`).
  """
  # NOTE: train-vs-test accuracy/precision/recall/F1 used to be sketched here
  # as commented-out code; add them back through sklearn.metrics if needed.
  summary = classification_report(
    y_test,
    y_pred,
    target_names=classes,
    zero_division=0,
    sample_weight=None,
  )
  print(summary)

def plot_confusion_matrix(cm, classes, normalize=False):
  """Draw a confusion matrix as an annotated plotly heatmap.

  Args:
    cm: square confusion matrix, rows = real classes, columns = predicted
      classes, both in the same order as `classes`.
    classes: display names of the classes, one per row/column of `cm`.
    normalize: when True every row is divided by its total so that cells
      hold per-real-class fractions instead of raw counts.
  """
  cm = np.asarray(cm)
  if normalize:
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    # A class with no real samples has an all-zero row: keep it at 0 instead
    # of producing NaN through a division by zero.
    cm = np.divide(cm.astype('float'), row_totals,
                   out=np.zeros(cm.shape, dtype='float'), where=row_totals != 0)
    print("Normalized confusion matrix")
  else:
    print('Confusion matrix, without normalization')

  labels = list(classes)
  # The heatmap draws its first row at the bottom of the y axis. Flip the rows
  # so the first class ends up on top, and flip the y labels together with
  # them so every row keeps its own class name (works for any class count).
  fig = ff.create_annotated_heatmap(cm[::-1], x=labels, y=labels[::-1], colorscale='Blues', showscale=True)
  fig.update_layout(xaxis = dict(title='Predicted value'), yaxis = dict(title='Real value'))
  fig.show()


def _add_score_band(fig, sizes, scores, name, line_color, fill_color):
    """Add the mean line and a shaded +/- one-std band for one score matrix."""
    mean_score = np.mean(scores, axis=1)
    std_score = np.std(scores, axis=1)

    fig.add_trace(go.Scatter(x=sizes, y=mean_score, name=name, line=dict(color=line_color)))
    # Invisible upper bound; the lower bound below fills up to it ('tonexty').
    fig.add_trace(go.Scatter(x=sizes, y=mean_score + std_score, mode="lines", showlegend=False, line=dict(width=0)))
    fig.add_trace(go.Scatter(x=sizes, y=mean_score - std_score, fill='tonexty', showlegend=False,
                             fillcolor=fill_color,
                             line_color='rgba(255,255,255,0)'))


def plot_learning_curve(clf, X, y, scorer='accuracy', cv=5, train_sizes=np.linspace(.1, 1.0, 5), 
                        shuffle=False, random_state=None):
    """Plot train and cross-validation scores against the training set size.

    Args:
        clf: estimator evaluated by `learning_curve`.
        X: feature matrix.
        y: target vector.
        scorer: value forwarded to `learning_curve` as `scoring`.
        cv: cross-validation setting forwarded to `learning_curve`.
        train_sizes: training set sizes forwarded to `learning_curve`.
        shuffle: forwarded to `learning_curve`.
        random_state: forwarded to `learning_curve`.
    """
    train_sizes, train_scores, test_scores = learning_curve(clf, X, y, train_sizes=train_sizes, cv=cv,
                                                            scoring=scorer, n_jobs=-1, shuffle=shuffle,
                                                            random_state=random_state)

    fig = go.Figure()
    _add_score_band(fig, train_sizes, train_scores, 'train score',
                    'royalblue', 'rgba(65,105,225,0.2)')
    _add_score_band(fig, train_sizes, test_scores, 'cross-validation score',
                    'firebrick', 'rgba(255,107,107,0.2)')

    # Label the y axis with the metric actually plotted; the default scorer
    # still yields 'Accuracy'. Non-string scorers get a neutral label.
    if isinstance(scorer, str):
        metric_label = scorer.replace('_', ' ').capitalize()
    else:
        metric_label = 'Score'

    model_name = str(clf.__class__.__name__)
    fig.update_layout(title=f'Learning Curve for {model_name}',
                      xaxis_title='Train set size',
                      yaxis_title=metric_label)
    fig.show()

def print_grid_search_results(clf):
  """Display the ten best grid-search configurations as a styled table.

  Args:
    clf: fitted search object exposing `cv_results_`; it must have been run
      with training scores enabled because `mean_train_score` is shown.
  """
  columns = ['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']
  df = pd.DataFrame(clf.cv_results_)[columns].sort_values(by='rank_test_score')
  styler = df.head(10).style.set_caption('Top 10 Grid Search results')

  # Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
  # prefer Styler.hide() and fall back only on older pandas versions.
  if hasattr(styler, 'hide'):
    styler = styler.hide(axis='index')
  else:
    styler = styler.hide_index()
  display(styler)

def report(clf, X, Y, y_test, y_pred, model_classes):
  """Print and plot the full evaluation of a fitted grid search.

  In order: the grid-search ranking table, the confusion matrix of `y_pred`
  against `y_test`, the classification report and the learning curve of the
  best estimator computed on `X`, `Y`.

  Args:
    clf: fitted grid search (provides `cv_results_` and `best_estimator_`).
    X: feature matrix used for the learning curve.
    Y: labels used for the learning curve.
    y_test: true labels of the test set.
    y_pred: predicted labels of the test set.
    model_classes: label values fixing the row/column order of the
      confusion matrix; axis names come from the module-level `classes`.
  """
  print_grid_search_results(clf)
  plot_confusion_matrix(
    confusion_matrix(y_test, y_pred, labels=model_classes),
    classes=classes,
  )
  report_scores(y_test, y_pred)
  plot_learning_curve(clf.best_estimator_, X, Y)

## Pre-processing

In [None]:
# Load the players table; the first CSV column becomes the DataFrame index.
df_players_complete = pd.read_csv("./datasets/players.csv", index_col=0)

### Features choice

In [None]:
# For now, the same attributes used for the clustering (REM).
# Problem: the mean_rank_points feature, to be used as the label, was already
# among the features used for the clustering.
# .copy() makes df_players an independent frame: it is modified afterwards
# (new label column, dropped column) and must not be a view on
# df_players_complete, otherwise pandas raises SettingWithCopyWarning.
df_players = df_players_complete[['max_tourney_revenue', 'lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points']].copy()

### Label computation

In [None]:
classes = ['low_ranked', 'high_ranked']

# Binary label: 0 when mean_rank_points is below the median, 1 otherwise.
median_rank_points = df_players['mean_rank_points'].median()
is_high_ranked = np.digitize(df_players['mean_rank_points'], bins=[median_rank_points])

# Build a new frame instead of mutating df_players in place: the frame may be
# a slice of df_players_complete, where in-place edits trigger
# SettingWithCopyWarning. The source column is dropped so the label does not
# leak into the features.
df_players = df_players.assign(is_high_ranked=is_high_ranked).drop(columns=['mean_rank_points'])
df_players

In [None]:
# Bar chart of how many players fall into each class
df_players['is_high_ranked'].value_counts().plot(kind='bar')

### Splitting

In [None]:
# Separate features and label before scaling: the label is categorical and
# must not go through the scaler. The index is reset to positions, as the
# previous implementation did implicitly.
features = df_players.drop(columns=['is_high_ranked']).reset_index(drop=True)
Y = df_players['is_high_ranked'].reset_index(drop=True)

raw_train, raw_test, y_train, y_test = train_test_split(features, Y, stratify=Y, test_size=0.3, random_state=0)

# Fit the scaler on the training split only, so that no statistic of the test
# set leaks into the pre-processing, then apply the same transformation to
# every split.
scaler = MinMaxScaler().fit(raw_train)
x_train = pd.DataFrame(scaler.transform(raw_train), columns=features.columns, index=raw_train.index)
x_test = pd.DataFrame(scaler.transform(raw_test), columns=features.columns, index=raw_test.index)
X = pd.DataFrame(scaler.transform(features), columns=features.columns, index=features.index)

# Kept for backward compatibility: scaled features together with the label.
df_players_norm = X.assign(is_high_ranked=Y)

## Classification

### Decision Tree

In [None]:
# Grid over tree depth, split criterion and split strategy
parameters = {
    'max_depth': [2, 3, 4],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
}
clf = get_grid_search(DecisionTreeClassifier(), parameters)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("Decision Tree", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)

#### Explanation

In [None]:
# Export the tuned tree as Graphviz DOT source and render it inline as a PNG
cdot_data = tree.export_graphviz(
    clf.best_estimator_,
    out_file=None,
    feature_names=x_train.columns.tolist(),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(cdot_data)
Image(graph.create_png())

# TODO: problem - what is the black rectangle?

### Rule based (RIPPER)

In [None]:
# Grid over RIPPER's pruning-set fraction and number of optimization runs
parameters = {
    "prune_size": [0.33, 0.5],
    "k": [1, 2],
}
clf = get_grid_search(lw.RIPPER(), parameters)
# pos_class is forwarded by the grid search to RIPPER.fit
clf.fit(x_train, y_train, pos_class=1)
y_pred = clf.predict(x_test)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("Rule based", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)

In [None]:
# Show the best RIPPER model found by the grid search via its out_model()
clf.best_estimator_.out_model()

### Random forest

In [None]:
# Grid over tree depth and forest size
parameters = {
    'max_depth': [2, 3, 4],
    'n_estimators': [10, 20, 50, 100],
}
clf = get_grid_search(RandomForestClassifier(), parameters)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("Random Forest", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)

### KNN

In [None]:
# Grid over neighbourhood size, vote weighting, search structure and metric
parameters = {
    'n_neighbors': list(range(2, 11)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
clf = get_grid_search(KNeighborsClassifier(), parameters)
clf.fit(x_train, y_train)
# NOTE(review): unlike the other cells, prediction and reporting use the raw
# numpy values here - presumably a workaround; confirm before removing.
y_pred = clf.predict(x_test.values)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("KNN", clf.best_estimator_))
report(clf, X, Y, y_test.values, y_pred, clf.classes_)

### Naive Bayes

In [None]:
# Empty grid: the search is run with no parameters to vary, so the results
# come out in the same shape as for the other models
parameters = dict()
clf = get_grid_search(GaussianNB(), parameters)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Register the model for the final ROC comparison, then evaluate it
models.append(("Naive Bayes", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)

### Support Vector Machine

In [None]:
# Grid over the regularization strength and the kernel family
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
# probability=True is needed later: the ROC comparison calls predict_proba
clf = get_grid_search(SVC(probability=True), parameters)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("SVM", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)

### Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# Grid over optimizer, L2 penalty, hidden layer width, activation function
# and learning-rate schedule
parameters = {
    "solver": ['lbfgs', 'sgd', 'adam'],
    "alpha": [0.0001, 0.001, 0.01],
    "hidden_layer_sizes": [(10,), (20,)],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "learning_rate": ['constant', 'invscaling', 'adaptive'],
}
clf = get_grid_search(MLPClassifier(), parameters)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Register the tuned model for the final ROC comparison, then evaluate it
models.append(("Neural Network", clf.best_estimator_))
report(clf, X, Y, y_test, y_pred, clf.classes_)


## Comparison (ROC)

In [None]:
fig = go.Figure()
# Diagonal reference line from (0, 0) to (1, 1)
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1, line=dict(color="RoyalBlue",width=3, dash="dash"))

# Score every registered model exactly once, keeping the curve points so they
# do not have to be recomputed when plotting. Only the first two fields of
# each entry are read, so re-running the cell does not pile up AUC values.
scored_models = []
for model_name, estimator, *_previous_auc in models:
    y_score = estimator.predict_proba(x_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)
    scored_models.append((model_name, estimator, auc_score, fpr, tpr))

# Sort according to AUC score (best first)
scored_models.sort(key=lambda entry: entry[2], reverse=True)

# Keep the registry in its (name, estimator, auc) form, sorted by AUC
models[:] = [(entry[0], entry[1], entry[2]) for entry in scored_models]

for model_name, _estimator, auc_score, fpr, tpr in scored_models:
    name = f"{model_name} - AUC={auc_score:.3f}"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500,
    autosize=True
)
fig.show()