# Classification

In [None]:
import pandas as pd
import numpy as np

# Metrics and model evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve
from sklearn.pipeline import Pipeline

# Classifiers
import wittgenstein as lw
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Decision tree visualization
from IPython.display import Image 
import pydotplus

from sklearn.decomposition import PCA

# Visualization
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import display
pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"

pd.set_option('display.max_columns', None)

## Utility functions

In [None]:
# Registry of the fitted models: fit_and_report appends (name, best estimator)
# tuples here, and the ROC comparison cell later adds each model's AUC score.
models = []

def get_grid_search(estimator, parameters, normalization=True):
    """Build an unfitted 5-fold ``GridSearchCV`` around ``estimator``.

    The estimator is wrapped in a ``Pipeline`` under the step name ``'clf'``,
    optionally preceded by a ``MinMaxScaler`` step named ``'scaler'``; the keys
    of ``parameters`` must therefore carry the ``clf__`` prefix.

    Args:
        estimator: scikit-learn compatible classifier. It is seeded in place
            with ``random_state=42`` when it supports that parameter.
        parameters: parameter grid, forwarded as ``param_grid``.
        normalization: when true, prepend the min-max scaling step.

    Returns:
        The configured, not yet fitted, ``GridSearchCV`` instance (refit
        enabled, train scores recorded).
    """
    # Seed only the estimators that really expose ``random_state``: assigning
    # the attribute blindly would leave a stray, ignored attribute on models
    # such as KNN or Gaussian naive Bayes.
    if 'random_state' in estimator.get_params(deep=False):
        estimator.set_params(random_state=42)

    steps = [('clf', estimator)]
    if normalization:
        steps.insert(0, ('scaler', MinMaxScaler()))

    return GridSearchCV(Pipeline(steps=steps), param_grid=parameters, cv=5, n_jobs=-1,
                        refit=True, return_train_score=True)

def print_grid_search_results(clf):
    """Display the ten best-ranked parameter combinations of a fitted grid search.

    Args:
        clf: fitted ``GridSearchCV``; it must have been created with
            ``return_train_score=True`` because ``mean_train_score`` is shown.
    """
    columns = ['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']
    results = pd.DataFrame(clf.cv_results_)[columns].sort_values(by='rank_test_score')
    styler = results.head(10).style.set_caption('Top 10 Grid Search results')

    # ``Styler.hide_index`` was deprecated in pandas 1.4 and removed in 2.0 in
    # favour of ``Styler.hide(axis='index')``; support both generations.
    if hasattr(styler, 'hide'):
        styler = styler.hide(axis='index')
    else:
        styler = styler.hide_index()
    display(styler)

def report_scores(y_test, y_pred):
    """Print scikit-learn's classification report for the given predictions.

    The class display names are taken from the module-level ``classes`` list.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
    """
    summary = classification_report(
        y_test,
        y_pred,
        target_names=classes,
        zero_division=0,
        sample_weight=None,
    )
    print(summary)

def plot_confusion_matrix(cm, classes, normalize=False):
    """Show a confusion matrix as an annotated heatmap.

    Args:
        cm: square confusion matrix with one row per real class and one column
            per predicted class.
        classes: display names of the classes, in the same order as the rows
            and columns of ``cm``.
        normalize: when true, every row is divided by its sum so that the cells
            show per-class rates instead of record counts.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any real sample has an all-zero row: leave it at zero
        # instead of dividing by zero.
        cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    labels = list(classes)
    # The heatmap draws the first row of z at the bottom. Rows and y labels are
    # reversed together so the first class ends up on top and every row keeps
    # its own class name, for any number of classes.
    fig = ff.create_annotated_heatmap(cm[::-1], x=labels, y=labels[::-1], colorscale='Blues', showscale=True)
    fig.update_layout(xaxis=dict(title='Predicted value'), yaxis=dict(title='Real value'))
    fig.show()

def plot_learning_curve(clf, X, y, scoring='accuracy', cv=5, train_sizes=np.linspace(.1, 1.0, 5),
                        shuffle=False, random_state=None):
    """Plot mean train and cross-validation scores against the training-set size.

    Each curve is drawn with a shaded band of one standard deviation computed
    across the cross-validation folds.

    Args:
        clf: estimator (or pipeline) to evaluate.
        X: feature matrix.
        y: target labels.
        scoring: metric forwarded to ``learning_curve``; also used as the
            y-axis title.
        cv: cross-validation strategy forwarded to ``learning_curve``.
        train_sizes: training-set sizes forwarded to ``learning_curve``.
        shuffle: forwarded to ``learning_curve``.
        random_state: forwarded to ``learning_curve``.
    """
    train_sizes, train_scores, test_scores = learning_curve(clf, X, y, train_sizes=train_sizes, cv=cv,
                                                            scoring=scoring, n_jobs=-1, shuffle=shuffle,
                                                            random_state=random_state)

    fig = go.Figure()
    curves = (
        ('train score', train_scores, 'royalblue', 'rgba(65,105,225,0.2)'),
        ('cross-validation score', test_scores, 'firebrick', 'rgba(255,107,107,0.2)'),
    )
    for label, scores, line_color, band_color in curves:
        mean_score = np.mean(scores, axis=1)
        std_score = np.std(scores, axis=1)
        fig.add_trace(go.Scatter(x=train_sizes, y=mean_score, name=label, line=dict(color=line_color)))
        # Invisible upper bound, then the lower bound filled up to it: together
        # they render the +/- one standard deviation band.
        fig.add_trace(go.Scatter(x=train_sizes, y=mean_score + std_score, mode="lines", showlegend=False,
                                 line=dict(width=0)))
        fig.add_trace(go.Scatter(x=train_sizes, y=mean_score - std_score, fill='tonexty', showlegend=False,
                                 fillcolor=band_color, line_color='rgba(255,255,255,0)'))

    # Name the wrapped classifier rather than the generic "Pipeline" wrapper.
    final_estimator = clf.steps[-1][1] if hasattr(clf, 'steps') else clf
    model_name = type(final_estimator).__name__
    # Label the axis with the metric actually computed ('accuracy' -> 'Accuracy').
    score_label = scoring.replace('_', ' ').capitalize() if isinstance(scoring, str) else 'Score'
    fig.update_layout(title=f'Learning Curve for {model_name}',
                      xaxis_title='Train set size',
                      yaxis_title=score_label)
    fig.show()

def plot_validation_curve(clf, X, y, parameters, validation_parameter, scoring='accuracy', cv=5):
    """Plot mean train and cross-validation scores against one hyper-parameter.

    Each curve is drawn with a shaded band of one standard deviation computed
    across the cross-validation folds.

    Args:
        clf: estimator (or pipeline) to evaluate.
        X: feature matrix.
        y: target labels.
        parameters: mapping from parameter name to the list of candidate values.
        validation_parameter: key of ``parameters`` whose values are swept; it
            must be a valid parameter name of ``clf``.
        scoring: metric forwarded to ``validation_curve``; also used as the
            y-axis title.
        cv: cross-validation strategy forwarded to ``validation_curve``.
    """
    param_range = parameters[validation_parameter]
    train_scores, test_scores = validation_curve(clf, X, y, param_name=validation_parameter,
                                                 param_range=param_range, cv=cv, scoring=scoring, n_jobs=-1)

    fig = go.Figure()
    curves = (
        ('train score', train_scores, 'royalblue', 'rgba(65,105,225,0.2)'),
        ('cross-validation score', test_scores, 'firebrick', 'rgba(255,107,107,0.2)'),
    )
    for label, scores, line_color, band_color in curves:
        mean_score = np.mean(scores, axis=1)
        std_score = np.std(scores, axis=1)
        fig.add_trace(go.Scatter(x=param_range, y=mean_score, name=label, line=dict(color=line_color)))
        # Invisible upper bound, then the lower bound filled up to it: together
        # they render the +/- one standard deviation band.
        fig.add_trace(go.Scatter(x=param_range, y=mean_score + std_score, mode="lines", showlegend=False,
                                 line=dict(width=0)))
        fig.add_trace(go.Scatter(x=param_range, y=mean_score - std_score, fill='tonexty', showlegend=False,
                                 fillcolor=band_color, line_color='rgba(255,255,255,0)'))

    # Name the wrapped classifier rather than the generic "Pipeline" wrapper.
    final_estimator = clf.steps[-1][1] if hasattr(clf, 'steps') else clf
    model_name = type(final_estimator).__name__
    # Label the axis with the metric actually computed ('accuracy' -> 'Accuracy').
    score_label = scoring.replace('_', ' ').capitalize() if isinstance(scoring, str) else 'Score'
    fig.update_layout(title=f'Validation Curve for {model_name}',
                      xaxis_title=validation_parameter,
                      yaxis_title=score_label)
    fig.show()


def report(clf, X, Y, y_test, y_pred, model_classes, parameters=None, validation_parameter=None):
    """Print and plot the full evaluation of a fitted grid search.

    In order: grid-search ranking table, confusion matrix, classification
    report, validation curve (only when both ``parameters`` and
    ``validation_parameter`` are given and non-empty) and learning curve.

    Args:
        clf: fitted ``GridSearchCV``.
        X: full feature matrix, used for the validation and learning curves.
        Y: full label vector, used for the validation and learning curves.
        y_test: true labels of the test set.
        y_pred: labels predicted for the test set.
        model_classes: label values fixing the row/column order of the
            confusion matrix.
        parameters: parameter grid the search was run with.
        validation_parameter: key of ``parameters`` to sweep in the validation
            curve.
    """
    print_grid_search_results(clf)

    matrix = confusion_matrix(y_test, y_pred, labels=model_classes)
    # Display names come from the module-level ``classes`` list.
    plot_confusion_matrix(matrix, classes=classes)

    report_scores(y_test, y_pred)

    wants_validation_curve = bool(parameters) and bool(validation_parameter)
    if wants_validation_curve:
        plot_validation_curve(clf.best_estimator_, X, Y, parameters, validation_parameter)

    plot_learning_curve(clf.best_estimator_, X, Y)

def fit_and_report(name, estimator, parameters, X, Y, x_train, y_train, x_test, y_test, validation_parameter=None, normalization=True):
    """Grid-search ``estimator``, register the best model and report on it.

    Args:
        name: display name under which the best model is stored in ``models``.
        estimator: scikit-learn compatible classifier to tune.
        parameters: parameter grid keyed by the estimator's own parameter
            names (the pipeline prefix is added here).
        X: full feature matrix, used for the validation and learning curves.
        Y: full label vector, used for the validation and learning curves.
        x_train: training features the grid search is fitted on.
        y_train: training labels.
        x_test: test features used for the reported predictions.
        y_test: test labels.
        validation_parameter: key of ``parameters`` to sweep in the validation
            curve; ``None`` or ``''`` skips that plot.
        normalization: whether to min-max scale the features in the pipeline.

    Returns:
        The best pipeline found by the grid search, refitted on the training
        data.
    """
    # Route every grid key to the pipeline step that holds the classifier.
    param_grid = {f'clf__{key}': values for key, values in parameters.items()}
    # An empty or missing name means "no validation curve": keep it falsy
    # instead of turning it into the meaningless parameter name 'clf__'.
    validation_parameter = f'clf__{validation_parameter}' if validation_parameter else None

    clf = get_grid_search(estimator, param_grid, normalization).fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Drop any earlier entry with the same name so that re-running a notebook
    # cell does not register (and later plot) the same model twice.
    models[:] = [entry for entry in models if entry[0] != name]
    models.append((name, clf.best_estimator_))

    report(clf, X, Y, y_test, y_pred, clf.classes_, param_grid, validation_parameter)
    return clf.best_estimator_

## Pre-processing

In [None]:
# Load the players dataset; the first CSV column is used as the index.
players_csv_path = "./datasets/players.csv"
df_players_complete = pd.read_csv(players_csv_path, index_col=0)

### Label computation

#### Median splitting

In [None]:
# Binary label from a single cut point: players whose mean_rank_points reach
# the median fall in bin 1, the others in bin 0.
median_rank_points = df_players_complete['mean_rank_points'].median()
df_players_complete['is_high_ranked'] = np.digitize(
    df_players_complete['mean_rank_points'],
    bins=[median_rank_points],
)
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(df_players_complete, x="mean_rank_points", color="is_high_ranked", title="Histogram of mean rank points")

#### Pareto splitting

In [None]:
# Pareto-style split: the lowest 80% of mean rank points are considered low
# ranked (bin 0), the remaining players high ranked (bin 1).
pareto_threshold = df_players_complete['mean_rank_points'].quantile(0.8)
df_players_complete['is_high_ranked'] = np.digitize(
    df_players_complete['mean_rank_points'],
    bins=[pareto_threshold],
)
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(df_players_complete, x="mean_rank_points", color="is_high_ranked", title="Histogram of mean rank points")

#### K-means

In [None]:
# Apply k-means to identify a cluster of good players and one of bad players
from sklearn.cluster import KMeans

features = ['max_tourney_revenue', 'mean_rank_points', 'lrpOnMxrp', 'matches_won_ratio']
df_data = df_players_complete[features].reset_index(drop=True)
# NOTE: the features are clustered on their raw scale; the min-max scaling and
# rounding steps below are left disabled.
#df_data = pd.DataFrame(MinMaxScaler().fit_transform(df_data), columns=df_data.columns)
#df_data = df_data.round(3)
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=100, init="k-means++", random_state=42).fit(df_data)
df_players_complete['cluster'] = kmeans.labels_

# K-means numbers its clusters arbitrarily, so cluster 1 is not necessarily the
# strong one: flag as high ranked the cluster whose members have the larger
# average mean_rank_points.
cluster_rank_means = df_players_complete.groupby('cluster')['mean_rank_points'].mean()
high_ranked_cluster = cluster_rank_means.idxmax()
df_players_complete['is_high_ranked'] = (df_players_complete['cluster'] == high_ranked_cluster).astype(int)
# print number of players per label
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(df_players_complete, x="mean_rank_points", color="is_high_ranked", title="Histogram of mean rank points")

#### Choice of the label to use

In [None]:
# Display names of the two labels, indexed by label value (0 / 1).
classes = ['low_ranked', 'high_ranked']
df_players = df_players_complete.copy()
# Final label: Pareto split, the top 20% of mean rank points are high ranked.
high_rank_threshold = df_players['mean_rank_points'].quantile(0.8)
df_players['is_high_ranked'] = np.digitize(df_players['mean_rank_points'], bins=[high_rank_threshold])
# A bare expression in the middle of a cell is discarded, so show the
# score/label pairs explicitly before the score column is removed.
display(df_players[['mean_rank_points', 'is_high_ranked']])
# The label is derived from mean_rank_points, so it cannot stay among the features.
df_players = df_players.drop(columns=['mean_rank_points'])
df_players

In [None]:
# Bar chart of how many players fall in each class.
df_players['is_high_ranked'].value_counts().plot(kind='bar')

### Feature choice

In [None]:
# for the time being, the same attributes used for the clustering REM
# problem: the mean_rank_points feature, to be used as the label, was already among the features used for the clustering
# df_players = df_players_complete[['max_tourney_revenue', 'lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points']]

# Discard the rows without rel_bpSaved and the columns listed below, then keep
# only the numerical (int64 / float64) columns.
features_to_drop = [
    'ht',
    'max_rank_points',
    'variance_rank_points',
    'max_tourney_spectators',
    'max_tourney_revenue',
    'last_rank_points',
]
df_players = df_players.dropna(subset=['rel_bpSaved'])
df_players = df_players.drop(columns=features_to_drop)
df_players = df_players.select_dtypes(include=['int64', 'float64'])

In [None]:
# Summary of the columns left after feature selection (dtypes and non-null counts).
df_players.info()

### Splitting into train and test

In [None]:
#df_players_norm = pd.DataFrame(MinMaxScaler().fit_transform(df_players), columns=df_players.columns)

# Features are every column but the label; the 70/30 split is stratified on
# the label so both parts keep the same class proportions.
label_column = 'is_high_ranked'
X = df_players.drop(label_column, axis=1)
Y = df_players[label_column]
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

## Classification

### Decision Tree

In [None]:
# Grid: tree depth from 2 to 10, both impurity criteria, both split strategies.
parameters = {
    'max_depth': list(range(2, 11)),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
}
best_model = fit_and_report(
    "Decision Tree", DecisionTreeClassifier(), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    'max_depth', normalization=False,
)

#### Explanation

In [None]:
# Export the classifier step of the best pipeline to Graphviz DOT and render it as a PNG.
tree_clf = best_model.named_steps["clf"]
dot_source = tree.export_graphviz(
    tree_clf,
    out_file=None,
    feature_names=x_train.columns.tolist(),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_source)
Image(graph.create_png())

# TODO: problem, what is the black rectangle in the rendered image?
# NOTE(review): possibly a stray node that pydotplus builds from the newlines
# of the DOT text - unverified, to be confirmed.

#### Feature importance

In [None]:
# get the names of the most important features according to the model
# map the feature importances to the feature names
# Pair every feature name with the importance given to it by the classifier
# step of the best pipeline, then list the features from most to least important.
importances = best_model.named_steps["clf"].feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances = feature_importances.reset_index(drop=True)
feature_importances

### Rule based (RIPPER)

In [None]:
# parameters = {"prune_size": [0.33, 0.5], "k": [1, 2]}
# clf = get_grid_search(lw.RIPPER(), parameters).fit(x_train, y_train, pos_class=1)
# y_pred = clf.predict(x_test)

# models.append(("Rule based", clf.best_estimator_))
# report(clf, X, Y, y_test, y_pred, clf.classes_)

In [None]:
# clf.best_estimator_.out_model()

### Random forest

In [None]:
# Grid: tree depth from 2 to 10 and four forest sizes.
parameters = {
    'max_depth': list(range(2, 11)),
    'n_estimators': [10, 20, 50, 100],
}
best_model = fit_and_report(
    "Random Forest", RandomForestClassifier(), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    'max_depth', normalization=False,
)

### KNN

In [None]:
# Square-root heuristic on the train set: try the odd values of K from 1 up to
# (about) sqrt(number of training records).
k_upper_bound = int(np.sqrt(len(x_train))) + 1
k_range = list(range(1, k_upper_bound, 2))

parameters = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
best_model = fit_and_report(
    "KNN", KNeighborsClassifier(), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    'n_neighbors', normalization=True,
)

### Naive Bayes

In [None]:
# No hyper-parameter is tuned here: empty grid and no validation parameter.
parameters = dict()
best_model = fit_and_report(
    "Naive Bayes", GaussianNB(), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    '', normalization=True,
)

### Support Vector Machine

In [None]:
# Grid: regularisation strength C over five orders of magnitude and four kernels.
# probability=True is needed for predict_proba in the ROC comparison.
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
best_model = fit_and_report(
    "SVM", SVC(probability=True), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    'C', normalization=True,
)

### Neural Networks

In [None]:
# Grid over solver, L2 penalty (alpha), hidden layer width, activation
# function and learning-rate schedule.
parameters = {
    "solver": ['lbfgs', 'sgd', 'adam'],
    "alpha": [0.0001, 0.001, 0.01],
    "hidden_layer_sizes": [(10,), (20,)],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "learning_rate": ['constant', 'invscaling', 'adaptive'],
}
best_model = fit_and_report(
    "Neural Network", MLPClassifier(), parameters,
    X, Y, x_train, y_train, x_test, y_test,
    'alpha', normalization=True,
)

## Result analysis

### Comparison (ROC)

In [None]:
fig = go.Figure()
# Dashed diagonal: reference line where true and false positive rates are equal.
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1, line=dict(color="RoyalBlue",width=3, dash="dash"))

# Score every registered model exactly once, keeping its ROC points for the plot.
scored_models = []
for entry in models:
    model_name, estimator = entry[0], entry[1]
    y_score = estimator.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)
    scored_models.append((model_name, estimator, auc_score, fpr, tpr))

# Sort according to AUC score
scored_models.sort(key=lambda item: item[2], reverse=True)

# Rebuild the registry as (name, estimator, auc) instead of appending to the
# existing tuples, so that re-running this cell does not make them grow.
models[:] = [(model_name, estimator, auc_score) for model_name, estimator, auc_score, _, _ in scored_models]

for model_name, _, auc_score, fpr, tpr in scored_models:
    name = f"{model_name} - AUC={auc_score:.3f}"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500,
    autosize=True
)
fig.show()

### Visualization of the best model

In [None]:
# Pick the registered model with the highest AUC score (third tuple item,
# filled in by the ROC comparison cell).
best_model = max(models, key=lambda entry: entry[2])
best_name, best_estimator = best_model[0], best_model[1]

# Project the training set on its first two principal components and colour
# every record by the score the best model gives to the second class column.
pca_coords = PCA(n_components=2).fit_transform(x_train)
X_r = pd.DataFrame(pca_coords)
prediction = best_estimator.predict_proba(x_train)
fig = px.scatter(
    x=X_r[0],
    y=X_r[1],
    color=prediction[:, 1],
    color_continuous_scale='RdBu',
    symbol=y_train,
    symbol_map={'0': 'square-dot', '1': 'circle-dot'},
    labels={'symbol': 'label', 'color': 'score of <br>first class'},
)
fig.update_traces(marker_size=12, marker_line_width=1.5)
fig.update_layout(title=f"PCA visualization for {best_name}", legend_orientation='h')
fig.show()