# Classification

In [None]:
import pandas as pd
import numpy as np

# Metrics and model evaluation
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve, cross_validate
from sklearn.pipeline import Pipeline

# Classifiers
import wittgenstein as lw
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Visualization
from IPython.display import Image, Markdown, display
from sklearn.decomposition import PCA
import pydotplus

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.figure_factory as ff
# Route the pandas ``.plot`` API through plotly
pd.options.plotting.backend = "plotly"
# Default plotly template used by the figures of this notebook
pio.templates.default = "seaborn"

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Every later ``warnings.warn(...)`` call resolves to the no-op above,
# which keeps library warnings out of the notebook output.
setattr(warnings, "warn", warn)

## Helper class

This class splits the data into a training set and a test set, and then shows a report for each classifier containing its validation and test results.

In [None]:
class Classification:
    """Grid-search, evaluate and compare classifiers on a single train/test split.

    The constructor performs one stratified split (70% train / 30% test, fixed
    seed). Each call to ``show_report`` tunes one estimator on the training
    part, displays a validation report and a test report, and stores the
    selected model so that all models can be compared afterwards.
    """

    def __init__(self, X, Y, classes, cv=5):
        """Store the data and split it into training and test sets.

        Args:
            X: Feature table.
            Y: Labels; also used to stratify the split.
            classes: Class display names, used as ``target_names`` of the
                classification report and as axis labels of the confusion matrix.
            cv: Cross-validation setting forwarded to the grid search, the
                validation metrics and the validation/learning curves.
        """
        self._models = []
        self.X = X
        self.Y = Y
        self.classes = classes
        # Split the data into training and test sets with a test size of 30%
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, Y, stratify=Y, test_size=0.3, random_state=42)
        self._cv = cv

    def show_report(self, estimator_name, estimator, parameters, validation_parameter, normalization=True, best_model_position=1):
        """Run a grid search for ``estimator`` and display validation and test reports.

        Args:
            estimator_name: Name under which the selected model is stored.
            estimator: Unfitted classifier; it becomes the ``clf`` pipeline step.
            parameters: Parameter grid of the classifier (names without the
                ``clf__`` prefix, which is added here).
            validation_parameter: Grid parameter drawn on the validation curve.
                An empty value (``''`` or ``None``) skips the validation curve.
            normalization: When true, a ``MinMaxScaler`` step precedes the classifier.
            best_model_position: 1-based rank (by ``rank_test_score``) of the
                grid-search candidate to keep; 1 keeps the best one.

        Returns:
            The selected, fitted pipeline.

        Raises:
            IndexError: If ``best_model_position`` is lower than 1 or larger
                than the number of evaluated candidates.
        """
        # A rank below 1 would silently index the ranking from the bottom
        if best_model_position < 1:
            raise IndexError(f"best_model_position must be >= 1, got {best_model_position}")

        # Address the classifier step of the pipeline
        parameters = {'clf__' + name: values for name, values in parameters.items()}
        # Keep "no validation parameter" falsy: 'clf__' + '' would always be truthy
        validation_parameter = 'clf__' + validation_parameter if validation_parameter else None

        # Execute the grid search on the training set with cross validation
        grid_search_clf = self._estimator_to_grid_search(estimator, parameters, cross_validation=self._cv, normalization=normalization)
        grid_search_clf.fit(self.x_train, self.y_train)

        # ========================
        # Report on VALIDATION SET
        # ========================
        display(Markdown('#### VALIDATION Report'))
        display(Markdown(f'##### Top 10 results of the grid search with {self._cv}-fold cross validation'))
        self._print_grid_search_results(grid_search_clf)

        # If another rank is requested, re-configure and refit the best estimator
        # in place, so that the grid search object predicts with the selected model
        if best_model_position != 1:
            ranked = pd.DataFrame(grid_search_clf.cv_results_)[['params', 'rank_test_score']].sort_values(by='rank_test_score')
            if best_model_position > len(ranked):
                raise IndexError(
                    f"best_model_position={best_model_position} exceeds the {len(ranked)} evaluated candidates")
            params = ranked['params'].iloc[best_model_position - 1]
            grid_search_clf.best_estimator_.set_params(**params).fit(self.x_train, self.y_train)

        # Validation metrics of the selected model (mean over the folds)
        validation_scores = cross_validate(grid_search_clf.best_estimator_, self.x_train, self.y_train, cv=self._cv, scoring=["accuracy", "f1", "precision", "recall"])
        validation_scores = {key: round(np.mean(values), 4) for key, values in validation_scores.items()}
        display(Markdown(f'##### Validation metrics for the best model'))
        for label, key in (("Accuracy", "test_accuracy"), ("F1", "test_f1"), ("Precision", "test_precision"), ("Recall", "test_recall")):
            print(f"{label}: {validation_scores[key]}")

        # Plot the validation curve
        if parameters and validation_parameter:
            self._plot_validation_curve(grid_search_clf.best_estimator_, parameters, validation_parameter, cv=self._cv)

        # Plot the learning curve
        self._plot_learning_curve(grid_search_clf.best_estimator_, cv=self._cv)

        # ========================
        # Report on TEST SET
        # ========================
        display(Markdown('#### TEST Report'))
        # Test the selected classifier on the test set
        y_pred = grid_search_clf.predict(self.x_test)

        # Remember the selected model for the final comparison
        self._models.append((estimator_name, grid_search_clf.best_estimator_))

        # Show the confusion matrix for the selected classifier on the TEST set
        self._plot_confusion_matrix(y_pred, labels=grid_search_clf.classes_)

        display(Markdown('##### Metrics for the best classifier on the TEST set'))
        print(classification_report(self.y_test, y_pred, target_names=self.classes, zero_division=0, sample_weight=None))

        return grid_search_clf.best_estimator_

    def get_best_models(self):
        """Return the list of ``(name, fitted pipeline)`` entries recorded by ``show_report``."""
        return self._models

    def _estimator_to_grid_search(self, estimator, parameters, cross_validation=5, normalization=True, random_state=42):
        """Wrap ``estimator`` in a pipeline (optionally scaled) and in a ``GridSearchCV``."""
        # Seed only estimators that expose ``random_state``; others (e.g. KNN,
        # Gaussian naive Bayes) would just receive a meaningless attribute
        if hasattr(estimator, 'random_state'):
            estimator.random_state = random_state

        steps = [('clf', estimator)]
        if normalization:
            steps.insert(0, ('scaler', MinMaxScaler()))

        return GridSearchCV(Pipeline(steps=steps), param_grid=parameters, cv=cross_validation, n_jobs=-1, refit=True, return_train_score=True)

    def _grid_search_results_table(self, grid_search):
        """Return the grid-search candidates sorted by rank, one column per parameter.

        The ``clf__`` prefix is removed from the parameter names. The mean
        train score, mean test score and rank follow the parameter columns.
        """
        score_columns = ['mean_train_score', 'mean_test_score', 'rank_test_score']
        results = pd.DataFrame(grid_search.cv_results_)[['params'] + score_columns].sort_values(by='rank_test_score')

        prefix = 'clf__'
        candidates = [
            {(name[len(prefix):] if name.startswith(prefix) else name): value for name, value in params.items()}
            for params in results['params']
        ]

        # Start from the results index so that the rows survive the join even
        # when the grid has no parameters at all (e.g. naive Bayes)
        table = pd.DataFrame(index=results.index)
        for name in (candidates[0] if candidates else {}):
            table[name] = [candidate.get(name) for candidate in candidates]

        return table.join(results[score_columns])

    def _print_grid_search_results(self, grid_search):
        """Display the 10 best grid-search candidates without the index column."""
        styler = self._grid_search_results_table(grid_search).head(10).style
        # ``Styler.hide_index`` was removed in pandas 2; ``Styler.hide`` replaces it
        styler = styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()
        display(styler)

    def _plot_confusion_matrix(self, y_pred, labels, normalize=False):
        """Show the (optionally row-normalized) confusion matrix of ``y_pred`` on the test set."""
        cm = confusion_matrix(self.y_test, y_pred, labels=labels)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            display(Markdown('##### Normalized confusion matrix for the best classifier obtained by the Grid Search on the TEST set'))
        else:
            display(Markdown('##### Confusion matrix for the best classifier obtained by the Grid Search on the TEST set'))

        # Rows and y labels are both reversed so that they stay paired;
        # reversing (instead of hard-coding [1, 0]) works for any number of classes
        fig = ff.create_annotated_heatmap(cm[::-1].copy(), x=self.classes, y=self.classes[::-1], colorscale='Blues', showscale=True)
        fig.update_layout(xaxis = dict(title='Predicted value'), yaxis = dict(title='Real value'))
        fig.show()

    @staticmethod
    def _score_axis_title(scoring):
        """Return the y-axis title matching ``scoring`` ('accuracy' -> 'Accuracy')."""
        return scoring.capitalize() if isinstance(scoring, str) else 'Score'

    def _plot_score_curves(self, x_values, train_scores, test_scores, title, xaxis_title, yaxis_title):
        """Plot mean train and cross-validation scores with a +/- one std band each."""
        fig = go.Figure()

        curves = (
            (train_scores, 'train score', 'royalblue', 'rgba(65,105,225,0.2)'),
            (test_scores, 'cross-validation score', 'firebrick', 'rgba(255,107,107,0.2)'),
        )
        for scores, name, line_color, band_color in curves:
            mean_score = np.mean(scores, axis=1)
            std_score = np.std(scores, axis=1)
            fig.add_trace(go.Scatter(x=x_values, y=mean_score, name=name, line=dict(color=line_color)))
            # Invisible upper bound; the lower bound is filled up to it ('tonexty')
            fig.add_trace(go.Scatter(x=x_values, y=mean_score + std_score, mode="lines", showlegend=False, line=dict(width=0)))
            fig.add_trace(go.Scatter(x=x_values, y=mean_score - std_score, fill='tonexty', showlegend=False,
                                     fillcolor=band_color,
                                     line_color='rgba(255,255,255,0)'))

        fig.update_layout(title=title,
                          xaxis_title=xaxis_title,
                          yaxis_title=yaxis_title,
                          legend=dict(
                              orientation="h",
                              yanchor="bottom",
                              y=-0.2,
                              xanchor="left",
                              x=0.01
                          ))
        fig.show()

    def _plot_learning_curve(self, clf, scoring='accuracy', cv=5, train_sizes=None, shuffle=False, random_state=None):
        """Plot train and cross-validation scores of ``clf`` against the training set size.

        ``clf`` must be a pipeline with a ``clf`` step. ``train_sizes`` defaults
        to 5 evenly spaced fractions between 10% and 100% of the training set.
        """
        if train_sizes is None:
            train_sizes = np.linspace(.1, 1.0, 5)

        train_sizes, train_scores, test_scores = learning_curve(clf, self.x_train, self.y_train, train_sizes=train_sizes, cv=cv,
                                                                scoring=scoring, n_jobs=-1, shuffle=shuffle,
                                                                random_state=random_state)

        model_name = clf["clf"].__class__.__name__
        self._plot_score_curves(train_sizes, train_scores, test_scores,
                                title=f'Learning Curve for {model_name}',
                                xaxis_title='Train set size',
                                yaxis_title=self._score_axis_title(scoring))

    def _plot_validation_curve(self, clf, parameters, validation_parameter, scoring='accuracy', cv=5):
        """Plot train and cross-validation scores of ``clf`` against one grid parameter.

        ``clf`` must be a pipeline with a ``clf`` step; ``validation_parameter``
        is a (prefixed) key of ``parameters`` and its grid values form the x axis.
        """
        param_range = parameters[validation_parameter]
        train_scores, test_scores = validation_curve(clf, self.x_train, self.y_train, param_name=validation_parameter, param_range=param_range,
                                                     cv=cv, scoring=scoring, n_jobs=-1)

        param_name = str(validation_parameter).replace("clf__", "")
        model_name = clf["clf"].__class__.__name__
        self._plot_score_curves(param_range, train_scores, test_scores,
                                title=f'Validation Curve for {model_name}',
                                xaxis_title=param_name,
                                yaxis_title=self._score_axis_title(scoring))

## Pre-processing

In [None]:
# Load the players table; the first CSV column becomes the index
df_players_complete = pd.read_csv("../datasets/players_classification.csv", index_col=0)

### Label computation

Here we consider different ways of calculating the label for the classification

#### Median splitting

In [None]:
# Label is 1 when mean_rank_points is at or above the median, 0 otherwise
rank_points = df_players_complete['mean_rank_points']
df_players_complete['is_high_ranked'] = np.digitize(rank_points, bins=[rank_points.median()])
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(
    df_players_complete,
    x="mean_rank_points",
    color="is_high_ranked",
    title="Histogram of mean rank points",
)

#### Mean splitting

In [None]:
# Label is 1 when mean_rank_points is at or above the mean, 0 otherwise
rank_points = df_players_complete['mean_rank_points']
df_players_complete['is_high_ranked'] = np.digitize(rank_points, bins=[rank_points.mean()])
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(
    df_players_complete,
    x="mean_rank_points",
    color="is_high_ranked",
    title="Histogram of mean rank points",
)

#### Pareto splitting

In [None]:
# Players below the 80% quantile of mean rank points are low ranked (0),
# the remaining ones are high ranked (1)
rank_points = df_players_complete['mean_rank_points']
df_players_complete['is_high_ranked'] = np.digitize(rank_points, bins=[rank_points.quantile(0.8)])
print(df_players_complete['is_high_ranked'].value_counts())
px.histogram(
    df_players_complete,
    x="mean_rank_points",
    color="is_high_ranked",
    title="Histogram of mean rank points with 80% quantile",
)

#### Choice of the label to use

In this case we opt for splitting on the mean: the rank points follow a Pareto-like distribution, and the mean split best represents tennis as well as other competitive games.

In [None]:
# Display names of the two classes, in label order (0, 1)
classes = ['low_ranked', 'high_ranked']
# Work on a copy so that the full table keeps all of its columns
df_players = df_players_complete.copy()
# Final label: mean splitting (1 when mean_rank_points is at or above the mean)
df_players['is_high_ranked'] = np.digitize(df_players['mean_rank_points'], bins=[df_players['mean_rank_points'].mean()])
# Only the last expression of a cell is displayed, so the former intermediate
# two-column selection had no effect and was dropped
df_players

### Feature choice

In [None]:
# Summary of the full table, used to choose the features below
df_players_complete.info()

We consider only numerical features and remove those that are derived from mean_rank_points

In [None]:
# Features correlated with the label
features_correlated_with_high_ranked = ['max_rank_points', 'last_rank_points', 'variance_rank_points']
# This one is computed over minutes, which contains a lot of missing values
features_with_missing_values = ['minutes_entropy']
features_to_drop = features_correlated_with_high_ranked + features_with_missing_values

# Keep only the columns without missing values, then only the numeric ones
df_players = df_players.dropna(axis=1).select_dtypes(include=['int64', 'float64'])

# Remove the discarded features together with mean_rank_points,
# the column the label was computed from
df_players = df_players.drop(columns=features_to_drop + ['mean_rank_points'])

df_players.info()

## Classification

Here we use the support class for classification which divides the dataset into training (70%) and testing (30%).

In [None]:
# Separate the label column from the features
Y = df_players['is_high_ranked']
X = df_players.drop(columns=[Y.name])
# The helper class performs the stratified train/test split
classification = Classification(X=X, Y=Y, classes=['low_ranked', 'high_ranked'])

### Decision Tree

In [None]:
# Grid explored for the decision tree
parameters = dict(
    max_depth=[*range(2, 11), 12, 14],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_samples_split=[3, 5, 7, 9, 12],
    min_samples_leaf=[3, 5, 7, 9, 12],
    max_features=['sqrt', 'log2', None],
)
# NOTE(review): the 9th-ranked configuration is kept instead of the top one,
# presumably on purpose (e.g. a simpler tree); confirm
best_model = classification.show_report(
    "Decision Tree",
    DecisionTreeClassifier(),
    parameters,
    validation_parameter='max_depth',
    normalization=False,
    best_model_position=9,
)

#### Explanation

In [None]:
# Export the fitted tree (truncated at depth 3) as Graphviz DOT source
cdot_data = tree.export_graphviz(
    best_model["clf"],
    out_file=None,
    feature_names=classification.x_train.columns.tolist(),
    class_names=classes,
    filled=True,
    rounded=True,
    max_depth=3,
)
# Render the DOT source as a PNG image shown as the cell output
graph = pydotplus.graph_from_dot_data(cdot_data)
Image(graph.create_png())

#### Feature importance

In [None]:
# Pair every feature name with the importance assigned by the fitted tree
feature_importances = pd.DataFrame(
    list(zip(X.columns, best_model["clf"].feature_importances_)),
    columns=['feature', 'importance'],
)
# Most important features first
feature_importances.sort_values(by='importance', ascending=False).reset_index(drop=True)

### Rule based (RIPPER)

In [None]:
# Grid explored for the RIPPER rule learner
parameters = dict(prune_size=[0.33, 0.5, 0.8], k=[1, 2])
best_model = classification.show_report(
    "Ripper",
    lw.RIPPER(),
    parameters,
    validation_parameter='prune_size',
    normalization=False,
)

In [None]:
# Output the model held by the fitted RIPPER step (presumably the learned rule set; verify)
best_model["clf"].out_model()

### Random forest

In [None]:
# Grid explored for the random forest. A former first grid (numeric
# max_features, bootstrap) was overwritten before being used and was removed.
parameters = {'max_depth': [2,3,4,5,6,7,8,9,10], 'n_estimators': [10, 20, 50, 100], 'max_features': ['sqrt', 'log2', None], 'min_samples_leaf': [10, 20, 30, 40, 50], 'min_samples_split': [10, 20, 30, 40, 50]}
best_model = classification.show_report("Random Forest", RandomForestClassifier(), parameters, validation_parameter='max_depth', normalization=False)

#### Feature importance

In [None]:
# ``best_model`` is the pipeline returned by show_report: the fitted forest is
# its 'clf' step (integer index 1 does not exist in a one-step pipeline)
forest = best_model["clf"]
importances = forest.feature_importances_
# Spread of every feature's importance across the individual trees
std = np.std([estimator.feature_importances_ for estimator in forest.estimators_], axis=0)

# Keep name, importance and spread in one table so that sorting keeps them aligned
feature_importances = pd.DataFrame({
    'feature': classification.x_train.columns,
    'importance': importances,
    'std': std,
})
features_importances = feature_importances.sort_values(by='importance', ascending=True).reset_index(drop=True)

fig = go.Figure()
# x, y and the error bars all come from the same sorted table
fig.add_trace(go.Bar(
    x=features_importances["feature"], y=features_importances["importance"],
    error_y=dict(type='data', array=features_importances["std"])
))
fig.show()

### AdaBoost

In [None]:
# Grid explored for AdaBoost
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases; confirm
# against the installed version
parameters = dict(
    n_estimators=[10, 20, 50, 100],
    algorithm=['SAMME', 'SAMME.R'],
    learning_rate=[0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
)
best_model = classification.show_report(
    "AdaBoost",
    AdaBoostClassifier(),
    parameters,
    validation_parameter='n_estimators',
    normalization=False,
)

### KNN

In [None]:
# SQRT heuristic on the train set: sqrt(number of rows) + 1, raised by 20%,
# is the upper bound of the explored K values
n_train = classification.x_train.shape[0]
k_euristic = int((int(np.sqrt(n_train)) + 1) * 1.2)
# Odd values of K only
k_range = list(range(1, k_euristic, 2))

parameters = dict(
    n_neighbors=k_range,
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    metric=['euclidean', 'manhattan', 'minkowski', 'chebyshev'],
)
best_model = classification.show_report(
    "KNN",
    KNeighborsClassifier(),
    parameters,
    validation_parameter='n_neighbors',
    normalization=True,
)

### Naive Bayes

In [None]:
# Gaussian naive Bayes is evaluated as is: empty grid, no validation curve
parameters = dict()
best_model = classification.show_report(
    "Naive Bayes",
    GaussianNB(),
    parameters,
    validation_parameter='',
    normalization=False,
)

### Support Vector Machine

In [None]:
# Grid explored for the SVM; probability=True enables predict_proba,
# which the ROC comparison relies on
parameters = dict(
    C=[0.1, 1, 10, 100, 1000],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)
best_model = classification.show_report(
    "SVM",
    SVC(probability=True),
    parameters,
    validation_parameter='C',
    normalization=True,
)

### Neural Networks

In [None]:
# Grid explored for the multi-layer perceptron
parameters = dict(
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.0001, 0.00001],
    hidden_layer_sizes=[(10,), (20, )],
    activation=['tanh', 'relu'],
    learning_rate=['invscaling', 'adaptive'],
    max_iter=[200, 400, 600, 800],
)
best_model = classification.show_report(
    "Neural Network",
    MLPClassifier(),
    parameters,
    validation_parameter='max_iter',
    normalization=True,
)

## Result analysis

### Comparison (ROC)

The ROC curve can be optimistic for severely imbalanced datasets, such as this test set, but it still shows the relative differences between better- and worse-performing models.

In [None]:
fig = go.Figure()
# Dashed diagonal used as reference line
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1, line=dict(color="RoyalBlue",width=3, dash="dash"))

# Score every recorded model once on the TEST set. The results are collected
# in a new list: the list owned by `classification` is left untouched, so the
# cell can be re-run without the stored tuples growing at every execution.
scored_models = []
for entry in classification.get_best_models():
    estimator_name, estimator = entry[0], entry[1]
    y_score = estimator.predict_proba(classification.x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(classification.y_test, y_score)
    auc_score = roc_auc_score(classification.y_test, y_score)
    scored_models.append((estimator_name, estimator, auc_score, fpr, tpr))

# Sort according to AUC score, best first
scored_models.sort(key=lambda scored: scored[2], reverse=True)

# (name, fitted pipeline, AUC) tuples used by the following cells
models = [scored[:3] for scored in scored_models]

for estimator_name, estimator, auc_score, fpr, tpr in scored_models:
    # The rule-based model is ranked but its curve is not drawn
    if estimator_name == "Ripper":
        continue

    name = f"{estimator_name} - AUC={auc_score:.3f}"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500)
fig.show()

### Accuracy of all models on train and test set

In [None]:
model_names = [model[0] for model in models]

# Accuracy of every model on the train set ...
df_accuracy_train = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [model[1].score(classification.x_train, classification.y_train) for model in models],
    'Set': ['Train'] * len(models),
})

# ... and on the test set
df_accuracy_test = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [model[1].score(classification.x_test, classification.y_test) for model in models],
    'Set': ['Test'] * len(models),
})

# Stack both tables; within each set the most accurate models come first
df_accuracy = pd.concat([df_accuracy_train, df_accuracy_test]).reset_index(drop=True)
df_accuracy = df_accuracy.sort_values(by=['Set', 'Accuracy', 'Model'], ascending=[True, False, True]).reset_index(drop=True)
df_accuracy['Accuracy'] = df_accuracy['Accuracy'].round(4)

# Grouped bar chart: train and test accuracy side by side for each model
legend_layout = dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="left", x=0.01)
fig = px.bar(
    df_accuracy,
    x="Model",
    y="Accuracy",
    color="Set",
    barmode="group",
    text_auto=True,
    color_discrete_sequence=["firebrick", "royalblue"],
)
fig.update_layout(legend=legend_layout)
fig.show()

### F1, Accuracy, Precision, Recall on test set

In [None]:
model_names = [model[0] for model in models]
# Predict once per model and reuse the predictions for F1, precision and recall
test_predictions = [model[1].predict(classification.x_test) for model in models]

# F1: rounded and sorted first, so that the best F1 models open the chart
df_f1 = pd.DataFrame({
    'Model': model_names,
    'Score': [f1_score(classification.y_test, y_pred) for y_pred in test_predictions],
    'Metric': ['F1'] * len(models),
})
df_f1['Score'] = df_f1['Score'].round(4)
df_f1 = df_f1.sort_values(by="Score", ascending=False)

# Accuracy
df_accuracy_test = pd.DataFrame({
    'Model': model_names,
    'Score': [model[1].score(classification.x_test, classification.y_test) for model in models],
    'Metric': ['Accuracy'] * len(models),
})

# Precision
df_precision = pd.DataFrame({
    'Model': model_names,
    'Score': [precision_score(classification.y_test, y_pred) for y_pred in test_predictions],
    'Metric': ['Precision'] * len(models),
})

# Recall
df_recall = pd.DataFrame({
    'Model': model_names,
    'Score': [recall_score(classification.y_test, y_pred) for y_pred in test_predictions],
    'Metric': ['Recall'] * len(models),
})

# One long table with a row per (model, metric)
df_metrics = pd.concat([df_f1, df_accuracy_test, df_precision, df_recall]).reset_index(drop=True)
df_metrics["Score"] = df_metrics["Score"].round(4)

fig = px.bar(df_metrics, x="Model", y="Score", color="Metric", barmode="group", text_auto=True, color_discrete_sequence=["firebrick", "royalblue", "#FFBF00", "#32936F"])
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=-0.2,
    xanchor="left",
    x=0.01
))
fig.show()

### PCA Visualization of the best model

This reminds us of the results obtained in the clustering analysis

In [None]:
# Pick the (name, pipeline, AUC) entry with the highest AUC
best_model = max(models, key=lambda scored_model: scored_model[2])

# Project the scaled training data onto its first two principal components
PCAdata = MinMaxScaler().fit_transform(classification.x_train)
pca = PCA(n_components=2)
X_r = pd.DataFrame(pca.fit_transform(PCAdata))
# Class probabilities predicted by the best model on the training data
prediction = best_model[1].predict_proba(classification.x_train)

# Colour: predicted score (column 1 of the probabilities); symbol: true label
fig = px.scatter(
    x=X_r[0],
    y=X_r[1],
    color=prediction[:, 1],
    color_continuous_scale='RdBu',
    symbol=classification.y_train,
    symbol_map={'0': 'square-dot', '1': 'circle-dot'},
    labels={'symbol': 'label', 'color': 'score of <br>first class'},
)
fig.update_traces(marker_size=12, marker_line_width=1.5)
fig.update_layout(title=f"PCA visualization for {best_model[0]}", legend_orientation='h')
fig.show()

### Comparison with K-means

We can see how well k-means identifies groups of strong and weak players

In [None]:
# Apply k-means to identify cluster of good and bad players
from sklearn.cluster import KMeans

feautures = ['max_tourney_revenue', 'mean_rank_points', 'lrpOnMxrp', 'matches_won_ratio']
df_data = df_players_complete[feautures].reset_index(drop=True)
df_data = pd.DataFrame(MinMaxScaler().fit_transform(df_data), columns=df_data.columns)
df_data = df_data.round(3)
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=100, init="k-means++", random_state=42).fit(df_data)

# K-means numbers its clusters arbitrarily. Call "1" the cluster whose centroid
# has the higher mean_rank_points, so that cluster ids line up with the
# low (0) / high (1) ranked labels shown on the heatmap axes.
high_ranked_cluster = int(np.argmax(kmeans.cluster_centers_[:, feautures.index('mean_rank_points')]))
df_players_complete['cluster'] = (kmeans.labels_ == high_ranked_cluster).astype(int)

df_players_complete["classification"] = best_model[1].predict(df_players.drop(columns="is_high_ranked"))

# Confusion matrix between the real label and the k-means cluster. The real
# label is the mean-split one stored in df_players (same rows, same order):
# the 'is_high_ranked' column of df_players_complete was last overwritten by
# the Pareto splitting experiment and is not the label used for classification.
cm = confusion_matrix(df_players["is_high_ranked"], df_players_complete["cluster"], labels=[0, 1])
fig = ff.create_annotated_heatmap(cm[[1, 0]], x=["Low ranked","High ranked"], y=["High ranked", "Low ranked"], colorscale='Blues', showscale=True)
fig.update_layout(xaxis = dict(title='K-means value'), yaxis = dict(title='Real value'))
fig.show()