# 7. COSINE SIMILARITY

In [1]:
import fastf1
import pickle
import os
import copy
import calendar
import numpy as np
import pandas as pd

#Visualization
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd


# Folder that holds this notebook. The notebook name is a relative path, so it
# is resolved against the current working directory of the kernel.
parent_dir = os.path.split(os.path.realpath("7. Cosine Similarity.ipynb"))[0]
# Every pickle read or written by this notebook lives in this sub-folder.
data_path = os.path.join(parent_dir, 'Data Objects')

In [2]:
# Load the nested {grand_prix: {driver: DataFrame}} dictionaries for the two
# seasons (presumably 2022 and 2023) from the pickles under `data_path`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
with open(os.path.join(data_path, 'pilots22.pkl'), 'rb') as f:
    pilots22 = pickle.load(f)

with open(os.path.join(data_path, 'pilots23.pkl'), 'rb') as f:
    pilots23 = pickle.load(f)

### 7.1 PCA Dimensionality Reduction

#### 7.1.1 Optimal Number of Components

In [3]:
def find_optimal_pca(data, graph=False, threshold=0.9):
    """
    Determines the smallest number of PCA components whose cumulative explained
    variance reaches `threshold` (90% by default).

    Parameters:
    - data: array-like, shape (n_samples, n_features)
        The input data to perform PCA on.
    - graph: bool, optional (default=False)
        If True, displays an interactive Plotly graph showing the cumulative explained variance.
    - threshold: float, optional (default=0.9)
        Fraction of the total variance that must be explained; 0 < threshold <= 1.

    Returns:
    - n_components: int
        The number of components needed to explain at least `threshold` of the
        variance. If no prefix of components reaches the threshold (for example
        when rounding leaves the total just below 1.0, or the ratios are NaN),
        the total number of components is returned.

    Raises:
    - ValueError
        If `threshold` is not in the interval (0, 1].
    """
    if not 0 < threshold <= 1:
        raise ValueError(f"threshold must be in (0, 1], got {threshold!r}")

    pca = PCA()
    pca.fit(data)

    explained_variance_ratio = pca.explained_variance_ratio_

    cumulative_explained_variance = np.cumsum(explained_variance_ratio)
    n_total = len(cumulative_explained_variance)

    # Human-readable form of the threshold; 0.9 renders as "90%".
    threshold_label = f"{threshold * 100:g}%"

    if graph:
        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x=np.arange(1, n_total + 1),
            y=cumulative_explained_variance,
            mode='lines+markers',
            name='Cumulative Explained Variance'
        ))

        # Horizontal reference line at the requested threshold.
        fig.add_trace(go.Scatter(
            x=[1, n_total],
            y=[threshold, threshold],
            mode='lines',
            line=dict(color='red', dash='dash'),
            name=f'{threshold_label} Explained Variance'
        ))

        fig.update_layout(
            title='Explained Variance vs. Number of Components',
            xaxis_title='Number of Components',
            yaxis_title='Cumulative Explained Variance',
            showlegend=True
        )

        fig.show()

    # Index of the first cumulative value that reaches the threshold; +1 turns
    # the 0-based index into a component count.
    reached = np.where(cumulative_explained_variance >= threshold)[0]
    if len(reached) > 0:
        n_components = int(reached[0]) + 1
    else:
        # Threshold never reached: keep every component instead of raising IndexError.
        n_components = n_total

    print(f"Number of components to explain {threshold_label} of the variance: {n_components}")
    return n_components

In [None]:
# Example run on a single driver: entry '14' of the Bahrain Grand Prix,
# transposed before the PCA, with the explained-variance plot switched on.
dt = pilots22['Bahrain_Grand_Prix']['14'].T
n = find_optimal_pca(dt, graph=True)

In [4]:
def components_by_season(pilots_dict):
    """
    Average, per Grand Prix, the PCA component count found for each driver.

    For every driver the transposed DataFrame is handed to `find_optimal_pca`;
    the counts of all drivers in one Grand Prix are then averaged with
    `np.average`.

    Parameters:
    - pilots_dict: dict
        Maps each Grand Prix key to a dictionary {driver: DataFrame}.

    Returns:
    - component_dict: dict
        Maps each Grand Prix key to the mean component count of its drivers
        (keys keep the iteration order of `pilots_dict`).
    """
    # NOTE(review): the whole DataFrame is analysed here, whereas apply_pca
    # drops the first three columns before fitting - confirm this is intended.
    return {
        event: np.average([find_optimal_pca(frame.T) for frame in drivers.values()])
        for event, drivers in pilots_dict.items()
    }

In [None]:
# Average number of PCA components per Grand Prix for the `pilots22` data
# (fits one PCA per driver, so this cell can take a while).
com22 = components_by_season(pilots22)

# Optional line plot of the per-race averages, kept disabled. It is stored as
# `#` comments instead of a bare triple-quoted string: a string literal that
# ends a cell is an expression, so the notebook would echo it as cell output.
# grand_prix = list(com22.keys())
# values = list(com22.values())
# fig = go.Figure(data=go.Scatter(x=grand_prix, y=values, mode='lines+markers'))
#
# fig.update_layout(
#     title='2022 Grand Prix PCA Component Values',
#     xaxis_title='Grand Prix',
#     yaxis_title='Value',
#     xaxis_tickangle=-45
# )
#
# fig.show()

In [None]:
# Average number of PCA components per Grand Prix for the `pilots23` data
# (fits one PCA per driver, so this cell can take a while).
com23 = components_by_season(pilots23)

# Optional line plot of the per-race averages, kept disabled. It is stored as
# `#` comments instead of a bare triple-quoted string: a string literal that
# ends a cell is an expression, so the notebook would echo it as cell output.
# grand_prix = list(com23.keys())
# values = list(com23.values())
# fig = go.Figure(data=go.Scatter(x=grand_prix, y=values, mode='lines+markers'))
#
# fig.update_layout(
#     title='2023 Grand Prix PCA Component Values',
#     xaxis_title='Grand Prix',
#     yaxis_title='Value',
#     xaxis_tickangle=-45
# )
#
# fig.show()

In [4]:
# Placeholder values that replace any per-race dictionaries computed above.
# apply_pca, as called below, uses its own fixed component count, so these
# names only need to exist; this lets the slow components_by_season cells be
# skipped.
com22 = com23 = 1

#### 7.1.2 Applying PCA

In [5]:
def apply_pca(pilots_dict, com, n_components=8):
    """
    Fit a separate PCA for every driver of every Grand Prix and return the
    transformed data.

    For each driver the first three columns of the DataFrame are dropped
    (`iloc[:, 3:]`) and the remainder is transposed, so every remaining column
    becomes one PCA sample.
    NOTE(review): the first three columns are presumably non-feature metadata
    - confirm against the code that builds the pickles.

    Parameters:
    - pilots_dict: dict
        Maps each Grand Prix key to a dictionary {driver: DataFrame}.
    - com: dict or any
        Per-Grand-Prix component counts (e.g. the output of
        `components_by_season`). Only read when `n_components` is None;
        otherwise it is ignored, so a placeholder may be passed.
    - n_components: int or None, optional (default=8)
        Number of components kept for every driver. Pass None to use
        `round(com[gp])` for each Grand Prix instead.

    Returns:
    - pilots_pca: dict
        Same nesting as `pilots_dict`, with each DataFrame replaced by the
        array returned by `PCA.fit_transform`.
    """
    pilots_pca = {}
    for gp, gp_dict in pilots_dict.items():
        # int(...) keeps the count an integer even if com holds numpy floats.
        n = int(round(com[gp])) if n_components is None else n_components
        pilots_pca[gp] = {
            driver: PCA(n_components=n).fit_transform(driver_df.iloc[:, 3:].T)
            for driver, driver_df in gp_dict.items()
        }

    return pilots_pca

In [6]:
# Run the per-driver PCA for each season, then rebuild each result with its
# Grand Prix keys in sorted order.
pilots22_pca = dict(sorted(apply_pca(pilots22, com22).items()))

pilots23_pca = dict(sorted(apply_pca(pilots23, com23).items()))

### 7.2 Cosine Similarity

In [7]:
def cos_similarity(dictt, gp):
    """
    Build the symmetric driver-by-driver cosine-similarity matrix for one
    Grand Prix.

    For each pair of drivers, `cosine_similarity` is evaluated between every
    row of the first driver's array and every row of the second driver's
    array; the mean of that cross-similarity matrix is the pair's score.
    The diagonal (a driver compared with itself) is fixed at 1.

    Parameters:
    - dictt: dict
        Maps each Grand Prix key to a dictionary {driver: 2-D array}. Arrays of
        different drivers must have the same number of columns.
    - gp: hashable
        Key of the Grand Prix to analyse.

    Returns:
    - cosine_sim_matrix: pandas.DataFrame (float64)
        Square matrix indexed and labelled by driver id, in the iteration
        order of `dictt[gp]`.
    """
    # TELEMETRY AND POSITION
    pilot_ids = list(dictt[gp].keys())

    # Start from a float frame already holding 1.0 everywhere: the diagonal is
    # then correct from the start and every off-diagonal cell is overwritten
    # below. This avoids an object-dtype frame and avoids writing through
    # `DataFrame.values`, which is read-only under pandas Copy-on-Write.
    cosine_sim_matrix = pd.DataFrame(1.0, index=pilot_ids, columns=pilot_ids)

    for i in range(len(pilot_ids)):
        for j in range(i + 1, len(pilot_ids)):
            pilot1, pilot2 = pilot_ids[i], pilot_ids[j]
            similarity_matrix = cosine_similarity(dictt[gp][pilot1], dictt[gp][pilot2])
            similarity = np.mean(similarity_matrix)

            # The measure is symmetric, so fill both triangles at once.
            cosine_sim_matrix.loc[pilot1, pilot2] = similarity
            cosine_sim_matrix.loc[pilot2, pilot1] = similarity

    return cosine_sim_matrix

In [8]:
# Grand Prix to analyse; this key is reused for the second season's matrix.
gp = 'Monaco_Grand_Prix'
# Driver-by-driver similarity matrix built from the `pilots22_pca` projections.
cos_sim22 = cos_similarity(pilots22_pca,gp)
#cos_sim22 #symmetric and diagonals are 1 

In [9]:
# Same Grand Prix key (`gp`), built from the `pilots23_pca` projections.
cos_sim23 = cos_similarity(pilots23_pca,gp)

Converting cosine similarity matrix to a distance matrix (necessary for clustering)

Clustering algorithms rely on distances between points to decide how to group them, so each similarity value $s$ is turned into a distance $d = 1 - s$.

In [10]:
# Cosine distance = 1 - cosine similarity (0 on the diagonal).
# dtype=float guarantees a float64 array even when the similarity frame keeps
# its values in object-dtype columns, so numeric routines downstream receive a
# proper numeric matrix.
dist_matrix22 = (1 - cos_sim22).to_numpy(dtype=float)

dist_matrix23 = (1 - cos_sim23).to_numpy(dtype=float)

A condensed distance matrix is a 1D array representing the upper triangular portion of a 2D distance matrix (excluding the diagonal).

`scipy.cluster.hierarchy.linkage` expects distances in this condensed form (a 2-D array would be interpreted as raw observations instead), which is why we convert the matrices.

In [11]:
# squareform converts each square distance matrix into its condensed 1-D form
# (the off-diagonal entries of one triangle).
condensed_dist22 = ssd.squareform(dist_matrix22)

condensed_dist23 = ssd.squareform(dist_matrix23)

In [12]:
 with open(os.path.join(data_path,'condensed_dist22.pkl'), 'wb') as f:
    pickle.dump(condensed_dist22, f)
    
with open(os.path.join(data_path,'condensed_dist23.pkl'), 'wb') as f:
    pickle.dump(condensed_dist23, f)
    
with open(os.path.join(data_path,'cos_sim22.pkl'), 'wb') as f:
    pickle.dump(cos_sim22, f)
    
with open(os.path.join(data_path,'cos_sim23.pkl'), 'wb') as f:
    pickle.dump(cos_sim23, f)
    
with open(os.path.join(data_path,'dist_matrix22.pkl'), 'wb') as f:
    pickle.dump(dist_matrix22, f)
    
with open(os.path.join(data_path,'dist_matrix23.pkl'), 'wb') as f:
    pickle.dump(dist_matrix23, f)