In [1]:
import pandas as pd
import psycopg2
from config import *

In [2]:
# Open the PostgreSQL connection used by every query cell below.
# DATABASE, USER, PASSWORD, HOST and PORT are not defined in this notebook;
# presumably they are supplied by the `from config import *` line above.
conn = psycopg2.connect(
    host=HOST,
    port=PORT,
    database=DATABASE,
    user=USER,
    password=PASSWORD,
)

In [3]:
# Open a cursor on the connection; the query cells below all reuse it.
cursor = conn.cursor()

In [4]:
# Zero-row probe of `races`: LIMIT 0 returns no data, but the cursor still
# exposes the result-set description, whose first field is the column name.
cursor.execute("Select * FROM races LIMIT 0")
colnames = [column[0] for column in cursor.description]
colnames

['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url']

In [5]:
# Load the whole `races` table into a DataFrame.
#
# Column names are read from this query's own cursor.description rather than
# from the global `colnames`: that global is reassigned further down for the
# `results` table, so re-running this cell after that point would pair the
# races rows with the wrong header list and raise a column-count error.
cursor.execute("SELECT * FROM races")
data = cursor.fetchall()
race_df = pd.DataFrame(data, columns=[desc[0] for desc in cursor.description])
race_df.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...


In [6]:
# Restrict the races to the 1990 season onwards, then report (rows, columns).
race_df = race_df.loc[race_df['year'].ge(1990)]
race_df.shape

(574, 8)

In [7]:
# Keep only circuits that appear at least 10 times in the current race_df
# (i.e. more than 9 races), then report (rows, columns).
race_df = race_df.groupby(by="circuitId").filter(
    lambda races_at_circuit: races_at_circuit.shape[0] >= 10
)
race_df.shape

(470, 8)

In [8]:
# file_path = "../Resources/PythonExport/races_modern_high_freq.csv"
# race_df = pd.read_csv(file_path)
# race_df.head()

In [9]:
# Zero-row probe of `results`: LIMIT 0 returns no data, but the cursor still
# exposes the result-set description, whose first field is the column name.
# Note this overwrites the `colnames` list built earlier for `races`.
cursor.execute("Select * FROM results LIMIT 0")
colnames = [column[0] for column in cursor.description]
colnames

['resultId',
 'raceId',
 'driverId',
 'constructorId',
 'number',
 'grid',
 'position',
 'positionText',
 'positionOrder',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastestLap',
 'rank',
 'fastestLapTime',
 'fastestLapSpeed',
 'statusId']

In [10]:
# Load the whole `results` table into a DataFrame.
#
# Column names are read from this query's own cursor.description rather than
# from the global `colnames`, so the cell no longer depends on which schema
# probe happened to run last (the same name is also used for `races`).
#
# NOTE(review): no later cell in view queries the database again; consider
# closing `cursor` and `conn` once loading is finished.
cursor.execute("SELECT * FROM results")
data = cursor.fetchall()
results_df = pd.DataFrame(data, columns=[desc[0] for desc in cursor.description])
results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [None]:
# file_path = "../Resources/Dataset/results.csv"
# results_df = pd.read_csv(file_path)
# results_df.head()

In [None]:
# Attach race metadata to every result row. A left join keeps all results;
# rows whose raceId is not in the filtered race_df get NaN in the race columns.
# Both frames have a `time` column, so pandas suffixes them as time_x / time_y.
raceresults_df = pd.merge(results_df, race_df, how='left', on='raceId')

In [None]:
# Discard result rows that did not match a race in the filtered race_df
# (the left join leaves their circuitId as NaN), plus any row missing one of
# the other two features clustered on below.
#
# The null check is limited to those three columns: a bare dropna() would also
# throw away rows that merely have a null in a column the next cell drops.
# NOTE(review): whether the database actually holds such nulls is not visible
# from this notebook - compare row counts before/after to confirm.
#
# drop=True stops the old row labels being added as a spurious `index` column.
raceresults_df = (
    raceresults_df
    .dropna(subset=['statusId', 'constructorId', 'circuitId'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five merged rows.
raceresults_df.head(n=5)

In [None]:
# Remove the race/result detail columns that are not used further down.
#
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so the cell can be
# re-run without raising KeyError. The duplicated 'round' entry was removed.
raceresults_df = raceresults_df.drop(
    columns=[
        'url', 'time_y', 'date', 'round',
        'positionOrder', 'positionText', 'position', 'grid', 'number',
        'points', 'laps', 'time_x', 'milliseconds',
        'fastestLap', 'fastestLapSpeed', 'rank', 'fastestLapTime',
    ],
    errors='ignore',
)

In [None]:
# Preview the first five rows after the column clean-up.
raceresults_df.head(n=5)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Partition the rows into 3 clusters using statusId, constructorId and circuitId.
#
# random_state pins the centroid initialisation so the labels (and every plot
# coloured by them) are the same on each Restart & Run All.
# n_init is given explicitly because its default differs between scikit-learn
# releases; 10 is the long-standing value.
#
# NOTE(review): all three inputs are integer ID columns that KMeans treats as
# plain numeric coordinates, unscaled - confirm that is the intended encoding.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
y_predicted = km.fit_predict(raceresults_df[['statusId', 'constructorId', 'circuitId']])
y_predicted

In [None]:
# Store each row's KMeans label in a `cluster` column and display the frame.
raceresults_df = raceresults_df.assign(cluster=y_predicted)
raceresults_df

In [None]:
import plotly.express as px


In [None]:
# 3-D scatter of the three clustering inputs; marker colour and symbol both
# encode the `cluster` column. The legend is anchored at x=0, y=1.
fig = px.scatter_3d(
    data_frame=raceresults_df,
    x="statusId", y="constructorId", z="circuitId",
    color="cluster", symbol="cluster",
    width=800,
).update_layout(legend={"x": 0, "y": 1})
fig.show()

In [None]:
import hvplot.pandas

In [None]:
# 2-D scatter of statusId against constructorId, one series per `cluster` value.
raceresults_df.hvplot.scatter(
    x="statusId",
    y="constructorId",
    by="cluster",
)


In [None]:
import plotly.figure_factory as ff

In [None]:
# Dendrogram over the same three columns used for clustering, rendered at
# 800x500 with a colour threshold of 10.
fig = ff.create_dendrogram(
    raceresults_df[['statusId', 'constructorId', 'circuitId']],
    color_threshold=10,
).update_layout(width=800, height=500)
fig.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Fit a 3-cluster agglomerative model on the three clustering columns.
# fit() returns the estimator itself, so `agg` and `model` are the same object.
model = AgglomerativeClustering(n_clusters=3).fit(
    raceresults_df.loc[:, ['statusId', 'constructorId', 'circuitId']]
)
agg = model

In [None]:
# Store each row's agglomerative label in a `class` column, then preview.
# `class` is a Python keyword, so the column name is passed through a dict.
raceresults_df = raceresults_df.assign(**{'class': model.labels_})
raceresults_df.head(n=5)

In [None]:
# 2-D scatter of statusId against constructorId, one series per `class` value.
raceresults_df.hvplot.scatter(
    x="statusId",
    y="constructorId",
    by='class',
)