In [None]:
# Imports
#from datetime import datetime
import datetime as dt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math
#import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import plotly.tools as tls
import plotly.figure_factory as ff

'''For ML:'''
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

#plotly.__version__


In [None]:
# Show which data files are available in the input directory
available_files = os.listdir("../input")
print(available_files)

In [None]:
# Load the raw tables used throughout the notebook from ../input.
pitstops = pd.read_csv('../input/pitStops.csv')
results = pd.read_csv('../input/results.csv')
races = pd.read_csv('../input/races.csv')
# circuits and drivers are read with an explicit latin1 encoding
circuits = pd.read_csv('../input/circuits.csv', encoding='latin1')
drivers = pd.read_csv('../input/drivers.csv', encoding='latin1')

# Lap-by-lap timing table; it is the basis for the yellow-flag detection below
laptimes = pd.read_csv('../input/lapTimes.csv')
laptimes.head()


In [None]:
# Time behind leader, step 1: cumulative race time per driver.
# Order the laps chronologically within each (race, driver) ...
laptimes = laptimes.sort_values(by=['raceId', 'driverId', 'lap'])

# ... then 'totalmilli' is the running sum of lap times up to and including this lap.
laptimes['totalmilli'] = laptimes.groupby(['raceId', 'driverId'])['milliseconds'].cumsum()


In [None]:
# Two shifted copies of the per-lap standings let every row be joined with the rows
# of the cars one position away from it on the same lap of the same race.
standing_cols = ['raceId', 'lap', 'position', 'totalmilli']

# position + 1: after the join, a row at position p receives the totalmilli of position p - 1
laptimes_2 = (
    laptimes[standing_cols]
    .assign(position=lambda d: d['position'] + 1)
    .rename(columns={'position': "position_plus_1", 'totalmilli': 'totalmilli_plus_1'})
)

# position - 1: after the join, a row at position p receives the totalmilli of position p + 1
laptimes_3 = (
    laptimes[standing_cols]
    .assign(position=lambda d: d['position'] - 1)
    .rename(columns={'position': "position_min_1", 'totalmilli': 'totalmilli_min_1'})
)

# Left joins keep every lap row even when there is no neighbour on one side
own_keys = ['raceId', 'lap', 'position']
merged = laptimes.merge(laptimes_2, how='left', left_on=own_keys,
                        right_on=['raceId', 'lap', 'position_plus_1'])
merged = merged.merge(laptimes_3, how='left', left_on=own_keys,
                      right_on=['raceId', 'lap', 'position_min_1'])

# Gaps in cumulative race time to the two neighbouring positions
merged['to_in_front'] = merged['totalmilli'].sub(merged['totalmilli_plus_1'])
merged['to_behind'] = merged['totalmilli_min_1'].sub(merged['totalmilli'])

In [None]:
# Sanity check of the computed gaps: 'to_in_front' is expected to be >= 0,
# so the "less than zero" count should be small or empty.
gap_ahead = merged['to_in_front']
print("positive:", merged[gap_ahead > 0].shape)
print("equal zero:", merged[gap_ahead == 0].shape)
print('less than zero', merged[gap_ahead < 0].shape)

In [None]:
# The shifted helper columns were only needed to compute the gaps; remove them
helper_cols = ['position_plus_1', 'totalmilli_plus_1', 'position_min_1', 'totalmilli_min_1']
merged = merged.drop(columns=helper_cols)

# From here on the lap table carries 'to_in_front' and 'to_behind'
laptimes = merged.copy()
laptimes.head()


## Creating parameters:

### DBs descriptions:

pitstops: 'raceId', 'driverId', 'stop'(order number), 'lap', 'time'(real time), 'duration',
       'milliseconds'
       
results: 'resultId', 'raceId', 'driverId', 'constructorId'??, 'number', 'grid',
       'position'(many NaN), 'positionText'(has letters R, D), 'positionOrder'(only numbers),
        'points', 'laps', 'time', 'milliseconds'(total time for the race), 'fastestLap' (order number),
        'rank'?, 'fastestLapTime', 'fastestLapSpeed', 'statusId'??
        
races: 'raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url' (just name, date, Wiki page
                                                                             of the race)
                                                                             
circuits: 'circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng',
       'alt', 'url'
       
drivers: 'driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'url'

### Calculating the average (finish) position for each driver:

In [None]:
# Average finishing position per driver. Only result rows with a recorded total
# race time ('milliseconds' not null) are used, i.e. races the driver finished.
finished_results = results[results['milliseconds'].notnull()]
avg_position = (
    finished_results
    .groupby(['driverId'])['position']
    .mean()
    .to_frame(name='avg_position')
)

In [None]:
# Add the driver's average finishing position as a column of results.
# avg_position is indexed by driverId, hence the column-to-index join.
results = pd.merge(results, avg_position, left_on='driverId', right_index=True)

### Calculating relative position at every lap:

In [None]:
# Attach each driver's average finishing position to every lap row ...
laptimes = pd.merge(laptimes, avg_position, left_on='driverId', right_index=True)
# ... and express the current position relative to it
# (positive: the position number on this lap is lower than the driver's average).
laptimes['relative_to_avg'] = laptimes['avg_position'].sub(laptimes['position'])
laptimes.head()

'''Notes:'''

pitstops_df - only has data for 841-988 raceId
    pitstops['raceId'].hist()
laptimes_df - strange (but the last interval = 841-988 )
    laptimes['raceId'].hist(bins = 100)
results-df - fine
    results['raceId'].hist(bins = 100)
    
'''Starts from the 2nd pitstop'''
laptimes[(laptimes['raceId']==908)&(laptimes['driverId']==820)].head()

'''Maybe add avg_ps_duration?'''

### Merging pitstops_df with laptime_df:

In [None]:
# Give the pit-stop columns names that cannot clash with the lap-time columns
pitstops = pitstops.rename(columns={'stop': 'ps_order',
                                    'time': 'exact_ps_time',
                                    'milliseconds': 'ps_duration'})

# Keep only races from raceId 841 on (the range for which pit-stop data exists,
# see the notes above) and attach the pit stops without the textual "duration" column.
pit_info = pitstops.drop(columns=['duration'])
laps_with_ps_data = laptimes[laptimes['raceId'] >= 841]
laptimes = laps_with_ps_data.merge(pit_info, how='left', on=['raceId', 'driverId', 'lap'])

### Calculating 'minLaps' and 'yellowThreshold'

In [None]:
# Fastest lap time set by any driver on every (race, lap)
minLaps = laptimes.groupby(['raceId', 'lap'])['milliseconds'].min().reset_index()

# Fastest single lap of every race
BestLapinRaces = laptimes.groupby(['raceId'])['milliseconds'].min().reset_index()

# yellowThreshold = fastest lap in race * 1.1
BestLapinRaces['yellowThreshold'] = BestLapinRaces['milliseconds'] * 1.1

# flag = 1 when the fastest time on that lap is strictly below the race threshold,
# 0 otherwise (the whole field was slow on that lap)
race_thresholds = BestLapinRaces[['raceId', 'yellowThreshold']]
minLaps = pd.merge(minLaps, race_thresholds, how='left', on='raceId')
minLaps['flag'] = minLaps['milliseconds'].lt(minLaps['yellowThreshold']).astype(int)

# index=str also turns the row labels into strings, as before
minLaps.rename(index=str, columns={"milliseconds": "minLap"}, inplace=True)

In [None]:
# Bring minLap, yellowThreshold and flag onto every lap row of the same (race, lap)
laptimes = pd.merge(laptimes, minLaps, how='left', on=['raceId', 'lap'])

### Calculating previous lap times
We take the lap times in milliseconds for the previous 2 laps, so we only observe cars from the 3rd lap onward. Although there are cases where a driver made a pit stop on the 1st or 2nd lap, we ignore them.

In [None]:
# Histogram of pit stops made during the first laps (lap < 10), to get a feeling
# for how many stops are ignored by starting the observation at lap 3
made_stop = laptimes['ps_order'].notnull()
early_lap = laptimes['lap'] < 10
laptimes[made_stop & early_lap]['lap'].hist(bins=20)
plt.title('Pitstops')
plt.xlabel('lap')
plt.ylabel('number of pit stops')

### Getting the time for the previous two laps and putting those into separate columns:

In [None]:
# One lap time per (race, driver, lap); it is the source for both look-back columns
lap_keys = ['raceId', 'driverId', 'lap']
lap_time_lookup = laptimes.groupby(lap_keys)['milliseconds'].first().reset_index()

# Shifting the lap number forward by 1 makes lap N's time join onto lap N + 1 ...
lap_min_1 = (lap_time_lookup
             .assign(lap=lap_time_lookup['lap'] + 1)
             .rename(columns={"milliseconds": "milli_for_min_1"}))

# ... and by 2 makes it join onto lap N + 2
lap_min_2 = (lap_time_lookup
             .assign(lap=lap_time_lookup['lap'] + 2)
             .rename(columns={"milliseconds": "milli_for_min_2"}))

# Left joins: laps without a predecessor get NaN in the look-back columns
for lookback in (lap_min_1, lap_min_2):
    laptimes = laptimes.merge(lookback, how='left', on=lap_keys)

### Calculating time since last ps
We assume that the first pit stop is done at the start of each race by each driver

In [None]:
# Time elapsed since the previous pit stop, filled in only on laps where a stop was made;
# all other laps stay NaN for now. The first recorded stop of a (race, driver) is measured
# from the start of the race, i.e. the start is treated as an implicit earlier stop.
laptimes['since_last_ps'] = np.nan

pit_laps = laptimes.loc[laptimes['ps_order'].notnull(),
                        ['raceId', 'driverId', 'ps_order', 'totalmilli']]

# Cumulative race time at the previous recorded stop of the same (race, driver).
# It is NaN for the first recorded stop, which also covers drivers whose earlier stops
# are missing from the lap data (the notes above mention raceId 908 / driverId 820),
# so no hard-coded row index is needed to skip such rows and no value from an
# unrelated driver can leak into them.
prev_stop_total = (pit_laps
                   .sort_values(by=['raceId', 'driverId', 'ps_order'])
                   .groupby(['raceId', 'driverId'])['totalmilli']
                   .shift(1))

# Index-aligned assignment of scalars per row (NaN previous stop -> measured from time 0)
laptimes.loc[pit_laps.index, 'since_last_ps'] = (
    pit_laps['totalmilli'] - prev_stop_total.reindex(pit_laps.index).fillna(0)
)

'''There is some data issue for (raceId, driverId) = (908,820)
The time for the 2nd lap is very big'''
# laptimes[(laptimes['raceId']==908)&(laptimes['driverId']==820)]

In [None]:
# Turn 'since_last_ps' into a per-lap increment for the running sum computed next:
#   - lap without a pit stop (NaN): the lap time itself
#   - lap with a pit stop: lap time minus the time accumulated since the previous stop,
#     which brings the running sum back to zero on that lap
lap_ms = laptimes['milliseconds']
no_stop_on_lap = laptimes['since_last_ps'].isna()
laptimes['since_last_ps'] = lap_ms.where(no_stop_on_lap, lap_ms - laptimes['since_last_ps'])

In [None]:
# The running sum below needs the laps in chronological order per (race, driver)
laptimes = laptimes.sort_values(by=['raceId', 'driverId', 'lap'])

# Final 'since_last_ps': running sum of the per-lap increments within each (race, driver)
laptimes['since_last_ps'] = laptimes.groupby(['raceId', 'driverId'])['since_last_ps'].cumsum()

### Creating new df and shifting the data for the previous lap to match the pit stop

In [None]:
# Build the modelling frame: every lap row additionally receives the state of the
# same driver on the PREVIOUS lap, so a pit stop can be related to what was known
# before the lap on which it happened.
temp_df = laptimes.copy()

prev_lap_cols = ['position', 'totalmilli', 'to_in_front', 'to_behind',
                 'minLap', 'flag', 'relative_to_avg', 'since_last_ps']

# Columns must be selected with a list: tuple-style selection on a GroupBy
# (groupby(...)['a', 'b']) is deprecated and raises on current pandas.
min_1_df = temp_df.groupby(['raceId', 'driverId', 'lap'])[prev_lap_cols].last()
min_1_df.reset_index(inplace=True)

# Shift the lap number forward so lap N's values join onto lap N + 1
min_1_df['lap'] = min_1_df['lap'] + 1

# The shared column names get pandas' default suffixes:
# '_x' = value on the current lap, '_y' = value on the previous lap
temp_df = temp_df.merge(min_1_df, how='left', on=['raceId', 'driverId', 'lap'])


# ML:

In [None]:
'''We do not observe (race, driver, lap) for drivers who are at the position #1, because we do not have
to_in_front_y data for them.
Also we do not observe to_behind_y for some (race, driver, lap).
To better understand what data we lose, below are the codes to plot distributions
(uncommnet some of the lines):'''
# temp_df[(temp_df['milli_for_min_2'].notnull())
# #         &(temp_df['to_in_front_y'].isnull())
# #         &(temp_df['to_behind_y'].isnull())
#         ]['position_y'].hist(bins=100)

### Making a function to visualize DecisionTree

In [None]:
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
# A function that gives a visual representation of the decision tree
def show_decision_tree(model):
    """Build a Graphviz graph of a fitted decision tree.

    Parameters
    ----------
    model : fitted scikit-learn decision tree estimator
        The tree to export.

    Returns
    -------
    graphviz.Source
        Graph object created from the DOT text exported for ``model``.
    """
    # Imported locally so the function keeps working after the notebook rebinds
    # the global name `tree` to a classifier instance in a later cell.
    from sklearn.tree import export_graphviz

    # Export the model that was passed in; previously the global `ps_tree` was
    # exported and the `model` argument was ignored.
    dot_data = export_graphviz(model, out_file=None)
    # To save the drawing to a file instead, call .render(<filename>) on the result.
    return graphviz.Source(dot_data)

In [None]:
# Target: turn ps_order into 1 (there was a pit stop on this lap) / 0 (there was not)
temp_df['ps_order'] = temp_df['ps_order'].notnull().astype(int)

# Features: previous-lap state ('_y' columns) plus the two previous lap times
ps_feature_cols = ['position_y', 'totalmilli_y', 'to_in_front_y', 'to_behind_y', 'minLap_y',
                   'flag_y', 'relative_to_avg_y', 'milli_for_min_1', 'milli_for_min_2',
                   'since_last_ps_y']

# Rows with any missing feature are dropped before modelling
X_y = temp_df[['ps_order'] + ps_feature_cols].dropna()
X = X_y[ps_feature_cols].values
# y is kept as a column vector of shape (n, 1)
y = X_y[['ps_order']].values

In [None]:
# Split the whole sample into train (80%) and test (20%), stratified on the target
# so both parts keep the same share of pit-stop laps
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# Depth-limited tree (max_depth = 5). random_state is fixed because scikit-learn
# permutes the features at every split, so an unseeded tree can differ between runs.
ps_tree = DecisionTreeClassifier(max_depth=5, criterion="gini", random_state=123)
# ravel(): fit expects a 1-D target; the (n, 1) column vector triggers a conversion warning
ps_tree.fit(train_X, train_y.ravel())

pred_y = ps_tree.predict(test_X)

In [None]:
# Accuracy via the estimator's own score() method
test_accuracy = ps_tree.score(test_X, test_y)
train_accuracy = ps_tree.score(train_X, train_y)
print("accuracy on the test set", test_accuracy)
print("accuracy on the training set", train_accuracy)

# Same test accuracy, computed with the stand-alone metric from sklearn.metrics
print("accuracy on the test set", accuracy_score(pred_y, test_y))

In [None]:
# Visualizing the fitted decision tree
show_decision_tree(ps_tree)

In [None]:
# Count the number of laps with a pit stop (1) and without a pit stop (0)
from collections import Counter
# y is an (n, 1) column vector; flatten it to a plain list of labels
y_list = y.ravel().tolist()
Counter(y_list)

# END

# With limit to only top 9:

In [None]:
# Keep only the laps of the top 9 finishers of every race.
# Author's note kept from the original: raceId is not consistent across the races.
is_top9 = results['positionOrder'] < 10
leaders = results.loc[is_top9, ['raceId', 'driverId']]

# Restrict the lap table to those (race, driver) pairs
winLaps = laptimes.merge(leaders, on=['raceId', 'driverId'])

# Fastest lap time among them on every (race, lap).
# Despite its name, avgLaps holds the per-lap minimum, not an average.
avgLaps = winLaps.groupby(['raceId', 'lap'])['milliseconds'].min().reset_index()

# Fastest single lap of every race
BestLapSpeed = avgLaps.groupby(['raceId'])['milliseconds'].min().reset_index()

# yellowThreshold = fastest lap in race * 1.1
BestLapSpeed['yellowThreshold'] = BestLapSpeed['milliseconds'] * 1.1

# flag = 1 when the fastest time on that lap is strictly below the race threshold, else 0
race_thresholds = BestLapSpeed[['raceId', 'yellowThreshold']]
avgLaps = avgLaps.merge(race_thresholds, on='raceId')
avgLaps['flag'] = avgLaps['milliseconds'].lt(avgLaps['yellowThreshold']).astype(int)

# index=str also turns the row labels into strings, as before
avgLaps = avgLaps.rename(index=str, columns={"milliseconds": "avgLap"})

# One panel per race (first 1000 rows only): reference lap time by lap, coloured by flag
sns.set(style="whitegrid")
g = sns.FacetGrid(avgLaps.head(1000), col="raceId", aspect=1, col_wrap=5, height=5, hue='flag')
g = g.map(sns.scatterplot, "lap", "avgLap", s=100)

#g = g.map(sns.scatterplot, "year", "lap", s=150)
#g = g.map(sns.violinplot, 'year','lap',hue="positionText" )
#ax = sns.violinplot(x="year", y="lap",  data=main, height = 50)

#sns.scatterplot(data=winners.loc[winners['raceId']==21].loc[winners['pit']==1],  x="lap", y="milliseconds_y", s=550,hue = 'pit')
#sns.scatterplot(data=winners.loc[winners['raceId']==960].loc[winners['pit']==1],  x="lap", y="milliseconds_y", s=100,hue = 'pit')
#sns.scatterplot(data=winners.loc[winners['raceId']==962],  x="lap", y="milliseconds_y", s=100,hue = 'pit')



In [None]:
# Attach the reference pace (avgLap / yellowThreshold / flag) to every lap row.
# By this point laptimes already carries its own 'flag' and 'yellowThreshold' from the
# minLaps merge; they are dropped first so the avgLaps columns keep their plain names
# instead of becoming 'flag_x' / 'flag_y' (which made winners['flag'] a KeyError).
lap_base = laptimes.drop(columns=['flag', 'yellowThreshold'], errors='ignore')
laptimes2 = lap_base.merge(avgLaps, on=['raceId', 'lap']).reset_index()

# Result rows of the top 9 finishers, joined with all of their laps.
# Shared column names are suffixed: '_x' = results, '_y' = lap table
# (e.g. 'milliseconds_y' is the lap time, 'position_y' the position on that lap).
winners = results.loc[results['positionOrder'] < 10]
winners = winners.merge(laptimes2, on=['raceId', 'driverId'])

# The pit-stop duration column was renamed to 'ps_duration' earlier in the notebook;
# this section works with it under the name 'milliseconds'. rename() ignores the
# mapping when the column is absent, so an un-renamed pitstops table works as well.
pit_rows = pitstops.rename(columns={'ps_duration': 'milliseconds'})
winners = winners.merge(pit_rows, how='outer', on=['raceId', 'driverId', 'lap'])

# pit = 1 on laps with a pit-stop record; laps without one count as 0 ms in the pits
winners['pit'] = winners['duration'].notnull().astype(int)
# plain assignment instead of a chained inplace fillna, which may act on a temporary copy
winners['milliseconds'] = winners['milliseconds'].fillna(0)

# Lap time with the pit-stop duration taken out
winners['lap_millis'] = winners['milliseconds_y'] - winners['milliseconds']

# Continue with a single race only
winners = winners.loc[winners['raceId'] == 952]  # 950
winners.loc[winners['pit'] == 1].head()
winners.loc[winners['flag'] == 0].head()

## ML:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Features and target for the pit-stop model on the winners frame;
# rows with any missing value among them are dropped
winner_feature_cols = ['positionOrder', 'totalmilli', 'to_in_front', 'to_behind', 'avgLap', 'flag']
tempdf = winners[winner_feature_cols + ['pit']].dropna()
X = tempdf[winner_feature_cols].values
# y is kept as a column vector of shape (n, 1)
y = tempdf[['pit']].values

In [None]:
# 80/20 train/test split with a fixed seed, stratified on the target so both
# parts keep the same share of pit-stop laps
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=123, stratify=y)

In [None]:
'''Checking if the split is correct'''
# print('All:', np.bincount(y) / float(len(y)) * 100.0)
# print('Training:', np.bincount(train_y) / float(len(train_y)) * 100.0)
# print('Test:', np.bincount(test_y) / float(len(test_y)) * 100.0)


In [None]:
# Unrestricted decision tree. random_state is fixed because scikit-learn permutes the
# features at every split, so an unseeded tree can differ between runs.
# NOTE(review): the name `tree` shadows the `sklearn.tree` module imported earlier in the
# notebook; it is kept because the following cells refer to the model by this name.
tree = DecisionTreeClassifier(random_state=123)
# ravel(): fit expects a 1-D target; the (n, 1) column vector triggers a conversion warning
tree.fit(train_X, train_y.ravel())

### Predict:

In [None]:
# Predicted pit-stop labels for the held-out test rows
pred_y = tree.predict(test_X)

### Evaluate:

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
# Accuracy via the estimator's own score() method
test_accuracy = tree.score(test_X, test_y)
train_accuracy = tree.score(train_X, train_y)
print("accuracy on the test set", test_accuracy)
print("accuracy on the training set", train_accuracy)

# Same test accuracy, computed with the stand-alone metric from sklearn.metrics
print("accuracy on the test set", accuracy_score(pred_y, test_y))

In [None]:
# Number of modelling rows per value of 'pit'; puts the accuracy figures above in context
tempdf['pit'].value_counts()

# Without limits to top 9:

In [None]:
# Same reference-pace computation as in the top-9 section, but with every classified
# driver: no positionOrder filter is applied here.
# Author's note kept from the original: raceId is not consistent across the races.
leaders = results[['raceId', 'driverId']]

# Lap rows of all (race, driver) pairs present in results
winLaps = laptimes.merge(leaders, on=['raceId', 'driverId'])

# Fastest lap time on every (race, lap).
# Despite its name, avgLaps holds the per-lap minimum, not an average.
avgLaps = winLaps.groupby(['raceId', 'lap'])['milliseconds'].min().reset_index()

# Fastest single lap of every race
BestLapSpeed = avgLaps.groupby(['raceId'])['milliseconds'].min().reset_index()

# yellowThreshold = fastest lap in race * 1.1
BestLapSpeed['yellowThreshold'] = BestLapSpeed['milliseconds'] * 1.1

# flag = 1 when the fastest time on that lap is strictly below the race threshold, else 0
race_thresholds = BestLapSpeed[['raceId', 'yellowThreshold']]
avgLaps = avgLaps.merge(race_thresholds, on='raceId')
avgLaps['flag'] = avgLaps['milliseconds'].lt(avgLaps['yellowThreshold']).astype(int)

# index=str also turns the row labels into strings, as before
avgLaps = avgLaps.rename(index=str, columns={"milliseconds": "avgLap"})

In [None]:
# Attach the reference pace (avgLap / yellowThreshold / flag) to every lap row.
# By this point laptimes already carries its own 'flag' and 'yellowThreshold' from the
# minLaps merge; they are dropped first so the avgLaps columns keep their plain names
# instead of becoming 'flag_x' / 'flag_y' (which made winners['flag'] a KeyError).
lap_base = laptimes.drop(columns=['flag', 'yellowThreshold'], errors='ignore')
laptimes2 = lap_base.merge(avgLaps, on=['raceId', 'lap']).reset_index()

# All result rows (no top-9 restriction), joined with all laps of each driver.
# Shared column names are suffixed: '_x' = results, '_y' = lap table.
winners = results.merge(laptimes2, on=['raceId', 'driverId'])

# The pit-stop duration column was renamed to 'ps_duration' earlier in the notebook;
# this section works with it under the name 'milliseconds'. rename() ignores the
# mapping when the column is absent, so an un-renamed pitstops table works as well.
pit_rows = pitstops.rename(columns={'ps_duration': 'milliseconds'})
winners = winners.merge(pit_rows, how='outer', on=['raceId', 'driverId', 'lap'])

# pit = 1 on laps with a pit-stop record; laps without one count as 0 ms in the pits
winners['pit'] = winners['duration'].notnull().astype(int)
# plain assignment instead of a chained inplace fillna, which may act on a temporary copy
winners['milliseconds'] = winners['milliseconds'].fillna(0)

# Lap time with the pit-stop duration taken out
winners['lap_millis'] = winners['milliseconds_y'] - winners['milliseconds']

# Continue with a single race only
winners = winners.loc[winners['raceId'] == 952]  # 950
winners.loc[winners['pit'] == 1].head()
winners.loc[winners['flag'] == 0].head()

## ML:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Features and target; rows with any missing value among them are dropped
all_feature_cols = ['positionOrder', 'totalmilli', 'to_in_front', 'to_behind', 'avgLap', 'flag']
tempdf = winners[all_feature_cols + ['pit']].dropna()
X = tempdf[all_feature_cols].values
# y is kept as a column vector of shape (n, 1)
y = tempdf[['pit']].values

# 80/20 train/test split with a fixed seed, stratified on the target
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

# Unrestricted decision tree. random_state is fixed because scikit-learn permutes the
# features at every split, so an unseeded tree can differ between runs.
# NOTE(review): the name `tree` shadows the `sklearn.tree` module imported earlier.
tree = DecisionTreeClassifier(random_state=123)
# ravel(): fit expects a 1-D target; the (n, 1) column vector triggers a conversion warning
tree.fit(train_X, train_y.ravel())

# Predict
pred_y = tree.predict(test_X)

# Evaluate: accuracy via the estimator's own score() method ...
print("accuracy on the test set", tree.score(test_X, test_y))
print("accuracy on the training set", tree.score(train_X, train_y))

# ... and via the stand-alone metric from sklearn.metrics
print("accuracy on the test set", accuracy_score(pred_y, test_y))

In [None]:
# Number of modelling rows per value of 'pit'; puts the accuracy figures above in context
tempdf['pit'].value_counts()

In [None]:

# Rows of whoever is running in position 1 on each lap ('position_y' = position on that lap)
in_first_place = winners['position_y'] == 1
leader = winners.loc[in_first_place].rename(index=str, columns={"totalmilli": "lead_milli"})

# Give every row the leader's cumulative race time on the same lap
leader_times = leader[['lap', 'lead_milli']]
winners = winners.merge(leader_times, on='lap')

# Gap to the leader in cumulative race time
winners['behind'] = winners['totalmilli'].sub(winners['lead_milli'])
winners.head()


In [None]:
# Let's build our plot
import matplotlib.pyplot as plt
%matplotlib inline
# needed for jupyter notebooks
    
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax3 = fig.add_subplot(212)

#fig, ax1 = plt.subplots()
ax2 = ax1.twinx()  # set up the 2nd axis

car = winners.loc[winners['positionOrder']==1]
color = 'blue'

ret = ax1.bar(data=car.loc[car['pit']==1], x="lap", height="milliseconds", width=1, alpha=0.2, color=color) #plot the Revenue on axis #1
# the next few lines plot the fiscal year data as bar plots and changes the color for each.
#ax2.bar(data=car,  x="lap", height="milliseconds_y",width=2, alpha=0.2, color='blue')
ax2.scatter(data=car, x='lap', y = 'lap_millis',color=color, alpha=0.2)
ax3.scatter(x='lap', y = 'behind', data=car,color=color, alpha=0.2)
ax3.scatter(x='lap', y = 'to_in_front',s=2, data=car, color=color, alpha=0.3)
ax3.scatter(x='lap', y = 'to_behind',s=2, data=car, color=color, alpha=0.3)

car = winners.loc[winners['positionOrder']==2]
color = 'red'

ret = ax1.bar(data=car.loc[car['pit']==1], x="lap", height="milliseconds", width=1, alpha=0.2, color=color) #plot the Revenue on axis #1
# the next few lines plot the fiscal year data as bar plots and changes the color for each.
#ax2.bar(data=car,  x="lap", height="milliseconds_y",width=2, alpha=0.2, color='blue')
ax2.scatter(data=car, x='lap', y = 'lap_millis',color=color, alpha=0.2)
ax3.scatter(x='lap', y = 'behind', data=car,color=color, alpha=0.2)
ax3.scatter(x='lap', y = 'to_in_front',s=2, data=car, color=color, alpha=0.3)
ax3.scatter(x='lap', y = 'to_behind',s=2, data=car, color=color, alpha=0.3)


ret = ax2.bar(data=car.loc[car['flag']==0], x="lap", height='milliseconds_y', width=1, alpha=0.4, color='yellow') #plot the Revenue on axis #1


ax2.grid(b=False) # turn off grid #2
ax1.set_ylim(8000,35000)
ax2.set_ylim(75000,160000)

ax1.set_title('Title')
ax1.set_ylabel('ylabel')
ax2.set_ylabel('y2Label')
 
#ax3.plot(data=car, x='lap', y = 'position_y',color='red', alpha=0.2)
    
# Set the x-axis labels to be more meaningful than just some random dates.
#labels = ['FY 2010', 'FY 2011','FY 2012', 'FY 2013','FY 2014', 'FY 2015']
#ax1.axes.set_xticklabels(labels)


In [None]:
# Colour key for the seaborn plot further down: the position on each lap
winners['hue'] = winners['position_y'] #/4
# pyplot instead of pylab: pylab is a discouraged catch-all namespace, and importing it
# under the name `plt` silently replaced the pyplot alias used by the rest of the notebook.
import matplotlib.pyplot as plt


# Lap time against lap number for every row of winners
plt.scatter(data=winners, x='lap', y = 'milliseconds_y')
plt.show()

In [None]:

# Lap time by lap for one race, coloured by the 'hue' column and marked by 'position_x'.
sns.set(style="whitegrid")
#sns.set(rc={'figure.figsize':(6,7)})
#g = sns.FacetGrid(winners.head(1000), col="raceId", aspect=1 ,col_wrap=5, height=5, hue='pit')
#g = g.map(sns.scatterplot, "lap", "milliseconds_y", s=100)

#g = g.map(sns.scatterplot, "year", "lap", s=150)
#g = g.map(sns.violinplot, 'year','lap',hue="positionText" )
#ax = sns.violinplot(x="year", y="lap",  data=main, height = 50)

#sns.scatterplot(data=winners.loc[winners['raceId']==21].loc[winners['pit']==1],  x="lap", y="milliseconds_y", s=550,hue = 'pit')
#sns.scatterplot(data=winners.loc[winners['raceId']==960].loc[winners['pit']==1],  x="lap", y="milliseconds_y", s=100,hue = 'pit')
# NOTE(review): winners was restricted to raceId == 952 when it was built above, so the
# raceId == 964 selection below looks like it yields an empty frame - confirm the intended race.
ret = sns.scatterplot(data=winners.loc[winners['raceId']==964],  x="lap", y="milliseconds_y",style='position_x', s=100,hue = 'hue')

#sns.scatterplot(data=winners.loc[winners['pit']==1],  x="raceId", y="lap", s=100,hue = 'pit')


#palette="Set2"

#sns.pairplot(main, hue="position")


In [None]:
# One row per pit stop, enriched with race metadata and the per-lap reference pace
main = results.copy()
race_circuit = races[['raceId','circuitId','year']]
main = main.merge(race_circuit, on='raceId')

# The pit-stop duration column was renamed to 'ps_duration' earlier in the notebook.
# Mapping it back to 'milliseconds' makes it collide with results' 'milliseconds' again,
# which is what produces the 'milliseconds_y' (pit-stop duration) column used below.
# rename() ignores the mapping when the column is absent.
pit_rows = pitstops.rename(columns={'ps_duration': 'milliseconds'})
main = main.merge(pit_rows, on=['raceId','driverId'])

main = main.merge(avgLaps, on=['raceId','lap']).reset_index()

# Top 7 finishers, seasons after 2011, circuit 4, pit stops shorter than 10 minutes.
# One combined mask plus copy() so the column added below is written to an
# independent frame rather than to a slice of a slice.
keep = (
    (main['positionOrder'] < 8)
    & (main['year'] > 2011)
    & (main['circuitId'] == 4)
    & (main['milliseconds_y'] < 60000 * 10)
)
main = main.loc[keep].copy()

# x spreads the drivers of one season next to each other: year + position / 10
main['x'] = main['year'] + (main['position']/10)

main = main[['positionText','position','circuitId','lap','year','milliseconds_y','x','flag']]
# keep only rows whose positionText is a number (drops letter codes such as R or D)
main = main[main.positionText.apply(lambda x: x.isnumeric())]
main.head()



In [None]:
# Style and figure size are set in ONE call: a separate sns.set(rc=...) afterwards
# re-applies seaborn's default style and would silently undo "whitegrid".
sns.set(style="whitegrid", rc={'figure.figsize':(14,6)})

# Pit-stop lap by season/position, coloured by flag and sized by pit-stop duration.
# The returned Axes is stored under its own name; assigning it to `plt` would
# clobber the matplotlib.pyplot alias for any cell run afterwards.
ax = sns.scatterplot(data=main,  x="x", y="lap", s=550,palette="Set2",hue = 'flag', size = 'milliseconds_y')

#sns.pairplot(main, hue="position")
