In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans 

In [None]:
# Read the training accidents file; 'datetime' is parsed up front so the
# .dt accessor can be used on it in later cells.
data = pd.read_csv(
    '../input/Train.csv',
    parse_dates=['datetime'],
)
data.head(5)

In [None]:
# Show the last five rows of the training frame.
data.tail(5)

In [None]:
# Read the submission template; its 'date' column is parsed as datetimes.
ss = pd.read_csv(
    '../input/SampleSubmission.csv',
    parse_dates=['date'],
)
print(ss.shape)
display(ss.tail(5))

In [None]:
# Edges of the three-hour windows, in hours since midnight: 0, 3, ..., 24.
bins = list(range(0, 25, 3))

# Clock-style descriptions of the same windows (not passed to pd.cut below).
labels_ = [
    '00:00:00 - 02:59:59',
    '03:00:00 - 05:59:59',
    '06:00:00 - 08:59:59',
    '09:00:00 - 11:59:59',
    '12:00:00 - 14:59:59',
    '15:00:00 - 17:59:59',
    '18:00:00 - 20:59:59',
    '21:00:00 - 23:59:59',
]

# Short labels derived from consecutive edges: '0-3', '3-6', ..., '21-24'.
labels = ['{}-{}'.format(start, stop) for start, stop in zip(bins[:-1], bins[1:])]

# right=False makes each window include its left edge and exclude its right edge.
data['time_bin'] = pd.cut(data['datetime'].dt.hour, bins=bins, labels=labels, right=False)
display(data.head(n=10))

In [None]:
# Bucket each submission timestamp with the same edges and labels that were
# applied to the training data.
ss['time_bin'] = pd.cut(
    ss['date'].dt.hour,
    bins=bins,
    labels=labels,
    right=False,
)
display(ss.head(n=5))

In [None]:
# Weekday name of every training record, taken from its timestamp.
data['day'] = data.datetime.dt.day_name()
display(data.tail(5))

In [None]:
# Weekday name of every submission row, taken from its date.
ss['day'] = ss.date.dt.day_name()
display(ss.tail(5))

In [None]:
# Relative frequency of accidents per weekday (normalize=True yields
# fractions of the total rather than raw counts).
accidents_per_day = data.loc[:, 'day'].value_counts(normalize=True)
print(accidents_per_day)

In [None]:
# `accidents_per_day` comes from value_counts(normalize=True), i.e. it holds
# fractions of all accidents, so the figure is labelled as a share rather
# than as a raw count.
# value_counts orders by frequency; reindexing puts the bars in calendar
# order (a weekday with no records would show as 0).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
accidents_per_day.reindex(weekday_order, fill_value=0).plot(
    kind='bar', figsize=(12, 6), title='Share of accidents per day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Share of accidents')
plt.show()

In [None]:
# Fraction of all accidents that fall into each three-hour window
# (normalize=True returns shares, not counts).
accidents_per_time_interval = data['time_bin'].value_counts(normalize=True)

# value_counts orders by frequency; 'time_bin' is the ordered categorical
# produced by pd.cut, so sort_index() puts the windows back in chronological
# order for the plot. The axis labels say "share" to match the normalised values.
accidents_per_time_interval.sort_index().plot(
    kind='bar', figsize=(12, 6), title='Share of accidents per three hour interval')
plt.xlabel('Time of the day')
plt.ylabel('Share of accidents')
plt.show()

In [None]:
# Records that happened on a Tuesday, and the fraction of them that fall into
# each three-hour window (normalize=True returns shares, not counts).
accidents_on_tuesday = data[data['day'] == 'Tuesday']
accidents_on_tuesday_count = accidents_on_tuesday['time_bin'].value_counts(normalize=True)

# sort_index() shows the windows chronologically instead of by frequency;
# the y-axis is labelled as a share because the values are normalised.
accidents_on_tuesday_count.sort_index().plot(
    kind='bar', figsize=(12, 6), title='Accidents on Tuesdays')
plt.xlabel('Time of the day')
plt.ylabel('Share of accidents')
plt.show()

In [None]:
# Records that happened on a Wednesday, and the fraction of them that fall
# into each three-hour window (normalize=True returns shares, not counts).
accidents_on_wednesday = data[data['day'] == 'Wednesday']
accidents_on_wednesday_count = accidents_on_wednesday['time_bin'].value_counts(normalize=True)

# sort_index() shows the windows chronologically instead of by frequency;
# the y-axis is labelled as a share because the values are normalised.
accidents_on_wednesday_count.sort_index().plot(
    kind='bar', figsize=(12, 6), title='Accidents on Wednesdays')
plt.xlabel('Time of the day')
plt.ylabel('Share of accidents')
plt.show()

In [None]:
# From here on only the coordinates of the Tuesday records are kept.
accidents_on_tuesday = accidents_on_tuesday.loc[:, ['longitude', 'latitude']]

# One translucent red marker per record, drawn on the current axes.
ax = plt.gca()
accidents_on_tuesday.plot.scatter(
    x='longitude',
    y='latitude',
    figsize=(12, 12),
    alpha=0.3,
    title='Crash Locations',
    color='r',
    ax=ax,
)
plt.show()

In [None]:
# Cluster the Tuesday coordinates into six groups and draw each group in its
# own colour, with the centroids on top.
#
# - random_state pins the k-means++ initialisation, so cluster ids (and
#   therefore colours) are the same on every run; 42 matches the value used
#   for the per-day/per-window models later in the notebook.
# - n_init is given explicitly because its default differs between
#   scikit-learn releases.
# - `algorithm` is left at its default: the 'full' spelling is rejected by
#   recent scikit-learn releases, while the default is accepted everywhere.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)

# Select the two coordinate columns explicitly; column 0 of the fitted
# centroids is then longitude and column 1 latitude.
tuesday_coords = accidents_on_tuesday[['longitude', 'latitude']]
y_kmeans = kmeans.fit_predict(tuesday_coords)

cluster_colors = ['green', 'blue', 'red', 'brown', 'cyan', 'magenta']

fig, ax = plt.subplots(figsize=(12, 12))
for cluster_id, cluster_color in enumerate(cluster_colors):
    # Boolean mask picks the rows assigned to this cluster.
    tuesday_coords[y_kmeans == cluster_id].plot(
        x='longitude', y='latitude', kind='scatter',
        alpha=0.3, color=cluster_color, ax=ax)

ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
           s=50, c='black', label='Centroids')
ax.set_title('Crash Locations')
ax.legend()  # without a legend the 'Centroids' label is never displayed
plt.show()

In [None]:
# Weekday names in calendar order, as produced by .dt.day_name().
day_names = 'Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.split()

# NOTE: from this point `bins` refers to the list of window labels rather than
# the numeric hour edges; re-running an earlier pd.cut cell after this one
# would pass labels where edges are expected.
bins = labels

In [None]:
# Fit one six-cluster KMeans per (weekday, three-hour window) and collect the
# centroids into `cluster_df`, one row per combination.

# The latitude/longitude bounds do not depend on the day or the window, so
# the filter is applied once instead of inside the loop.
data_in_bounds = data[(data['latitude'] >= -2.0) & (data['longitude'] <= 37.4)]

cluster_data = []

for day in day_names:
    for time_bin in labels:
        data_selected = data_in_bounds[
            (data_in_bounds['day'] == day) &
            (data_in_bounds['time_bin'] == time_bin)
        ]
        # n_init is explicit because its default differs between scikit-learn
        # releases; `algorithm` is left at its default because the 'full'
        # spelling is rejected by recent releases.
        kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10,
                        max_iter=300, random_state=42)
        kmeans.fit(data_selected[['latitude', 'longitude']])
        # cluster_centers_ has one (latitude, longitude) row per cluster;
        # ravel() lays them out as lat0, lon0, lat1, lon1, ...
        cluster_data.append([day, time_bin, *kmeans.cluster_centers_.ravel()])

cluster_df = pd.DataFrame(cluster_data)
# Column order mirrors the flattened centroids: A0_Latitude, A0_Longitude, ...
cluster_df.columns = ['Day', 'time_bin'] + [
    'A{}_{}'.format(cluster_id, coordinate)
    for cluster_id in range(6)
    for coordinate in ('Latitude', 'Longitude')
]

# The submission CSV is written by the final cell, after the centroids have
# been merged onto the submission rows. `new_df` does not exist yet at this
# point, so writing it here raised NameError on a fresh kernel.
display(cluster_df.head())

In [None]:
# Keep only the timestamp and the two join keys of the submission template.
ss_new = ss.loc[:, ['date', 'time_bin', 'day']]
display(ss_new)

In [None]:
# Attach the centroid columns to every submission row by matching weekday and
# time window, then drop the key columns so that only 'date' and the
# centroid coordinates remain.
new_df = (
    ss_new
    .merge(cluster_df, how='left', left_on=['day', 'time_bin'], right_on=['Day', 'time_bin'])
    .drop(columns=['time_bin', 'day', 'Day'])
)
display(new_df.head(5))

# Write the submission file without the index column.
new_df.to_csv('submission_final_1.csv', index=False)