# Getting Started: Tweet2Map Dataset
Hello and welcome to my dataset! This notebook will visualize the data and provide some code to get you started with your own analysis. I will cover the following:

- Plot the timeseries
- Visualize distribution
- Plotting points on a map


**Important:** As of January 8, 2020, there is a gap in the data during which no records were collected. MMDA was still tweeting during this period, but my data miner was not running due to pandemic difficulties.

# Import Packages and Download Data from Kaggle

In [None]:
import datetime
import html
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium.plugins import HeatMap, MarkerCluster

%matplotlib inline

# Plot defaults for the whole notebook: ggplot theme, 12x6 default figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# Load and Prepare Data

In [None]:
# Read the incident CSV from the Kaggle input directory and preview the first rows.
DATA_CSV = r'../input/mmda-traffic-incident-data/data_mmda_traffic_spatial.csv'
df = pd.read_csv(DATA_CSV)
df.head()

In [None]:
# Combine the separate Date and Time strings into one datetime column;
# combinations that cannot be parsed become NaT instead of raising.
df['Timestamp'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], errors='coerce')

# Store the accuracy flag as integers.
df['High_Accuracy'] = df['High_Accuracy'].astype(int)

# Turn the literal string 'None' into NaN so pandas treats those cells as missing.
df = df.replace('None', np.nan)
df.head()

# Plot Timeseries
Plotting the timeseries data will involve creating a rolling average.

In [None]:
# Daily incident counts: bucket rows by calendar day, count non-null cells per
# column, and keep the 'Source' count (renamed to 'y') as the daily total.
df_date = (
    df.groupby(pd.Grouper(key='Timestamp', freq="D"))
    .count()
    .reset_index()
    .drop(columns=['Date', 'Time', 'Location', 'Latitude', 'Longitude', 'Direction',
                   'Lanes_Blocked', 'Involved', 'Tweet', 'Type'])
    .rename(columns={'Source': 'y'})
)

# 7-row rolling mean and standard deviation of the daily totals
# (one row per day, so the window is 7 days).
df_date['roll7_avg'] = df_date['y'].rolling(7).mean()
df_date['roll7_std'] = df_date['y'].rolling(7).std()

# Timestamp-indexed view used for plotting; df_date keeps Timestamp as a column
# because later cells read it from there.
df_date_rolling = df_date.set_index('Timestamp')

# Plot the raw daily series faintly behind the two rolling statistics.
sns.set_style('darkgrid')
df_date_rolling['y'].plot(alpha=0.15, color='black', label='Amount of Incidents')
df_date_rolling['roll7_avg'].plot(label='Rolling Average of Incidents (7 Days)')
df_date_rolling['roll7_std'].plot(label='Rolling Standard Deviation (7 Days)')
plt.xlabel('Date')
plt.ylabel('Amount of Incidents')

# Legend with a white frame so it stays readable on the dark grid.
L = plt.legend(title='Legend', frameon=True)
frame = L.get_frame()
frame.set_color('white')
plt.title('7 Day Rolling Average of Daily Incidents')
# To save the figure:
# plt.savefig('analysis/fig1_entiredata_rollingavg.jpg',bbox_inches = 'tight')

### Timeseries of Direction
The `Direction` column indicates whether the incident occurred on a northbound (NB), southbound (SB), eastbound (EB), or westbound (WB) road.

In [None]:
def subset_by_direction(df, direction):
    """Count incidents per hour of day for one travel direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a string 'Direction' column and a datetime 'Timestamp'
        column. Rows with a missing Direction are ignored.
    direction : str
        Pattern looked up inside 'Direction' via ``str.contains`` (so it is
        interpreted as a regular expression), e.g. 'NB' or 'SB'.

    Returns
    -------
    pandas.DataFrame
        Indexed by integer 'Hour' in ascending order, with one column
        ``Amount_<direction>`` holding the number of matching rows whose
        timestamp falls in that hour. Hours with no matches are absent.
    """
    with_direction = df.dropna(subset=['Direction'])
    matching = with_direction[with_direction['Direction'].str.contains(direction)]
    return (
        matching['Timestamp'].dt.hour
        .value_counts()
        .rename_axis('Hour')
        .reset_index(name=f'Amount_{direction}')
        .sort_values(by='Hour')
        .astype({'Hour': int})
        .reset_index(drop=True)
        .set_index('Hour')
    )

# Hour-of-day incident totals for each travel direction.
df_sb = subset_by_direction(df, 'SB')
df_nb = subset_by_direction(df, 'NB')
df_eb = subset_by_direction(df, 'EB')
df_wb = subset_by_direction(df, 'WB')

# Align the four tables on Hour. Reindexing to 0-23 guarantees every hour is
# present, and hours with no incidents for a direction become 0 instead of NaN
# (NaN would leave gaps in the plotted lines).
df_direction = (
    pd.concat([df_nb, df_sb, df_eb, df_wb], axis=1)
    .reindex(range(24))
    .fillna(0)
    .rename_axis('Hour')
)

# Average incidents per hour of day = total for that hour / number of days
# observed. Dividing by len(df_direction) would divide by the number of hour
# rows (about 24), which is not an average over time. Only days that actually
# have records are counted, so gaps in data collection do not dilute the mean.
n_days_observed = df['Timestamp'].dt.normalize().nunique()
df_direction = df_direction / max(n_days_observed, 1)

# Plot with human-readable legend labels bound to the columns by name, so a
# change in column order can never mislabel a line.
direction_labels = {
    'Amount_NB': 'Northbound',
    'Amount_SB': 'Southbound',
    'Amount_EB': 'Eastbound',
    'Amount_WB': 'Westbound',
}
ax = df_direction.rename(columns=direction_labels).plot()
ax.set_xticks(np.arange(0, 24, step=1))
ax.set_title('Average Amount of Incidents in the Day According to Direction')
L = ax.legend(title='Legend', frameon=True)
frame = L.get_frame()
frame.set_color('white')
ax.set_xlabel('Time in 24h Format')
ax.set_ylabel('Amount of Incidents Per Hour')
# To save the figure:
# plt.savefig('analysis/direction_avg.png',bbox_inches = 'tight')

### Timeseries of Accident Type
Generate timeseries plots for different types of data.

Mechanical breakdowns by participants

In [None]:
# Working copy for the vehicle-type flags. Only rows missing a field this
# analysis actually reads are dropped; a bare dropna() would also discard
# incidents that merely lack an unrelated field (e.g. Direction) and undercount.
# .copy() makes the result an independent frame so the flag columns added
# later do not trigger SettingWithCopyWarning.
df_type = df.dropna(subset=['Timestamp', 'Involved', 'Type']).copy()

def subset_mechanical_bus(row):
    """Return True when a bus is involved in a mechanical incident.

    `row` is a mapping-like record (e.g. a DataFrame row) whose 'Involved' and
    'Type' values are strings. Matching is by substring. Always returns a
    bool (False rather than None for non-matches) so the resulting flag
    column is boolean and can be summed directly.
    """
    return ('BUS' in row['Involved']) and ('MECHANICAL' in row['Type'])

def subset_mechanical_truck(row):
    """Return True when a truck is involved in a mechanical incident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None) so
    the flag column stays boolean.
    """
    return ('TRUCK' in row['Involved']) and ('MECHANICAL' in row['Type'])
    
def subset_mechanical_car(row):
    """Return True when a private car or SUV is involved in a mechanical incident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Both vehicle keywords are looked up in
    'Involved' (the vehicle list); 'Type' is only consulted for the incident
    kind. Always returns a bool (never None).
    """
    involved = row['Involved']
    is_private_vehicle = ('CAR' in involved) or ('SUV' in involved)
    return is_private_vehicle and ('MECHANICAL' in row['Type'])
    
def subset_mechanical_public(row):
    """Return True when a PUJ or UV is involved in a mechanical incident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Both vehicle keywords are looked up in
    'Involved'; 'Type' is only consulted for the incident kind. Always
    returns a bool (never None).
    """
    involved = row['Involved']
    # 'UV' is a substring of 'SUV'; remove SUV mentions before testing so that
    # SUVs are not counted as UVs.
    has_uv = 'UV' in involved.replace('SUV', '')
    is_public_vehicle = ('PUJ' in involved) or has_uv
    return is_public_vehicle and ('MECHANICAL' in row['Type'])

# Flag every incident by the vehicle class involved in a mechanical breakdown.
# The helpers are passed to apply() directly (no wrapping lambda), and
# astype(bool) maps any None result to False so each flag column is a true
# boolean column that sums numerically.
mechanical_flaggers = {
    'type_bus': subset_mechanical_bus,
    'type_truck': subset_mechanical_truck,
    'type_public': subset_mechanical_public,
    'type_car': subset_mechanical_car,
}
for flag_column, flagger in mechanical_flaggers.items():
    df_type[flag_column] = df_type.apply(flagger, axis=1).astype(bool)

# Weekly totals per vehicle class, smoothed with a 7-row rolling mean.
# NOTE(review): with weekly buckets, rolling(7) spans seven weeks - confirm
# this is what "Rolling Weekly Average" in the title is meant to describe.
df_type_participant = df_type[['Timestamp', 'type_bus', 'type_truck', 'type_public', 'type_car']]
weekly_totals = df_type_participant.groupby(pd.Grouper(key='Timestamp', freq="W")).sum()

# Legend labels are attached by column name, so they cannot drift out of sync
# with the plotted lines if the column order changes.
legend_labels = {
    'type_bus': 'Public Bus',
    'type_truck': 'Truck',
    'type_public': 'Other Public Transport',
    'type_car': 'Private Car',
}
ax = weekly_totals.rolling(7).mean().rename(columns=legend_labels).plot()

# Legend with a white frame
L = ax.legend(title='Legend', frameon=True)
frame = L.get_frame()
frame.set_color('white')

# Labels
ax.set_xlabel('Date')
ax.set_ylabel('Amount of Incidents')
ax.set_title('Mechanical Breakdowns by Vehicle Type (Rolling Weekly Average)')
# To save the figure:
# plt.savefig('analysis/type_breakdown.png',bbox_inches = 'tight')

In [None]:
# Participant timeseries
# Start again from the full dataset. Only rows missing a field this analysis
# actually reads are dropped; a bare dropna() would also discard incidents
# that merely lack an unrelated field (e.g. Direction) and undercount.
# .copy() makes the result an independent frame so the flag columns added
# later do not trigger SettingWithCopyWarning.
df_type = df.dropna(subset=['Timestamp', 'Involved', 'Type']).copy()

# Prepare lambda functions
def subset_bus(row):
    """Return True when a bus is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (False rather
    than None for non-matches) so the flag column stays boolean.
    """
    return ('BUS' in row['Involved']) and ('ACCIDENT' in row['Type'])

def subset_truck(row):
    """Return True when a truck is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None).
    """
    return ('TRUCK' in row['Involved']) and ('ACCIDENT' in row['Type'])
    
def subset_private(row):
    """Return True when a private car or SUV is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None).
    """
    involved = row['Involved']
    is_private_vehicle = ('CAR' in involved) or ('SUV' in involved)
    return is_private_vehicle and ('ACCIDENT' in row['Type'])
    
def subset_public(row):
    """Return True when a PUJ or UV is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None).
    """
    involved = row['Involved']
    # 'UV' is a substring of 'SUV'; remove SUV mentions before testing so that
    # SUVs are not counted as UVs.
    has_uv = 'UV' in involved.replace('SUV', '')
    is_public_vehicle = ('PUJ' in involved) or has_uv
    return is_public_vehicle and ('ACCIDENT' in row['Type'])
    
def subset_pedestrian(row):
    """Return True when a pedestrian is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None).
    """
    return ('PEDESTRIAN' in row['Involved']) and ('ACCIDENT' in row['Type'])
    
def subset_motorcycle(row):
    """Return True when a motorcycle is involved in an accident.

    `row` is a mapping-like record whose 'Involved' and 'Type' values are
    strings; matching is by substring. Always returns a bool (never None).
    """
    return ('MOTORCYCLE' in row['Involved']) and ('ACCIDENT' in row['Type'])

# Flag every incident by the kind of participant involved in an accident.
# The helpers are passed to apply() directly (no wrapping lambda), and
# astype(bool) maps any None result to False so each flag column is a true
# boolean column that sums numerically.
accident_flaggers = {
    'type_bus': subset_bus,
    'type_truck': subset_truck,
    'type_public': subset_public,
    'type_private': subset_private,
    'type_pedestrian': subset_pedestrian,
    'type_motorcycle': subset_motorcycle,
}
for flag_column, flagger in accident_flaggers.items():
    df_type[flag_column] = df_type.apply(flagger, axis=1).astype(bool)

# Weekly totals per participant kind, smoothed with a 7-row rolling mean.
# NOTE(review): with weekly buckets, rolling(7) spans seven weeks - confirm
# this is what "Rolling Weekly Average" in the title is meant to describe.
cols = ['Timestamp','type_bus','type_truck','type_public','type_private', 'type_pedestrian', 'type_motorcycle']
df_type_participant = df_type[cols]
weekly_totals = df_type_participant.groupby(pd.Grouper(key='Timestamp', freq="W")).sum()

# Legend labels are attached by column name, so they cannot drift out of sync
# with the plotted lines if the column order changes.
legend_labels = {
    'type_bus': 'Public Bus',
    'type_truck': 'Truck',
    'type_public': 'Other Public Transport',
    'type_private': 'Private Car',
    'type_pedestrian': 'Pedestrian',
    'type_motorcycle': 'Motorcycle',
}
ax = weekly_totals.rolling(7).mean().rename(columns=legend_labels).plot()

# Legend with a white frame
L = ax.legend(title='Legend', frameon=True)
frame = L.get_frame()
frame.set_color('white')

# Labels
ax.set_xlabel('Date')
ax.set_ylabel('Amount of Incidents')
ax.set_title('Number of Accidents By Participants (Rolling Weekly Average)')
# To save the figure:
# plt.savefig('analysis/type_breakdown.png',bbox_inches = 'tight')

# Visualize Distributions

In [None]:
# Tag every daily row with its day of the month.
df_date['day'] = df_date['Timestamp'].dt.day

# Per-day-of-month mean of every column, flattened back to a regular table.
df_date_avg = df_date.groupby('day').mean().reset_index()

# Bar plot of the daily totals grouped by day of month; seaborn aggregates the
# rows sharing a 'day' value into one bar.
sns.set_style('darkgrid')
ax = sns.barplot(data=df_date, x='day', y='y', palette='Blues_d', saturation=.75)
ax.set_title('Average Amounts of MMDA ALERT Incidents During the Month')
ax.set_xlabel('Day of Month')
ax.set_ylabel('Amount')
# To save the figure:
#plt.savefig('analysis/monthday_avg.jpg',bbox_inches = 'tight')

In [None]:
# Hourly incident counts: bucket rows by clock hour, count non-null cells per
# column, and keep the 'Source' count (renamed to 'y') as the hourly total.
df_hour = (
    df.groupby(pd.Grouper(key='Timestamp', freq='H'))
    .count()
    .reset_index()
    .drop(columns=['Date', 'Time', 'Location', 'Latitude', 'Longitude', 'Direction', 'Type',
                   'Lanes_Blocked', 'Involved', 'Tweet'])
    .rename(columns={'Source': 'y'})
)

# Total number of incidents for each hour of day across the whole dataset,
# as a table with integer 'Hour' (ascending) and 'Amount' columns.
hour_of_day = df['Timestamp'].dt.hour
df_cumulative_hour = (
    hour_of_day.value_counts()
    .rename_axis('Hour')
    .reset_index(name='Amount')
    .sort_values(by='Hour')
    .astype({'Hour': int})
    .reset_index(drop=True)
)

# One bar per hour of day.
sns.set_style("darkgrid")
ax = sns.barplot(data=df_cumulative_hour, x="Hour", y="Amount",
                 palette='Blues_d', saturation=.75, ci=95)
ax.set_title('Time Distribution of MMDA ALERT Incidents')
ax.set_xlabel('Time in 24h Format')
# To save the figure:
#plt.savefig('analysis/hourcount.jpg',bbox_inches = 'tight')

In [None]:
# Day-of-month bar plot of the daily incident totals, drawn with an explicit
# 95% confidence interval on each bar.
df_date['day'] = df_date['Timestamp'].dt.day

sns.set_style('darkgrid')
ax = sns.barplot(data=df_date, x='day', y='y',
                 palette='Blues_d', saturation=.75, ci=95)
ax.set_title('Average Amounts of MMDA ALERT Incidents During the Month')
ax.set_xlabel('Day of Month')
ax.set_ylabel('Amount')
# To save the figure:
#plt.savefig('analysis/monthday_avg.jpg',bbox_inches = 'tight')

# Plot Accidents on a Map

In [None]:
# Base map centred on the coordinates below.
metro_coords = (14.599574, 121.059929)
m = folium.Map(
    location=metro_coords,
    zoom_start=11,
    tiles='OpenStreetMap'
)

# A single cluster layer holds every incident marker.
mc = MarkerCluster(name='Incidents')

# Popup markup in Twitter's embedded-tweet format: tweet text, then a link to
# the source labelled with the date.
embed_template = """
<div>
    <blockquote class="twitter-tweet tw-align-center"><p lang="en" dir="ltr"> {} </p>&mdash; Official MMDA (@MMDA)<a href="{}"> {}</a></blockquote>
    <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
</div>
"""

# A marker needs real coordinates; folium raises on NaN locations, so rows
# without a latitude/longitude are left off the map.
df_located = df.dropna(subset=['Latitude', 'Longitude'])

# Populate the cluster with one marker per located incident.
for _, incident in df_located.iterrows():
    # Tweet text is external input: escape it (and the other fields) so that
    # characters such as '<', '&' or quotes cannot break or inject markup.
    embed = embed_template.format(
        html.escape(str(incident['Tweet'])),
        html.escape(str(incident['Source']), quote=True),
        html.escape(str(incident['Date'])),
    )

    # Wrap the markup in an iframe so it renders inside the marker popup.
    iframe = folium.IFrame(
        embed,
        width=500,
        height=280
    )
    folium.Marker(
        location=[incident['Latitude'], incident['Longitude']],
        popup=folium.Popup(iframe),
    ).add_to(mc)

# Attach the finished cluster to the map once, after all markers are added,
# then add the layer toggle and display the map.
mc.add_to(m)
folium.LayerControl(position='topright').add_to(m)
m