 US Accident Exploratory Data Analysis 
 

In [None]:
import folium
import numpy as np
import opendatasets as od
import pandas as pd
import seaborn as sns
from folium import plugins
from folium.plugins import HeatMap
#To download the dataset via URL the following two lines of code will download into a local directory
# opendatasets package is needed for this -- pip install opendatasets --upgrade -- run this in your preferred env 
#You will need a user name and API key from Kaggle for this to work as well. Your API token can be generated from the MyAccount page

# download_url = 'https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents'
# od.download(download_url)

In [None]:
# Relative path to the downloaded Kaggle CSV, kept in one variable so it can be reused later.
data_filename = (
    './us-accidents/'
    'US_Accidents_Dec21_updated.csv'
)

## Data Preparation and Cleaning TODO 
1. Load the file using pandas
2. Look at and understand some information about the data & the columns
3. Fix any missing or incorrect values
4. Discuss that New York is not contained in the data set

In [None]:
# Read the whole accidents CSV into a single DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_filename)

In [None]:
# Get an idea of how many columns there are and what kind of data each one holds.
# df.info() prints a concise summary of the frame (column names, dtypes, memory usage).
df.info()

In [None]:
# Check the summary statistics of the data to help come up with questions about it.
# With default arguments, describe() reports on the numeric columns only.
df.describe()

# Ask & Answer Questions

1. Are there more accidents in warmer or colder areas?
2. Which states have the highest number of accidents - Per capita?
3. How frequently does precipitation appear in accidents?
4. Among the top 100 cities in number of accidents, which states do they belong to most frequently.
5. What time of the day are accidents most frequently occurring?
6. Which Days of the week have the most accidents
7. Which Months have the most accidents
8. What is the trend of accidents Year over Year 

In [None]:
# Compare the total number of columns with the number of numeric columns.
numerics = ['int16','int32','int64','float16','float32','float64']
numeric_df = df.select_dtypes(include=numerics)
# shape[1] is the column count; print the total first, then the numeric subset.
for frame in (df, numeric_df):
    print(frame.shape[1])


# Percentage of missing values per column 

In [None]:
# df.isna() marks each cell True when the value is missing and False when it
# is present; summing the booleans per column gives the missing-value count.
list_of_sum_na = df.isna().sum().sort_values(ascending=False)

# Express the counts as a real percentage (0-100) of all rows, so the variable
# names and the chart axis mean what they say.
missing_percentage_of_na = list_of_sum_na / len(df) * 100
# Keep only the columns that actually have missing values.
missing_percentage_of_na_zeropulled = missing_percentage_of_na[missing_percentage_of_na != 0]

print(list_of_sum_na)
ax = missing_percentage_of_na_zeropulled.plot(kind='barh')
ax.set_xlabel('Missing values (%)')
ax.set_ylabel('Column');

In [None]:
# A longer bar indicates a higher share of missing values, so Street is rarely missing while Number is frequently missing a value
# If a column is missing more than half of the time, it will not be very helpful for the analysis
# I am going to delete the columns with less relevance; to run this fresh you will need to uncomment the line below

# del df['Number']


# Exploratory Analysis and Visualization 

Columns to analyze :
1. City
2. Start Time
3. Start Lat, Start Long
4. Temperature 
5. Weather Condition 

In [None]:
# Display every column name to help pick the columns worth analyzing.
df.columns

### City

In [None]:
# Count the distinct values in the City column.
# Series.unique() keeps NaN as a value, so a missing city (if any) is counted once.
unique_cities = df['City'].unique()
print(len(unique_cities))


In [None]:
# Number of recorded accidents per city, sorted from most to fewest.
cities_by_accident = df.City.value_counts()

# New York does not show up in the top 20, which seems strange, so check for it.
# NOTE: `"New York" in df.City` and `"NY" in df.State` test the Series *index*
# (the row labels), not the values. With the default integer index they are
# False no matter what the data contains. The checks below look at the values:
# value_counts() puts the city names in its index, and .eq(...).any() scans
# the State values directly.
print("New York" in cities_by_accident.index)
print(df.State.eq("NY").any())

# Unique occurrences of accidents per city | Top 20
cities_by_accident.head(20)






In [None]:
# Horizontal bar chart of the 20 cities with the most recorded accidents.
cities_by_accident.head(20).plot.barh()

In [None]:
# Check the distribution of accidents per city with a histogram plus KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# documented replacement. bins=50 mirrors distplot's cap of 50 bins, which
# keeps the bin count sane for this very long-tailed data.
# seaborn is imported once, with the other imports at the top of the notebook.
sns.set_style("darkgrid")
sns.histplot(cities_by_accident, bins=50, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# The distribution plot shows that most cities have few accidents, while the
# cities above the norm exceed it by a wide margin, so split the two groups.

is_high = cities_by_accident >= 1000
high_accident_cities = cities_by_accident[is_high]
low_accident_cities = cities_by_accident[~is_high]

for group in (high_accident_cities, low_accident_cities):
    print(len(group))

# Share of all cities that fall in the high-accident group (a 0-1 fraction).

percentage_of_high_from_whole = len(high_accident_cities) / len(cities_by_accident)
print(percentage_of_high_from_whole)

In [None]:
# Distribution of accident counts among the high-accident cities.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot;
# bins=50 mirrors distplot's cap on the number of bins.
sns.histplot(high_accident_cities, bins=50, kde=True, stat="density", kde_kws=dict(cut=3))


In [None]:
# Same high-accident cities on a logarithmic x-axis, so the long tail is readable.
sns.histplot(data=high_accident_cities, log_scale=True)

In [None]:
# Distribution of accident counts among the low-accident cities.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot;
# bins=50 mirrors distplot's cap on the number of bins.
sns.histplot(low_accident_cities, bins=50, kde=True, stat="density", kde_kws=dict(cut=3))


In [None]:
# Same low-accident cities on a logarithmic x-axis.
sns.histplot(data=low_accident_cities, log_scale=True)

In [None]:
# The log-scale histogram looks a little skewed, so inspect the cities with
# exactly one recorded accident in case that points to a problem with the data.
values_of_one = cities_by_accident[cities_by_accident.eq(1)]
print(values_of_one)
# 1110 cities have only one accident logged, which might not be very relevant for high-level analysis

# Analysis of accident time


### Monthly Analysis

In [None]:

# Parse Start_Time into datetimes so the .dt accessors work; later cells rely on this.
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# Share of accidents per calendar month (1-12).
# histplot replaces the deprecated sns.distplot; discrete=True centres one bar
# on each month number, and stat="density" keeps the normalised y-axis.
sns.histplot(df.Start_Time.dt.month, discrete=True, stat="density")




In [None]:
# Show the actual number of accidents per month in a second graph.
# value_counts() orders its result by count, so sort_index() puts the bars
# back in calendar order (1-12) and the seasonal trend can be read left to right.
df.Start_Time.dt.month.value_counts().sort_index().plot(kind='bar')

1. Although there is some missing data for January, I think that this trend is fairly accurate
2. Traveling holidays (Thanksgiving & Christmas) tend to affect the month-wise trend of accidents
3. The trend starts moving up starting in September (Labor Day) and then continues moving up
4. I would like to see the sources so that I could compare distributions. An external source may be needed.
5. The bar graph shows a clear trend by number.



### Hourly and Weekly Analysis

In [None]:
# Checking to see which hours of the day have the most accidents.
# Start_Time comes out of the CSV as text, so make sure it is a datetime
# (re-running the conversion on an already converted column is harmless).
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# Because Start_Time is a timestamp, break it down to its hour component (0-23).
# histplot replaces the deprecated sns.distplot; discrete=True gives one bar per hour.
sns.histplot(df.Start_Time.dt.hour, discrete=True, stat="density")


-- We can see that a higher percentage of accidents happen between 3pm and 6pm | An assumption can be made that this is due to people in a hurry leaving work or picking up kids from school
-- The next highest percentage is between 6am and 9am

In [None]:
# Checking to see which day of the week has the most accidents.
# pandas numbers the days Monday=0 through Sunday=6.
# histplot replaces the deprecated sns.distplot; discrete=True gives one bar per day.
sns.histplot(df.Start_Time.dt.day_of_week, discrete=True, stat="density")

-- It is evenly distributed during the weekdays with a slight increase on Friday.
-- The weekend however has a steep decline in accidents.
-- The assumption is that fewer people are traveling during the weekend, as they don't have to go to work.

### Hourly Analysis per day


In [None]:
# Sunday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
sunday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 6]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(sunday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Monday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
monday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 0]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(monday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Tuesday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
tuesday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 1]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(tuesday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Wednesday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
wednesday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 2]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(wednesday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Thursday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
thursday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 3]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(thursday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Friday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
friday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 4]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(friday_accidents.dt.hour, discrete=True, stat="density")

In [None]:
# Saturday distribution: hourly share of accidents (pandas day_of_week: Monday=0 ... Sunday=6).
saturday_accidents = df.Start_Time[df.Start_Time.dt.day_of_week == 5]
# histplot replaces the deprecated sns.distplot; discrete=True keeps one bar per
# hour even if some hour has no accidents in this subset.
sns.histplot(saturday_accidents.dt.hour, discrete=True, stat="density")

# Analysis of geographic positioning

In [None]:
# Plot every accident by longitude (x) and latitude (y).
# Marker size is controlled by matplotlib's `s` keyword; seaborn's `size` is a
# data-mapping (semantic) parameter and does not shrink the points.
# linewidth=0 removes the marker outline, which would otherwise dominate such small dots.
sns.scatterplot(x=df.Start_Lng, y=df.Start_Lat, s=1, linewidth=0)
# Generally speaking there are very few parts of the US that DON'T experience traffic accidents
# The points are kept small so that the density is easier to see



In [None]:
# Pull the coordinates out of the frame as plain lists and pair them up as (lng, lat) tuples.
lng_list, lat_list = list(df.Start_Lng), list(df.Start_Lat)
paired_list = [(lng, lat) for lng, lat in zip(lng_list, lat_list)]


In [None]:
# Build an interactive heat map of the accident start locations.
# - The map is named `accident_map` so it no longer shadows Python's built-in `map`.
# - The points are materialised as a list of [lat, lng] pairs rather than a
#   one-shot zip iterator, and rows with a missing coordinate are dropped first.
heat_points = df[['Start_Lat', 'Start_Lng']].dropna().values.tolist()
accident_map = folium.Map()
HeatMap(heat_points).add_to(accident_map)
accident_map

In [None]:
#A heat map demonstrates this further

## Summary and Conclusion 

Insights:
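- No data from New York was found (note: this relied on `in` checks against a pandas Series, which test the index rather than the values, so it should be re-verified)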
- A low percentage of cities have more than 1000 recorded accidents in the dataset (around 4 percent)
- Over 1100 Cities have reported only 1 accident
- The number of accidents per city decreases exponentially 
