In [69]:
import pandas as pd
import folium
import json
from folium import plugins

In [70]:
with open('laMap.geojson') as f:
    laArea = json.load(f)

In [71]:
df = pd.read_csv("US_Accidents_Dec20_updated_LA.csv")

In [72]:
df.head()

Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-2721958,2,22/03/2016 19:14,23/03/2016 01:14,33.91133,-118.16819,33.91585,-118.1884,1.2,At I-710/Garfield Ave - Accident.,...,False,False,False,False,False,False,Night,Day,Day,Day
1,A-2721959,2,22/03/2016 19:43,23/03/2016 01:43,33.89238,-118.04408,33.89494,-118.04882,0.324,At Carmenita Rd/Exit 119 - Accident.,...,False,False,False,False,False,False,Night,Night,Day,Day
2,A-2721960,2,22/03/2016 19:36,23/03/2016 01:36,34.09256,-118.20622,34.09256,-118.20622,0.0,At Avenue 43 - Accident.,...,False,False,True,False,False,False,Night,Night,Day,Day
3,A-2721961,2,22/03/2016 20:12,23/03/2016 02:12,33.91619,-118.23915,33.91619,-118.23903,0.007,At Wilmington Ave - Accident.,...,False,False,False,False,True,False,Night,Night,Night,Day
4,A-2721962,2,22/03/2016 20:12,23/03/2016 02:12,33.91619,-118.23903,33.91619,-118.23915,0.007,At Wilmington Ave - Accident.,...,False,False,False,False,True,False,Night,Night,Night,Day


In [73]:
df.isna().any()

ID                       False
Severity                 False
Start_Time               False
End_Time                 False
Start_Lat                False
Start_Lng                False
End_Lat                  False
End_Lng                  False
Distance(mi)             False
Description              False
Number                    True
Street                   False
Side                     False
City                      True
County                   False
State                    False
Zipcode                  False
Country                  False
Timezone                 False
Airport_Code             False
Weather_Timestamp         True
Temperature(F)            True
Wind_Chill(F)             True
Humidity(%)               True
Pressure(in)              True
Visibility(mi)            True
Wind_Direction            True
Wind_Speed(mph)           True
Precipitation(in)         True
Weather_Condition         True
Amenity                  False
Bump                     False
Crossing

1. Clean the date data

In [74]:
df['clean_Start_Time'] = pd.to_datetime(df['Start_Time'], format = '%d/%m/%Y %H:%M', errors = 'coerce')
# Clean the start time

In [75]:
df['Year'] = df['clean_Start_Time'].dt.year

In [80]:
df['Year']

0        2016.0
1        2016.0
2        2016.0
3        2016.0
4        2016.0
          ...  
90715    2019.0
90716    2019.0
90717    2019.0
90718    2019.0
90719    2019.0
Name: Year, Length: 90720, dtype: float64

In [81]:
df.clean_Start_Time

0       2016-03-22 19:14:00
1       2016-03-22 19:43:00
2       2016-03-22 19:36:00
3       2016-03-22 20:12:00
4       2016-03-22 20:12:00
                ...        
90715   2019-12-10 16:58:00
90716   2019-12-10 16:53:00
90717   2019-12-10 16:57:00
90718   2019-12-10 16:56:00
90719   2019-12-10 17:02:00
Name: clean_Start_Time, Length: 90720, dtype: datetime64[ns]

In [82]:
df['clean_End_Time'] = pd.to_datetime(df['End_Time'], format = '%d/%m/%Y %H:%M', errors = 'coerce')

In [83]:
df.End_Time

0        23/03/2016 01:14
1        23/03/2016 01:43
2        23/03/2016 01:36
3        23/03/2016 02:12
4        23/03/2016 02:12
               ...       
90715    10/12/2019 18:10
90716    10/12/2019 17:48
90717    10/12/2019 17:50
90718    10/12/2019 17:27
90719    10/12/2019 17:55
Name: End_Time, Length: 90720, dtype: object

# 基本分析

In [84]:
len(df)

# How many cases are there in LA county from 2016-2020?

90720

In [85]:
df.Severity.value_counts()

# The number of traffic accidents in 4 different severity

2    85629
3     3805
4     1285
1        1
Name: Severity, dtype: int64

In [86]:
round(df["Distance(mi)"].mean(),2)

0.33

In [87]:
round(df["Distance(mi)"].max(),2)

137.62

In [88]:
round(df["Distance(mi)"].min(),2)

0.0

# 时间

The Average traffic delay due to road accidents

In [89]:
df['duration_time_hour'] = (df['clean_End_Time'] - df['clean_Start_Time']).dt.seconds/60/60

In [90]:
df['duration_time_hour'] = round(df['duration_time_hour'])

In [91]:
df['duration_time_hour']

0        6.0
1        6.0
2        6.0
3        6.0
4        6.0
        ... 
90715    1.0
90716    1.0
90717    1.0
90718    1.0
90719    1.0
Name: duration_time_hour, Length: 90720, dtype: float64

In [92]:
df.duration_time_hour.max()

24.0

In [93]:
df.duration_time_hour.min()

0.0

In [94]:
round(df.duration_time_hour.mean(), 2)

3.13

Road accidents percentage over past 5 years

In [95]:
Year_df = pd.DataFrame(df.clean_Start_Time.dt.year.value_counts()).reset_index().rename(columns={'index':'Year', 'clean_Start_Time':'Cases'}).sort_values(by='Cases', ascending=False)

In [96]:
Year_df['accident/day'] = round(Year_df['Cases']/365)

In [97]:
Year_df

Unnamed: 0,Year,Cases,accident/day
0,2020.0,63802,175.0
1,2016.0,11802,32.0
2,2017.0,8224,23.0
3,2019.0,4507,12.0


In [98]:
df.groupby(['Year', 'Severity']).size().unstack()

Severity,1,2,3,4
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016.0,,9923.0,1515.0,364.0
2017.0,,6930.0,1014.0,280.0
2019.0,,4447.0,26.0,34.0
2020.0,1.0,61947.0,1250.0,604.0


# 地点

Top ten cities with the highest cases in LA county

In [99]:
city_df = pd.DataFrame(df['City'].value_counts()).reset_index().rename(columns={'index':'City', 'City':'Cases'})

# The number of traffic accidents in different cities

In [100]:
city_df.head(10)

Unnamed: 0,City,Cases
0,Los Angeles,26699
1,Long Beach,3660
2,Whittier,2603
3,Baldwin Park,1930
4,Pomona,1787
5,Gardena,1644
6,Sherman Oaks,1575
7,Pasadena,1498
8,Downey,1382
9,Diamond Bar,1372


Top ten street with the highest cases in LA county

In [101]:
df.Street.value_counts().head(10).reset_index()

# The number of traffic accidents in different streets

Unnamed: 0,index,Street
0,I-10 W,3701
1,I-10 E,3700
2,I-405 N,3368
3,Golden State Fwy S,2574
4,I-405 S,2164
5,Foothill Fwy E,2036
6,Foothill Fwy W,2027
7,I-5 N,2022
8,I-605 S,1883
9,Golden State Fwy N,1870


根据交通事故引起堵塞的起始点描绘的点图

In [6]:
laMap = folium.Map(location=[34.0522,-118.2437], tiles='Stamen Toner', zoom_start=9)

In [53]:
folium.GeoJson(laArea).add_to(laMap)

<folium.features.GeoJson at 0x1f86f0a2fa0>

In [66]:
for i,row in df.iterrows():
    folium.CircleMarker((row.Start_Lat,row.Start_Lng), radius=1, weight=1, color='red', fill_color='red', fill_opacity=.5).add_to(laMap)

In [68]:
laMap.save('laPointMap_accident.html')

# 路况

In [109]:
df.Side.value_counts()

R    81714
L     9006
Name: Side, dtype: int64

In [107]:
df.Side.value_counts(normalize=True).round(4) * 100

R    90.07
L     9.93
Name: Side, dtype: float64

In [103]:
df["Weather_Condition"].dropna(how='any')

# cleaning the null data in "Weather_Condition"

0         Clear
1         Clear
2         Clear
3         Clear
4         Clear
          ...  
90715    Cloudy
90716    Cloudy
90717    Cloudy
90718    Cloudy
90719    Cloudy
Name: Weather_Condition, Length: 87688, dtype: object

In [36]:
df.Weather_Condition.value_counts()

Fair                       43178
Clear                      12136
Cloudy                      7669
Mostly Cloudy               5575
Partly Cloudy               5118
Light Rain                  3360
Haze                        3085
Overcast                    2297
Rain                        1548
Scattered Clouds             903
Heavy Rain                   730
Fog                          718
Smoke                        677
Fair / Windy                 308
Mist                          53
Partly Cloudy / Windy         51
Cloudy / Windy                31
Light Rain / Windy            31
Snow                          24
T-Storm                       20
Drizzle                       17
Thunder in the Vicinity       17
Heavy T-Storm                 16
Light Rain Showers            14
Rain Showers                  13
Rain / Windy                  11
Mostly Cloudy / Windy         10
Widespread Dust               10
Heavy Rain / Windy             9
Blowing Dust                   9
Light Rain

In [37]:
pd.value_counts(df.Weather_Condition, normalize=True).round(5) * 100

Fair                       49.240
Clear                      13.840
Cloudy                      8.746
Mostly Cloudy               6.358
Partly Cloudy               5.837
Light Rain                  3.832
Haze                        3.518
Overcast                    2.620
Rain                        1.765
Scattered Clouds            1.030
Heavy Rain                  0.832
Fog                         0.819
Smoke                       0.772
Fair / Windy                0.351
Mist                        0.060
Partly Cloudy / Windy       0.058
Cloudy / Windy              0.035
Light Rain / Windy          0.035
Snow                        0.027
T-Storm                     0.023
Drizzle                     0.019
Thunder in the Vicinity     0.019
Heavy T-Storm               0.018
Light Rain Showers          0.016
Rain Showers                0.015
Rain / Windy                0.013
Mostly Cloudy / Windy       0.011
Widespread Dust             0.011
Heavy Rain / Windy          0.010
Blowing Dust  

在不同湿度环境下发生车祸的概率（根据在不同湿度环境下发生车祸的次数）

In [38]:
df["Humidity(%)"].dropna(how='any')

# cleaning the null data in "Humidity(%)"

0        58.0
1        50.0
2        24.0
3        56.0
4        56.0
         ... 
90715    72.0
90716    70.0
90717    67.0
90718    67.0
90719    70.0
Name: Humidity(%), Length: 87473, dtype: float64

In [39]:
df["Humidity(%)"].value_counts()

83.0    2179
78.0    2041
75.0    2035
72.0    1985
80.0    1983
        ... 
98.0       4
3.0        3
99.0       2
91.0       2
95.0       1
Name: Humidity(%), Length: 98, dtype: int64

In [40]:
bins = [0,10,20,30,40,50,60,70,80,90,100]

In [41]:
df_Humidity = pd.cut(df["Humidity(%)"], bins)

In [42]:
pd.value_counts(df_Humidity)

(70, 80]     13590
(60, 70]     13546
(50, 60]     12473
(80, 90]     12044
(40, 50]      9618
(30, 40]      8198
(20, 30]      6579
(10, 20]      5964
(90, 100]     3586
(0, 10]       1875
Name: Humidity(%), dtype: int64

In [43]:
df_Humidity_scope = pd.value_counts(df_Humidity, normalize=True).round(2) * 100

In [44]:
df_Humidity_scope

(70, 80]     16.0
(60, 70]     15.0
(50, 60]     14.0
(80, 90]     14.0
(40, 50]     11.0
(30, 40]      9.0
(20, 30]      8.0
(10, 20]      7.0
(90, 100]     4.0
(0, 10]       2.0
Name: Humidity(%), dtype: float64

In [45]:
df_Humidity_scope.reset_index().rename(columns={'index':'Humidity_scope', 'Humidity(%)':'Cases%'}).sort_values(by=['Humidity_scope'])

Unnamed: 0,Humidity_scope,Cases%
9,"(0, 10]",2.0
7,"(10, 20]",7.0
6,"(20, 30]",8.0
5,"(30, 40]",9.0
4,"(40, 50]",11.0
2,"(50, 60]",14.0
1,"(60, 70]",15.0
0,"(70, 80]",16.0
3,"(80, 90]",14.0
8,"(90, 100]",4.0


在不同能见度下发生车祸的几率

In [46]:
df["Visibility(mi)"].dropna(how='any')

# cleaning the null data in "Humidity(%)"

0        10.0
1        10.0
2        10.0
3        10.0
4        10.0
         ... 
90715    10.0
90716    10.0
90717    10.0
90718    10.0
90719    10.0
Name: Visibility(mi), Length: 87583, dtype: float64

In [47]:
df["Visibility(mi)"].agg(['min', 'max', 'median', 'mean']).round(2).reset_index()

Unnamed: 0,index,Visibility(mi)
0,min,0.0
1,max,25.0
2,median,10.0
3,mean,9.0


In [48]:
bins = [0,1,2,3,4,5,6,7,8,9,10,25]

In [49]:
df_Visibility = pd.cut(df["Visibility(mi)"], bins)

In [50]:
pd.value_counts(df_Visibility)

(9, 10]     67014
(8, 9]       3366
(6, 7]       3285
(7, 8]       2949
(5, 6]       2336
(4, 5]       2211
(3, 4]       1809
(1, 2]       1713
(2, 3]       1610
(0, 1]       1248
(10, 25]       13
Name: Visibility(mi), dtype: int64

In [51]:
df_Visibility_scope = pd.value_counts(df_Visibility, normalize=True).round(4) * 100

In [52]:
df_Visibility_scope.reset_index().rename(columns={'index':'Visibility_scope', 'Visibility(mi)':'Cases%'}).sort_values(by=['Visibility_scope'])

Unnamed: 0,Visibility_scope,Cases%
9,"(0, 1]",1.43
7,"(1, 2]",1.96
8,"(2, 3]",1.84
6,"(3, 4]",2.07
5,"(4, 5]",2.53
4,"(5, 6]",2.67
2,"(6, 7]",3.75
3,"(7, 8]",3.37
1,"(8, 9]",3.84
0,"(9, 10]",76.54
