## Data Exploration on Accident Data

In [1]:
import pandas as pd

In [2]:
# The export is semicolon-delimited and ISO-8859-1 (Latin-1) encoded.
df = pd.read_csv(
    'accident_data.csv',
    sep=';',
    encoding='ISO-8859-1',
)
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,OBJECTID,LAND,BEZ,LOR,STRASSE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,...,IstPKW,IstFuss,IstKrad,IstGkfz,IstSonstige,USTRZUSTAND,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84
0,49090,11,12,12301203,Wittenau Süd,2019,1,13,6,3,...,1,0,0,0,0,1,7940622837,5835083823,1334146,5258609
1,49091,11,3,3040818,Pankow Süd,2019,1,9,5,3,...,1,0,0,0,0,0,7991304007,5832327415,1341356,5255862
2,49093,11,12,12103115,Breitkopfbecken,2019,3,21,6,3,...,0,0,0,0,0,0,795437613,5833549454,1336034,5257159
3,49096,11,6,6040703,Nikolassee,2019,1,7,6,2,...,1,1,0,0,0,1,7867143754,5817042137,1321777,5242825
4,49097,11,7,7030303,Grazer Platz,2019,2,15,3,3,...,1,0,0,0,0,0,7960743342,5822724905,1336007,5247421


In [3]:
# Check column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13390 entries, 0 to 13389
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OBJECTID     13390 non-null  int64 
 1   LAND         13390 non-null  int64 
 2   BEZ          13390 non-null  int64 
 3   LOR          13390 non-null  int64 
 4   STRASSE      13390 non-null  object
 5   UJAHR        13390 non-null  int64 
 6   UMONAT       13390 non-null  int64 
 7   USTUNDE      13390 non-null  int64 
 8   UWOCHENTAG   13390 non-null  int64 
 9   UKATEGORIE   13390 non-null  int64 
 10  UART         13390 non-null  int64 
 11  UTYP1        13390 non-null  int64 
 12  ULICHTVERH   13390 non-null  int64 
 13  IstRad       13390 non-null  int64 
 14  IstPKW       13390 non-null  int64 
 15  IstFuss      13390 non-null  int64 
 16  IstKrad      13390 non-null  int64 
 17  IstGkfz      13390 non-null  int64 
 18  IstSonstige  13390 non-null  object
 19  USTRZUSTAND  13390 non-nu

In [4]:
# Keep only UMONAT, IstRad, IstPKW, IstFuss, IstKrad, IstGkfz and USTRZUSTAND;
# every other column of the loaded frame is discarded.
df = df.drop(
    columns=[
        'OBJECTID', 'LAND', 'BEZ', 'LOR', 'STRASSE',
        'UJAHR', 'USTUNDE', 'UWOCHENTAG',
        'UKATEGORIE', 'UART', 'UTYP1', 'ULICHTVERH',
        'IstSonstige',
        'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84',
    ]
)

In [5]:
# Give the remaining columns English names (reassigning instead of mutating in place).
df = df.rename(
    columns={
        'UMONAT': 'month',
        'IstRad': 'bicycle',
        'IstPKW': 'car',
        'IstFuss': 'pedestrian',
        'IstKrad': 'motorbike',
        'IstGkfz': 'truck',
        'USTRZUSTAND': 'road condition',
    }
)

In [6]:
# Preview the reduced frame to confirm the renamed columns.
df.head(5)

Unnamed: 0,month,bicycle,car,pedestrian,motorbike,truck,road condition
0,1,1,1,0,0,0,1
1,1,1,1,0,0,0,0
2,3,1,0,0,0,0,0
3,1,0,1,1,0,0,1
4,2,0,1,0,0,0,0


In [7]:
# Re-check dtypes after dropping and renaming columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13390 entries, 0 to 13389
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   month           13390 non-null  int64 
 1   bicycle         13390 non-null  int64 
 2   car             13390 non-null  int64 
 3   pedestrian      13390 non-null  int64 
 4   motorbike       13390 non-null  int64 
 5   truck           13390 non-null  int64 
 6   road condition  13390 non-null  object
dtypes: int64(6), object(1)
memory usage: 732.4+ KB


### <span style='color: blue'>Converting the month (int) and road condition (object) columns to str</span>

In [8]:
# Cast both columns to str so the string codes used by the later replace()
# calls ('1', '2', ...) match every value.
df = df.astype({'month': str, 'road condition': str})

In [9]:
# Verify that 'month' and 'road condition' are now both object dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13390 entries, 0 to 13389
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   month           13390 non-null  object
 1   bicycle         13390 non-null  int64 
 2   car             13390 non-null  int64 
 3   pedestrian      13390 non-null  int64 
 4   motorbike       13390 non-null  int64 
 5   truck           13390 non-null  int64 
 6   road condition  13390 non-null  object
dtypes: int64(5), object(2)
memory usage: 732.4+ KB


#### Changing the month and road condition columns from numeric codes to descriptive labels

In [10]:
# Calendar-ordered abbreviations; the position in the list (1-based) is the month number.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Swap the string codes '1'..'12' for their abbreviations, then store the column
# as an ordered categorical so sorting and grouping follow calendar order
# rather than alphabetical order.
df['month'] = pd.Categorical(
    df['month'].replace(
        {str(number): label for number, label in enumerate(months, start=1)}
    ),
    categories=months,
    ordered=True,
)

In [11]:
# Translate the road-condition codes into labels; values that are not one of
# these three codes pass through unchanged.
df['road condition'] = df['road condition'].replace(
    {'0': 'dry', '1': 'wet', '2': 'icy'}
)

In [12]:
# Preview the frame with the month and road-condition labels applied.
df.head(5)

Unnamed: 0,month,bicycle,car,pedestrian,motorbike,truck,road condition
0,Jan,1,1,0,0,0,wet
1,Jan,1,1,0,0,0,dry
2,Mar,1,0,0,0,0,dry
3,Jan,0,1,1,0,0,wet
4,Feb,0,1,0,0,0,dry


In [13]:
# List the column names that the value-count loop below iterates over.
df.columns

Index(['month', 'bicycle', 'car', 'pedestrian', 'motorbike', 'truck',
       'road condition'],
      dtype='object')

In [14]:
# Print the frequency table of every column to spot unexpected values.
for column_name, column_values in df.items():
    print(f'{column_name}: {column_values.value_counts()}\n')

month: month
Jun    1448
Aug    1371
Sep    1270
Oct    1214
May    1207
Jul    1175
Apr    1104
Nov    1081
Dec     936
Mar     907
Jan     891
Feb     786
Name: count, dtype: int64

bicycle: bicycle
0    8385
1    5005
Name: count, dtype: int64

car: car
1    10908
0     2482
Name: count, dtype: int64

pedestrian: pedestrian
0    11422
1     1968
Name: count, dtype: int64

motorbike: motorbike
0    11416
1     1974
Name: count, dtype: int64

truck: truck
0    12925
1      465
Name: count, dtype: int64

road condition: road condition
dry                        10154
wet                         3190
icy                           45
Hellersdorfer Promenade        1
Name: count, dtype: int64



### <span style='color: red'> road condition contains one unexpected value ('Hellersdorfer Promenade'); removing that row: </span>

In [15]:
# The value counts show a street name ('Hellersdorfer Promenade') in this column.
# Keep only rows carrying one of the known labels instead of matching that single
# literal, so any other malformed value is removed as well.
valid_road_conditions = ['dry', 'wet', 'icy']
# .copy() makes the result an independent frame, so later column assignments
# do not trigger chained-assignment warnings.
df = df[df['road condition'].isin(valid_road_conditions)].copy()

In [16]:
# Confirm that only the expected road-condition labels remain.
df['road condition'].unique()

array(['wet', 'dry', 'icy'], dtype=object)