In [1]:
import pandas as pd
import os

In [2]:
# steps to reduce original csv size from 1GB to less than 100MB

In [3]:
# Load the full accidents CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='../assets/US_Accidents_Dec21_updated.csv')
df.shape

(2845342, 47)

In [4]:
# Narrow the frame to nine columns; .copy() gives small_df its own data so
# later edits to it do not touch df.
small_df = df.loc[:, [
    'Severity',
    'Start_Time',
    'City',
    'State',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
    'Weather_Condition',
    'Sunrise_Sunset',
]].copy()
small_df.head()

Unnamed: 0,Severity,Start_Time,City,State,Visibility(mi),Wind_Speed(mph),Precipitation(in),Weather_Condition,Sunrise_Sunset
0,3,2016-02-08 00:37:08,Dublin,OH,10.0,10.4,0.0,Light Rain,Night
1,2,2016-02-08 05:56:20,Dayton,OH,10.0,,0.02,Light Rain,Night
2,2,2016-02-08 06:15:39,Cincinnati,OH,10.0,,0.02,Overcast,Night
3,2,2016-02-08 06:51:45,Akron,OH,10.0,,,Overcast,Night
4,3,2016-02-08 07:53:43,Cincinnati,OH,10.0,10.4,0.01,Light Rain,Day


In [5]:
# List the distinct Sunrise_Sunset values (missing entries included).
small_df['Sunrise_Sunset'].unique()

array(['Night', 'Day', nan], dtype=object)

In [6]:
# transform value to D (Day) and N (Night) to reduce size
def transformNightDay(value):
    """Abbreviate a Sunrise_Sunset label.

    Returns 'N' when value equals 'Night' and 'D' when it equals 'Day'.
    Any other value (for example a missing entry) is returned unchanged.
    """
    # Checked in the same order as the labels are listed: 'Night' first.
    for label, code in (('Night', 'N'), ('Day', 'D')):
        if value == label:
            return code
    return value

# Rewrite every Sunrise_Sunset entry through transformNightDay, then show the
# distinct values that remain.
small_df['Sunrise_Sunset'] = small_df['Sunrise_Sunset'].map(transformNightDay)
small_df['Sunrise_Sunset'].unique()

array(['N', 'D', nan], dtype=object)

In [7]:
# adding year column
def getYear(startTime):
    """Return the first four characters of startTime converted to an int.

    For a timestamp string that begins with a four-digit year
    (e.g. '2016-02-08 00:37:08') this is the year.
    """
    year_text = startTime[:4]
    return int(year_text)

# Derive the Year column by running getYear over each Start_Time entry.
small_df['Year'] = small_df['Start_Time'].map(getYear)

# Show which years are present.
small_df['Year'].unique()

array([2016, 2017, 2021, 2020, 2018, 2019])

In [8]:
# Remove the Start_Time column from small_df.
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes a second execution of this cell a no-op rather than a KeyError, so the
# cell can be re-run safely.
small_df = small_df.drop(columns=['Start_Time'], errors='ignore')
print(list(small_df.columns))

['Severity', 'City', 'State', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Sunrise_Sunset', 'Year']


In [9]:
# Shrink the data (and the CSV written from it) by keeping only rows whose
# Year is greater than 2019; the shape is printed before and after the filter.
print(small_df.shape)
small_df = small_df.loc[small_df['Year'].gt(2019)]
print(small_df.shape)
small_df['Year'].unique()

(2845342, 9)
(2137609, 9)


array([2021, 2020])

In [10]:
# Write the reduced frame to disk without the index column, then report the
# resulting file size in bytes.
small_df.to_csv(path_or_buf='../assets/US_Accidents_Dec21_small.csv', index=False)
os.stat('../assets/US_Accidents_Dec21_small.csv').st_size

90811054