In [17]:
# Load the AWS credentials from the local `credentials.cfg` file and export
# them as environment variables.
# The imports live in this cell because it sits above the notebook's import
# cell: on a fresh top-to-bottom run neither `configparser` nor `os` would
# otherwise be defined yet.
import configparser
import os

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail loudly here instead of with a confusing
# KeyError on the lookups below.
if not config.read('credentials.cfg'):
    raise FileNotFoundError("credentials.cfg was not found or could not be read")
os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [1]:
import os
import boto3
import pandas as pd
import sys
from io import StringIO

## Read Datasets

In [2]:
def get_data_from_s3(foldername, filename, bucket_name='spark-project-kolusu', prefix='input'):
    """Download one CSV object from S3 and load it into a pandas DataFrame.

    The object key is built as ``<prefix>/<foldername>/<filename>``.

    Parameters
    ----------
    foldername : str
        Folder below ``prefix`` that holds the file, e.g. ``"Crashes"``.
    filename : str
        Name of the CSV object, e.g. ``"Crashes.csv"``.
    bucket_name : str, optional
        Bucket to read from. Defaults to the bucket this notebook was
        written against.
    prefix : str, optional
        Leading component of the object key. Defaults to ``"input"``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, decoded as UTF-8.
    """
    # Local import: the notebook's import cell only brings in StringIO.
    from io import BytesIO

    client = boto3.client('s3')
    object_key = "/".join([prefix, foldername, filename])
    response = client.get_object(Bucket=bucket_name, Key=object_key)
    body = response['Body']
    try:
        raw_bytes = body.read()
    finally:
        # Release the underlying HTTP connection even if the read fails.
        body.close()
    # Let pandas decode the bytes itself instead of materialising a second,
    # fully decoded str copy of the payload before parsing.
    return pd.read_csv(BytesIO(raw_bytes), encoding='utf-8', low_memory=False)

In [3]:
# Download the three source tables from S3, one DataFrame per table.
df_Crashes = get_data_from_s3(foldername="Crashes", filename="Crashes.csv")
df_Vehicles = get_data_from_s3(foldername="Vehicles", filename="Vehicles.csv")
df_Persons = get_data_from_s3(foldername="Persons", filename="Persons.csv")

## Explore Datasets

### Check the number of records in each table

In [5]:
# Report how many rows the Crashes frame holds.
print("The number of records in collisions data set is %d" % df_Crashes.shape[0])

The number of records in collisions data set is 1827692


In [7]:
# Report how many rows the Vehicles frame holds.
print("The number of records in vehicles data set is %d" % df_Vehicles.shape[0])

The number of records in vehicles data set is 3664394


In [8]:
# Report how many rows the Persons frame holds.
print("The number of records in persons data set is %d" % df_Persons.shape[0])

The number of records in persons data set is 4452223


### Get the summary of each table

In [10]:
# Print the index range, column names and dtypes of the Crashes frame.
df_Crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827692 entries, 0 to 1827691
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   crash_date                     object 
 1   crash_time                     object 
 2   on_street_name                 object 
 3   number_of_persons_injured      float64
 4   number_of_persons_killed       float64
 5   number_of_pedestrians_injured  int64  
 6   number_of_pedestrians_killed   int64  
 7   number_of_cyclist_injured      int64  
 8   number_of_cyclist_killed       int64  
 9   number_of_motorist_injured     int64  
 10  number_of_motorist_killed      int64  
 11  contributing_factor_vehicle_1  object 
 12  contributing_factor_vehicle_2  object 
 13  collision_id                   int64  
 14  vehicle_type_code1             object 
 15  vehicle_type_code2             object 
 16  borough                        object 
 17  zip_code                       object 
 18  la

In [11]:
# Print the index range, column names and dtypes of the Vehicles frame.
df_Vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3664394 entries, 0 to 3664393
Data columns (total 25 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   unique_id                    int64  
 1   collision_id                 int64  
 2   crash_date                   object 
 3   crash_time                   object 
 4   vehicle_id                   object 
 5   state_registration           object 
 6   vehicle_type                 object 
 7   contributing_factor_1        object 
 8   vehicle_make                 object 
 9   vehicle_year                 float64
 10  travel_direction             object 
 11  vehicle_occupants            float64
 12  driver_sex                   object 
 13  driver_license_status        object 
 14  driver_license_jurisdiction  object 
 15  pre_crash                    object 
 16  point_of_impact              object 
 17  vehicle_damage               object 
 18  public_property_damage       object 
 19  

In [12]:
# Print the index range, column names, dtypes and memory usage of the Persons frame.
df_Persons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4452223 entries, 0 to 4452222
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   unique_id              int64  
 1   collision_id           int64  
 2   crash_date             object 
 3   crash_time             object 
 4   person_id              object 
 5   person_type            object 
 6   person_injury          object 
 7   vehicle_id             float64
 8   ped_role               object 
 9   person_sex             object 
 10  person_age             float64
 11  ejection               object 
 12  emotional_status       object 
 13  bodily_injury          object 
 14  position_in_vehicle    object 
 15  safety_equipment       object 
 16  complaint              object 
 17  ped_location           object 
 18  ped_action             object 
 19  contributing_factor_1  object 
 20  contributing_factor_2  object 
dtypes: float64(2), int64(2), object(17)
memory usage: 713.

### Check for Missing Values

In [13]:
# For each Crashes column, flag whether it contains at least one missing value.
df_Crashes.isna().any()

crash_date                       False
crash_time                       False
on_street_name                    True
number_of_persons_injured         True
number_of_persons_killed          True
number_of_pedestrians_injured    False
number_of_pedestrians_killed     False
number_of_cyclist_injured        False
number_of_cyclist_killed         False
number_of_motorist_injured       False
number_of_motorist_killed        False
contributing_factor_vehicle_1     True
contributing_factor_vehicle_2     True
collision_id                     False
vehicle_type_code1                True
vehicle_type_code2                True
borough                           True
zip_code                          True
latitude                          True
longitude                         True
location                          True
cross_street_name                 True
off_street_name                   True
contributing_factor_vehicle_3     True
contributing_factor_vehicle_4     True
vehicle_type_code_3      

In [14]:
# For each Vehicles column, flag whether it contains at least one missing value.
df_Vehicles.isna().any()

unique_id                      False
collision_id                   False
crash_date                     False
crash_time                     False
vehicle_id                     False
state_registration              True
vehicle_type                    True
contributing_factor_1           True
vehicle_make                    True
vehicle_year                    True
travel_direction                True
vehicle_occupants               True
driver_sex                      True
driver_license_status           True
driver_license_jurisdiction     True
pre_crash                       True
point_of_impact                 True
vehicle_damage                  True
public_property_damage          True
contributing_factor_2           True
vehicle_damage_1                True
vehicle_damage_2                True
vehicle_damage_3                True
vehicle_model                   True
public_property_damage_type     True
dtype: bool

In [15]:
# For each Persons column, flag whether it contains at least one missing value.
df_Persons.isna().any()

unique_id                False
collision_id             False
crash_date               False
crash_time               False
person_id                 True
person_type              False
person_injury            False
vehicle_id                True
ped_role                  True
person_sex                True
person_age                True
ejection                  True
emotional_status          True
bodily_injury             True
position_in_vehicle       True
safety_equipment          True
complaint                 True
ped_location              True
ped_action                True
contributing_factor_1     True
contributing_factor_2     True
dtype: bool

In [16]:
# Count the Persons rows whose person_id is missing.
df_Persons["person_id"].isna().sum()

19

In [17]:
# Show the Persons rows whose person_id is missing.
df_Persons.loc[df_Persons["person_id"].isna()]

Unnamed: 0,unique_id,collision_id,crash_date,crash_time,person_id,person_type,person_injury,vehicle_id,ped_role,person_sex,...,ejection,emotional_status,bodily_injury,position_in_vehicle,safety_equipment,complaint,ped_location,ped_action,contributing_factor_1,contributing_factor_2
49026,5505096,3867504,2018-03-22T00:00:00.000,16:50,,Pedestrian,Injured,,Pedestrian,M,...,,Conscious,Elbow-Lower-Arm-Hand,,,Fracture - Dislocation,Pedestrian/Bicyclist/Other Pedestrian Not at I...,Other Actions in Roadway,,
720721,5238969,3904727,2018-05-19T00:00:00.000,19:59,,Occupant,Injured,16692615.0,Driver,M,...,Not Ejected,Conscious,Elbow-Lower-Arm-Hand,Driver,Lap Belt & Harness,Fracture - Dislocation,,,,
948660,5270154,3884772,2018-04-19T00:00:00.000,12:20,,Occupant,Unspecified,16707972.0,Driver,M,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt & Harness,Does Not Apply,,,,
1083898,5231134,3902330,2018-05-11T00:00:00.000,2:20,,Occupant,Unspecified,16688838.0,Driver,M,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt & Harness,Does Not Apply,,,,
1205662,5245854,3905895,2018-05-22T00:00:00.000,4:50,,Occupant,Injured,16696011.0,Driver,M,...,Not Ejected,Conscious,Elbow-Lower-Arm-Hand,Driver,Lap Belt & Harness,Contusion - Bruise,,,,
1387279,5505067,3875537,2018-04-05T00:00:00.000,14:48,,Occupant,Unspecified,16823614.0,Passenger,M,...,Not Ejected,Does Not Apply,Does Not Apply,"Front passenger, if two or more persons, inclu...",Lap Belt & Harness,Does Not Apply,,,,
1695412,5235837,3903420,2018-05-18T00:00:00.000,9:38,,Occupant,Unspecified,16691111.0,Passenger,M,...,Not Ejected,Does Not Apply,Does Not Apply,"Front passenger, if two or more persons, inclu...",Lap Belt & Harness,Does Not Apply,,,,
1885224,5267622,3871996,2018-03-29T00:00:00.000,11:15,,Occupant,Unspecified,16706732.0,Driver,M,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt & Harness,Does Not Apply,,,,
2054551,5272967,3887315,2018-04-24T00:00:00.000,14:00,,Occupant,Unspecified,16709418.0,Passenger,M,...,Not Ejected,Does Not Apply,Does Not Apply,"Front passenger, if two or more persons, inclu...",Lap Belt,Does Not Apply,,,,
2105902,5249473,3906358,2018-05-23T00:00:00.000,13:20,,Occupant,Unspecified,16697814.0,Driver,M,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt & Harness,Does Not Apply,,,,


In [19]:
# True if any Persons row is an exact duplicate (all columns equal) of an earlier row.
# NOTE(review): only the Persons frame is checked here; no equivalent check for
# Crashes or Vehicles is visible in this section, so confirm before relying on
# a "no duplicates in any table" statement.
df_Persons.duplicated().any()

False

### Conclusion

#### We can see that there are no duplicate records in any of the tables, and that the collisions and vehicles tables do not have null values in any of their unique identifier columns. However, there are some null values in the person_id column of the Persons table. This column will be imputed with an appropriate value in the data transformation script.