In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# Relative path to the 2015 Portland crime CSV that the notebook loads
file = "Resources/2015_Portland_Crime_Data.csv"

In [3]:
# Load the CSV into a DataFrame. The encoding is given explicitly as
# ISO-8859-1 (presumably the file does not decode with pandas' default
# UTF-8 -- confirm against the raw file).
df_2015 = pd.read_csv(
    filepath_or_buffer=file,
    encoding="ISO-8859-1",
)

In [4]:
# Preview the first five rows of the freshly loaded data
df_2015.head(n=5)

Unnamed: 0,Address,CaseNumber,CrimeAgainst,Neighborhood,OccurDate,OccurTime,OffenseCategory,OffenseType,OpenDataLat,OpenDataLon,OpenDataX,OpenDataY,ReportDate,OffenseCount
0,,15-X197430,Person,Piedmont,5/12/2015,1400,Assault Offenses,Intimidation,,,,,5/12/2015,1
1,,15-X4282999,Person,Buckman West,5/1/2015,2143,Assault Offenses,Simple Assault,,,,,5/1/2015,1
2,,15-X4283033,Person,University Park,5/1/2015,1625,Assault Offenses,Simple Assault,,,,,5/1/2015,1
3,,15-X4283218,Person,Madison South,5/1/2015,1820,Assault Offenses,Simple Assault,,,,,5/1/2015,1
4,,15-X4283218,Person,Madison South,5/1/2015,1820,Kidnapping/Abduction,Kidnapping/Abduction,,,,,5/1/2015,1


In [5]:
# Remove the columns that are not used in the rest of the notebook.
#
# A single drop() that rebinds df_2015 replaces the original run of `del`
# statements. `del df_2015[col]` raises KeyError when the cell is executed a
# second time (the column is already gone); errors='ignore' skips columns that
# are no longer present, so this cell can be re-run safely.
# Trade-off: a misspelled name in the list below is silently skipped too.
extraneous_columns = [
    'Address',
    'CaseNumber',
    'CrimeAgainst',
    'OpenDataLat',
    'OpenDataLon',
    'OpenDataX',
    'OpenDataY',
    'ReportDate',
    'OffenseCount',
]
df_2015 = df_2015.drop(columns=extraneous_columns, errors='ignore')

# Alternative: build a new DataFrame that selects only the wanted columns.
df_2015.head()

Unnamed: 0,Neighborhood,OccurDate,OccurTime,OffenseCategory,OffenseType
0,Piedmont,5/12/2015,1400,Assault Offenses,Intimidation
1,Buckman West,5/1/2015,2143,Assault Offenses,Simple Assault
2,University Park,5/1/2015,1625,Assault Offenses,Simple Assault
3,Madison South,5/1/2015,1820,Assault Offenses,Simple Assault
4,Madison South,5/1/2015,1820,Kidnapping/Abduction,Kidnapping/Abduction


In [6]:
# Count the non-missing values in each column; a column whose total is lower
# than the others has rows with missing data
df_2015.notna().sum()

Neighborhood       36530
OccurDate          37672
OccurTime          37672
OffenseCategory    37672
OffenseType        37672
dtype: int64

In [7]:
# List the dtype of every column. 'object' is the generic dtype pandas uses
# for arbitrary Python objects such as strings.
df_2015.dtypes

Neighborhood       object
OccurDate          object
OccurTime           int64
OffenseCategory    object
OffenseType        object
dtype: object

In [8]:
# Parse the 'OccurDate' column into datetimes and store the result in a new
# 'Date' column (the original 'OccurDate' column is kept as-is)
occur_dates = df_2015['OccurDate']
df_2015['Date'] = pd.to_datetime(occur_dates)

# Confirm the dtype of the new 'Date' column
df_2015.dtypes

Neighborhood               object
OccurDate                  object
OccurTime                   int64
OffenseCategory            object
OffenseType                object
Date               datetime64[ns]
dtype: object

In [9]:
# Preview the first five rows, now including the parsed 'Date' column
df_2015.head(n=5)

Unnamed: 0,Neighborhood,OccurDate,OccurTime,OffenseCategory,OffenseType,Date
0,Piedmont,5/12/2015,1400,Assault Offenses,Intimidation,2015-05-12
1,Buckman West,5/1/2015,2143,Assault Offenses,Simple Assault,2015-05-01
2,University Park,5/1/2015,1625,Assault Offenses,Simple Assault,2015-05-01
3,Madison South,5/1/2015,1820,Assault Offenses,Simple Assault,2015-05-01
4,Madison South,5/1/2015,1820,Kidnapping/Abduction,Kidnapping/Abduction,2015-05-01


In [10]:
# Keep only the rows whose 'Date' falls in calendar year 2015
occurred_in_2015 = df_2015['Date'].dt.year.eq(2015)
df_filter = df_2015.loc[occurred_in_2015]
df_filter

Unnamed: 0,Neighborhood,OccurDate,OccurTime,OffenseCategory,OffenseType,Date
0,Piedmont,5/12/2015,1400,Assault Offenses,Intimidation,2015-05-12
1,Buckman West,5/1/2015,2143,Assault Offenses,Simple Assault,2015-05-01
2,University Park,5/1/2015,1625,Assault Offenses,Simple Assault,2015-05-01
3,Madison South,5/1/2015,1820,Assault Offenses,Simple Assault,2015-05-01
4,Madison South,5/1/2015,1820,Kidnapping/Abduction,Kidnapping/Abduction,2015-05-01
...,...,...,...,...,...,...
37667,,7/28/2015,2319,Robbery,Robbery,2015-07-28
37668,,6/22/2015,1846,Assault Offenses,Simple Assault,2015-06-22
37669,,11/21/2015,232,Assault Offenses,Simple Assault,2015-11-21
37670,Goose Hollow,11/20/2015,11,Assault Offenses,Simple Assault,2015-11-20


In [11]:
# Order the 2015 rows chronologically (earliest date first)
df_2015sort = df_filter.sort_values(by='Date', ascending=True)
df_2015sort

Unnamed: 0,Neighborhood,OccurDate,OccurTime,OffenseCategory,OffenseType,Date
5759,Downtown,1/1/2015,1700,Larceny Offenses,All Other Larceny,2015-01-01
23283,University Park,1/1/2015,0,Fraud Offenses,Identity Theft,2015-01-01
32318,Wilkes,1/1/2015,0,Larceny Offenses,All Other Larceny,2015-01-01
12678,Northwest,1/1/2015,1,Assault Offenses,Intimidation,2015-01-01
4376,Northwest,1/1/2015,1,Larceny Offenses,Shoplifting,2015-01-01
...,...,...,...,...,...,...
3539,Argay,12/31/2015,2015,Robbery,Robbery,2015-12-31
3540,Multnomah,12/31/2015,1200,Assault Offenses,Intimidation,2015-12-31
3543,Sabin,12/31/2015,2314,Assault Offenses,Simple Assault,2015-12-31
9796,Lloyd,12/31/2015,1555,Larceny Offenses,Shoplifting,2015-12-31


In [12]:
# Non-missing values per column after filtering to 2015 and sorting
df_2015sort.notna().sum()

Neighborhood       36278
OccurDate          37374
OccurTime          37374
OffenseCategory    37374
OffenseType        37374
Date               37374
dtype: int64

In [13]:
# Discard every row that has a missing value in at least one column.
# dropna() can also be restricted to particular columns (see `subset`):
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
df_2015clean = df_2015sort.dropna(axis='index', how='any')

In [14]:
# Check the result of the drop: every column should now report the same
# number of non-missing values
df_2015clean.notna().sum()

Neighborhood       36278
OccurDate          36278
OccurTime          36278
OffenseCategory    36278
OffenseType        36278
Date               36278
dtype: int64

In [15]:
# Number of rows per offense category, most frequent first
offense_categories = df_2015clean['OffenseCategory']
offense_categories.value_counts()

Larceny Offenses                15907
Assault Offenses                 4508
Vandalism                        4034
Motor Vehicle Theft              2959
Fraud Offenses                   2826
Burglary                         2436
Drug/Narcotic Offenses           1296
Robbery                           582
Counterfeiting/Forgery            549
Weapon Law Violations             368
Sex Offenses                      293
Arson                             130
Prostitution Offenses             123
Stolen Property Offenses           89
Kidnapping/Abduction               44
Embezzlement                       40
Animal Cruelty Offenses            26
Homicide Offenses                  19
Sex Offenses, Nonforcible          16
Pornography/Obscene Material       12
Human Trafficking Offenses         10
Extortion/Blackmail                 7
Bribery                             4
Name: OffenseCategory, dtype: int64

In [16]:
# Number of rows per offense type, most frequent first
offense_types = df_2015clean['OffenseType']
offense_types.value_counts()

Theft From Motor Vehicle                       6543
All Other Larceny                              5095
Vandalism                                      4034
Motor Vehicle Theft                            2959
Shoplifting                                    2493
Burglary                                       2436
Simple Assault                                 2192
Identity Theft                                 1644
Intimidation                                   1336
Drug/Narcotic Violations                       1291
Theft From Building                            1040
Aggravated Assault                              980
False Pretenses/Swindle/Confidence Game         886
Robbery                                         582
Counterfeiting/Forgery                          549
Theft of Motor Vehicle Parts or Accessories     538
Weapons Law Violations                          368
Credit Card/ATM Fraud                           270
Rape                                            154
Arson       

In [17]:
# Number of rows per neighborhood, most frequent first
neighborhoods = df_2015clean['Neighborhood']
neighborhoods.value_counts()

Hazelwood              2626
Downtown               2616
Lents                  1448
Powellhurst-Gilbert    1380
Old Town/Chinatown     1279
                       ... 
Woodland Park            19
Northwest Heights        14
Crestwood                13
Marshall Park             9
Healy Heights             4
Name: Neighborhood, Length: 96, dtype: int64