In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Raw "Offenses Known to Law Enforcement by City, 2013" table for New York (Table 8).
# The file begins with title rows, so the real column labels are not on the first line.
NYC_CRIME_CSV_URL = 'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv'
nyc_crime_raw = pd.read_csv(NYC_CRIME_CSV_URL)

In [4]:
# Preview the first five rows (the default for head) to see how the CSV was parsed.
nyc_crime_raw.head()

Unnamed: 0,Table 8,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,NEW YORK,,,,,,,,,,,,
1,Offenses Known to Law Enforcement,,,,,,,,,,,,
2,"by City, 2013",,,,,,,,,,,,
3,City,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape\n(revised\ndefinition)1,Rape\n(legacy\ndefinition)2,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson3
4,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0


In [5]:
# The file starts with descriptive title text, so pandas used placeholder headers
# ('Table 8', 'Unnamed: 1', ...); the real column labels appear in row 3 of the preview.
# Replace the placeholder headers with short, readable names (13 columns, in file order).
clean_column_names = [
    'City',
    'Population',
    'Violent Crime',
    'Murder',
    'Rape1',
    'Rape2',
    'Robbery',
    'Aggravated Assault',
    'Property Crime',
    'Burglary',
    'Larceny Theft',
    'Motor Vehicle Theft',
    'Arson',
]
nyc_crime_raw.columns = clean_column_names

In [6]:
# Re-inspect the first five rows to verify that the renamed headers took effect.
nyc_crime_raw.head()

Unnamed: 0,City,Population,Violent Crime,Murder,Rape1,Rape2,Robbery,Aggravated Assault,Property Crime,Burglary,Larceny Theft,Motor Vehicle Theft,Arson
0,NEW YORK,,,,,,,,,,,,
1,Offenses Known to Law Enforcement,,,,,,,,,,,,
2,"by City, 2013",,,,,,,,,,,,
3,City,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape\n(revised\ndefinition)1,Rape\n(legacy\ndefinition)2,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft,Arson3
4,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0


In [7]:
# Rows 0-3 hold the table title and the original header text, not city records,
# so keep everything from row 4 onward.
# .copy() makes nyc_crime an independent frame instead of a slice of nyc_crime_raw,
# so later column assignments do not depend on pandas' view-vs-copy behaviour
# (the usual source of SettingWithCopyWarning).
nyc_crime = nyc_crime_raw.iloc[4:].copy()

In [8]:
# Check that the frame now starts at the first real city record.
nyc_crime.head()

Unnamed: 0,City,Population,Violent Crime,Murder,Rape1,Rape2,Robbery,Aggravated Assault,Property Crime,Burglary,Larceny Theft,Motor Vehicle Theft,Arson
4,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0
5,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0
6,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0
7,Albany,97956,791,8,,30,227,526,4090,705,3243,142,
8,Albion Village,6388,23,0,,3,4,16,223,53,165,5,


In [35]:
# Per-column summary; the columns still hold text at this point, so the summary
# reports count / unique / top / freq rather than numeric statistics.
crime_summary = nyc_crime.describe()
crime_summary

Unnamed: 0,City,Population,Violent Crime,Murder,Rape1,Rape2,Robbery,Aggravated Assault,Property Crime,Burglary,Larceny Theft,Motor Vehicle Theft,Arson
count,351,348,348,348,0.0,348,348,348,348,348,348,348,187
unique,351,347,82,14,0.0,26,51,69,239,124,232,51,13
top,Rye Brook Village,9517,0,0,,0,0,0,0,0,0,0,0
freq,1,2,51,299,,204,139,66,6,21,7,93,133


In [36]:
# Number of observations (rows) and columns in the trimmed frame.
n_rows, n_cols = nyc_crime.shape
(n_rows, n_cols)

(351, 13)

In [37]:
# With 351 observations across 13 columns confirmed, count the missing values per column.
missing_values_count = nyc_crime.isna().sum()
print(missing_values_count)

City                     0
Population               3
Violent Crime            3
Murder                   3
Rape1                  351
Rape2                    3
Robbery                  3
Aggravated Assault       3
Property Crime           3
Burglary                 3
Larceny Theft            3
Motor Vehicle Theft      3
Arson                  164
dtype: int64


In [38]:
# Discard every row that is missing a value in at least one of the key columns.
required_columns = ['Population', 'Murder', 'Robbery', 'Property Crime']
nyc_crime = nyc_crime.dropna(subset=required_columns)

In [39]:
# Rename 'Rape2' to just 'Rape'.
# rename() returns a new DataFrame; rebinding nyc_crime to that result (instead of
# using inplace=True) leaves nyc_crime as an independent frame. The session-wide
# `pd.options.mode.chained_assignment = None` override, which hid
# SettingWithCopyWarning everywhere, is therefore no longer needed and is dropped.
# If a later cell still triggers that warning, fix it at the source with .copy()
# or .loc rather than silencing it globally.
nyc_crime = nyc_crime.rename(columns={'Rape2': 'Rape'})

In [40]:
# Drop 'Rape1': every value in it is missing, so it carries no information.
# Rebind the result rather than using inplace=True, so the step returns a new frame
# and does not silently mutate one that earlier cells already displayed.
nyc_crime = nyc_crime.drop(columns=['Rape1'])

### Now that the data is clean, create and convert the features: Population Squared, Murder, and Robbery

In [53]:
# To create the population-squared feature, first strip the commas from the
# population amounts.
# - astype(str) lets this cell be re-run after 'Population' has already been
#   converted to int (the .str accessor only works on string data).
# - regex=False treats ',' as a literal character; the default value of `regex`
#   differs between pandas versions, so state it explicitly.
nyc_crime['Population'] = (
    nyc_crime['Population'].astype(str).str.replace(',', '', regex=False)
)

In [55]:
# Next, convert the column from string to integer.
# Use an explicit 64-bit dtype: plain `int` maps to a 32-bit integer on some
# platforms (e.g. Windows with NumPy < 2), where squaring a large population in
# the next step would overflow silently.
nyc_crime['Population'] = nyc_crime['Population'].astype('int64')

In [56]:
# Derive the squared-population feature from the integer 'Population' column.
nyc_crime['Population Squared'] = nyc_crime['Population'].pow(2)

In [59]:
# Spot-check the new feature next to the column it was derived from.
feature_columns = ['Population', 'Population Squared']
nyc_crime[feature_columns].head()

Unnamed: 0,Population,Population Squared
4,1861,3463321
5,2577,6640929
6,2846,8099716
7,97956,9595377936
8,6388,40806544


In [62]:
# 'Murder' is still stored as text; cast it to int before deriving the indicator.
# NOTE(review): unlike 'Population' and 'Robbery', no comma stripping is done here;
# presumably no murder count contains a thousands separator - confirm against the data.
murder_counts = nyc_crime['Murder'].astype(int)
nyc_crime['Murder'] = murder_counts

In [63]:
# Collapse 'Murder' into a binary indicator: 1 when the count is above zero, else 0.
had_murder = nyc_crime['Murder'] > 0
nyc_crime['Murder'] = had_murder.astype(int)

In [67]:
# To build the converted 'Robbery' column, first strip the commas from the amounts.
# - astype(str) lets this cell be re-run after 'Robbery' has already been converted
#   to int (the .str accessor only works on string data).
# - regex=False treats ',' as a literal character; the default value of `regex`
#   differs between pandas versions, so state it explicitly.
nyc_crime['Robbery'] = (
    nyc_crime['Robbery'].astype(str).str.replace(',', '', regex=False)
)

In [68]:
# As with 'Population' and 'Murder', cast the cleaned 'Robbery' strings to int
# before deriving the indicator.
robbery_counts = nyc_crime['Robbery'].astype(int)
nyc_crime['Robbery'] = robbery_counts

In [69]:
# Collapse 'Robbery' into a binary indicator: 1 when the count is above zero, else 0.
had_robbery = nyc_crime['Robbery'].gt(0)
nyc_crime['Robbery'] = had_robbery.astype(int)