In [47]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import csv

# Yearly "crime open database" core extracts to load (file names inside national_crime_data/).
databases = ['crime_open_database_core_2018.csv', 'crime_open_database_core_2017.csv']

# Read every yearly file first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on each iteration, and
# seeding the result with an empty DataFrame can disturb column dtypes.
yearly_frames = [pd.read_csv("national_crime_data/" + db) for db in databases]
crimes_df = pd.concat(yearly_frames, ignore_index=True)

crimes_df.columns

Index(['uid', 'city_name', 'offense_code', 'offense_type', 'offense_group',
       'offense_against', 'date_single', 'longitude', 'latitude',
       'location_type', 'location_category', 'census_block', 'date_start',
       'date_end'],
      dtype='object')

In [None]:
######################################################################
# Credit for this code goes to Sarah Okamoto, modified by Prathik Rao
######################################################################

# downloaded Virginia dataframe from tiger:
# https://www2.census.gov/geo/tiger/TIGER2019/TRACT/

# https://www2.census.gov/geo/tiger/TIGER2019/TRACT/tl_2019_01_tract.zip

# import geopandas as gpd
# from shapely.geometry import Point

import json
import urllib.request  # a bare `import urllib` does not guarantee the `request` submodule is loaded

import requests

# Cache of API answers keyed by the string "(lat, lon)", so each distinct
# coordinate pair is only sent to the FCC service once.
state_lookup = {}

# (lat, lon) pairs for which the API returned no state code.
bad_lat_lons = set()

# One state code per row of crimes_df, in row order, so the list can be
# attached as a new column after the loop.
state_codes = []
for lat, lon in zip(crimes_df["latitude"], crimes_df["longitude"]):
    lookup_key = f"({lat}, {lon})"
    if lookup_key in state_lookup:
        # reuse the stored answer for a coordinate pair seen before
        state_code = state_lookup[lookup_key]
    else:
        # ask the FCC block-lookup API which state contains this point
        point_request = f"https://geo.fcc.gov/api/census/block/find?latitude={lat}&longitude={lon}&showall=false&format=json"
        with urllib.request.urlopen(point_request) as url:
            data = json.loads(url.read().decode())
        state_code = data["State"]["code"]
        if state_code is None:
            print(f"something went wrong with ({lat}, {lon})")
            # crimes_df has no address column, so only the coordinates are recorded
            bad_lat_lons.add((lat, lon))
        state_lookup[lookup_key] = state_code
    state_codes.append(state_code)


print("bad lat lons:")
print(bad_lat_lons)
# pandas refuses to build a DataFrame from a set (unordered), so hand it a sorted list
bad_lat_lon_df = pd.DataFrame(sorted(bad_lat_lons), columns=["latitude", "longitude"])
bad_lat_lon_df.to_csv("bad_lat_lons.csv")

# store the per-row codes (the whole list, not the last scalar looked up)
assert len(state_codes) == len(crimes_df), "expected exactly one state code per crime row"
crimes_df['state_code'] = state_codes

# optional follow-up: drop rows that aren't filled with a state value
# crimes_df = crimes_df.dropna(subset=['state_code'])

crimes_df.head()

In [None]:
# Keep only the two columns needed for the per-city offense tallies.
temp = crimes_df.loc[:, ['city_name', 'offense_against']]

# Split the records by the value of offense_against.
criminal_offenses = temp.loc[temp['offense_against'].eq("persons")]
civil_offenses = temp.loc[temp['offense_against'].eq("property")]
other_offenses = temp.loc[temp['offense_against'].isin(["society", "other"])]
other_offenses.head()

In [None]:
# Count the rows per city in each offense subset and give the count column
# a name that says which subset it came from.
criminal_offenses_count_by_city = (
    criminal_offenses
    .groupby('city_name')
    .count()
    .rename(columns={'offense_against': 'city_level_criminal_offense_count'})
)
civil_offenses_count_by_city = (
    civil_offenses
    .groupby('city_name')
    .count()
    .rename(columns={'offense_against': 'city_level_civil_offense_count'})
)
other_offenses_count_by_city = (
    other_offenses
    .groupby('city_name')
    .count()
    .rename(columns={'offense_against': 'city_level_other_offense_count'})
)

# Default (inner) merges on city_name: only cities present in all three
# count tables end up in crime_stats.
crime_stats = (
    criminal_offenses_count_by_city
    .merge(civil_offenses_count_by_city, on='city_name')
    .merge(other_offenses_count_by_city, on='city_name')
)
crime_stats

In [None]:
# Read the project dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="BISTRA_GROUP_PROJECT_SMALL.csv")
df.head(n=5)

In [None]:
# Show the distinct City values in df next to the index of crime_stats.
print(df["City"].unique())
print(crime_stats.index)

In [None]:
# Default (inner) join: df rows are matched through their City value against
# crime_stats' city_name; rows without a match are left out.
merged = df.merge(crime_stats, left_on='City', right_on='city_name')
merged.shape

In [None]:
# helper functions for displaying table data

import numpy as np
from IPython.display import display_html

def display_side_by_side(series_obj, n):
    """Show tabular data as ``n`` tables placed next to each other.

    Parameters
    ----------
    series_obj : anything ``pd.DataFrame(...)`` accepts
        The data to display; it is wrapped in a DataFrame first.
    n : int
        Number of side-by-side tables to split the rows into.

    The rows are split into ``n`` consecutive chunks whose sizes differ by at
    most one, so every row is shown. (Rounding ``len / n`` to get a fixed
    chunk size drops trailing rows, e.g. 10 rows over 3 tables shows only 9.)
    ``n == 0`` raises ZeroDivisionError.
    """
    frame = pd.DataFrame(series_obj)
    base_size, leftover = divmod(len(frame), n)
    chunks = []
    start = 0
    for position in range(n):
        # the first `leftover` chunks take one extra row each
        stop = start + base_size + (1 if position < leftover else 0)
        chunks.append(frame[start:stop])
        start = stop
    helper(chunks)

def helper(args):
    """Render each frame in ``args`` as an HTML table and display them inline.

    Parameters
    ----------
    args : iterable of objects with a ``to_html()`` method
        The tables to show, in left-to-right order.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Style only the opening <table ...> tags. Replacing the bare word 'table'
    # would also rewrite the closing </table> tags and any cell text that
    # happens to contain "table".
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)
    
# Treat the -999 sentinel as a missing value.
merged = merged.replace(to_replace=-999, value=np.nan)

# Percentage of NaNs in each column, relative to the number of rows in
# `merged` itself. Dividing by len(df) is wrong because the merge that built
# `merged` can change the row count.
percent_missing = merged.isnull().mean() * 100
missing_value_df = pd.DataFrame({'column_name': merged.columns, 'percent_missing': percent_missing})

display_side_by_side(missing_value_df, 3)

In [None]:
# Display the City column of the merged frame as a one-column DataFrame.
merged.loc[:, ['City']]