Final Presentation Notebook

Datasets:

https://data.wprdc.org/dataset/neighborhoods-with-snap-data/resource/bce22c26-9d3e-4e3f-8405-a35c4b7765b6

https://data.wprdc.org/dataset/arrest-data/resource/e03a89dd-134a-4ee8-a2bd-62c40aeebc6f

https://data.wprdc.org/dataset/pbp-fire-arm-seizures/resource/e967381d-d7e9-48e3-a2a2-39262f7fa5c4

Our process of finding the best neighborhood involved finding the ones that are the safest. We used datasets that had information including arrest information, crime and police reports, and gun seizures. Each neighborhood has a count of the things we are measuring, and we combined them all into one metric using all 3 datasets.

In [10]:
#Data from the first dataset, "Neighborhoods with SNAP data"
import pandas as pd
from math import isnan
snap = pd.read_csv('snapdata.csv')
seizures = pd.read_csv('firearm_seizures.csv')
arrest = pd.read_csv('arrestdata.csv')

neighborhoodCrimes = {}
neighborhoodReports = {}

#Calculating the number of serious crimes committed in each neighborhood using the SNAP data

for index,row in snap.iterrows():
    name = row["Neighborhood_2010_HOOD"]
    murders = row["SNAP_All_csv__Murder__2010_"]
    rape = row["SNAP_All_csv__Rape__2010_"]
    robbery = row["SNAP_All_csv__Robbery__2010_"]
    aggAsslt = row["F_Agr__Assault__2010_"]
    burglary = row["SNAP_All_csv__Burglary__2010_"]
    autoTheft = row["SNAP_All_csv__Auto_Theft__2010_"]

#Crimes being calculated /\
    
    crimes = murders + rape + robbery + aggAsslt + burglary + autoTheft
    
    if name in neighborhoodCrimes:
        neighborhoodCrimes[name] += crimes
    else:
        neighborhoodCrimes[name] = crimes

#Crimes being reported in each neighborhood
    
    policeReports = row["SNAP_All_csv__Part_1__Major_Cri"]
    policeReports2 = row["SNAP_All_csv__Part_2_Reports__2"]
    policeReports3 = row["SNAP_All_csv__Other_Police_Repo"]
    
    totalReports = policeReports + policeReports2 + policeReports3

    if name in neighborhoodReports:
        neighborhoodReports[name] += totalReports
    else:
        neighborhoodReports[name] = totalReports

#Combining the two datasets into one total incidents dictionary
totalIncidents = {}
for neighborhood in neighborhoodReports:
    if neighborhood in totalIncidents:
        totalIncidents[neighborhood] += neighborhoodReports[neighborhood] + neighborhoodCrimes[neighborhood]
    else:
        totalIncidents[neighborhood] = neighborhoodReports[neighborhood] + neighborhoodCrimes[neighborhood]

#Getting maximum number of crimes to use as a "worst case" so that we can compare neighborhoods
worst = totalIncidents[max(totalIncidents, key=totalIncidents.get)]

#Using our total incidents dictionary and worst neighborhood variable to put each neighborhood on a scale, with 0 being the least incidents and 100 being the most
bestNeighborhood = {}
for neighborhood in totalIncidents:
    if neighborhood not in bestNeighborhood:
        val = round((totalIncidents[neighborhood]) / worst * 100, 2)
        bestNeighborhood[neighborhood] = val

#Sorting from least crime to most
bestNeighborhood = dict(sorted(bestNeighborhood.items(), key=lambda item: item[1]))

source1 = {}
for i in bestNeighborhood:
    if i not in source1:
        val = round(100 - bestNeighborhood[i], 2)
        source1[i] = val 

In [None]:
#Data from the second dataset "Firearms Seizure Data"
neighborhoods = {}

#counting number of seizures per neighborhoods
for index,row in seizures.iterrows():
    name = row["neighborhood"]
    total = row["total_count"]

    if name in neighborhoods:
        neighborhoods[name] += total
    else:
        neighborhoods[name] = total

#removing the values that didn't have a neighborhood specified in the csv
hoods = filter(lambda k: not pd.isna(k), neighborhoods)
hoods = {k: neighborhoods[k] for k in neighborhoods if not pd.isna(k)}

#finding the most seizures in a neighborhoods
mostSeizures = hoods[max(hoods, key=hoods.get)]

#sorting from least seizures to most
sortedHoods = dict(sorted(hoods.items(), key=lambda item: item[1]))

weights = {}
for hood in sortedHoods:
    if hood not in weights:
        val = round((sortedHoods[hood]) / mostSeizures * 100, 2)
        weights[hood] = val

source2 = {}
for i in weights:
    if i not in source2:
        val = 100 - weights[i]
        source2[i] = val

In [9]:
#Data from the third dataset, "Arrest Data"
arrestHoods = {}

for index,row in arrest.iterrows():
    name = row["INCIDENTNEIGHBORHOOD"]

    if name in arrestHoods:
        arrestHoods[name] += 1
    else:
        arrestHoods[name] = 1
mostArrests = arrestHoods[max(arrestHoods, key=arrestHoods.get)]
sortedArrests = dict(sorted(arrestHoods.items(), key=lambda item: item[1]))

source3 = {}
for hood in sortedArrests:
    if hood not in source3:
        val = round((sortedArrests[hood] / mostArrests) * 100, 2)
        val2 = round(100 - val, 2)
        source3[hood] = val2

{'Mt. Oliver Neighborhood': 99.95, 'Troy Hill-Herrs Island': 99.86, 'Mt. Oliver Boro': 99.59, 'Central Northside': 99.48, 'Regent Square': 99.17, 'Ridgemont': 99.17, 'New Homestead': 99.12, 'Swisshelm Park': 99.03, 'Chartiers City': 98.96, 'East Carnegie': 98.92, 'St. Clair': 98.76, 'Outside County': 98.72, 'Outside State': 98.49, 'Summer Hill': 98.29, 'Oakwood': 98.15, 'Golden Triangle/Civic Arena': 98.13, 'Hays': 97.3, 'Mount Oliver': 97.21, 'Windgap': 97.21, 'Fairywood': 97.14, 'Allegheny West': 97.07, 'Arlington Heights': 96.51, 'Polish Hill': 96.26, 'Glen Hazel': 96.19, 'Friendship': 96.08, 'Morningside': 95.77, 'Upper Lawrenceville': 95.74, 'Duquesne Heights': 95.7, 'Esplen': 95.5, 'Spring Garden': 95.07, 'Stanton Heights': 94.98, 'Westwood': 94.91, 'Lincoln Place': 94.89, 'Bon Air': 93.99, 'Lower Lawrenceville': 93.87, 'Point Breeze North': 93.49, 'West End': 93.47, 'Squirrel Hill North': 93.47, 'Banksville': 93.45, 'Terrace Village': 93.31, 'Point Breeze': 93.22, 'South Shore':

In [15]:
#Combining all our data into one metric

allData = {}

#Giving the data from our first dataset a weight of 40% since we feel it's very important
for neighborhood in source1:
    val = round(source1[neighborhood] * .4, 2)
    if neighborhood in allData:
        allData[neighborhood] += val
    else:
        allData[neighborhood] = val

#Giving the data from our second dataset a weight of 20% since we feel it's less significant than the others
for neighborhood in source2:
    val = round(source2[neighborhood] * .2, 2)
    if neighborhood in allData:
        allData[neighborhood] += val
    else:
        allData[neighborhood] = val

#Giving the data from our third dataset a weight of 40% since we feel it's just as significant as the first
for neighborhood in source3:
    val = round(source3[neighborhood] * .4, 2)
    if neighborhood in allData:
        allData[neighborhood] += val
    else:
        allData[neighborhood] = val

#Cleaning the data by rounding the values to their second decimal point
for i in allData:
    allData[i] = round(allData[i], 2)
        
print(allData)

{'Fairywood': 98.0, 'New Homestead': 79.24, 'East Carnegie': 78.91, 'Ridgemont': 78.95, 'St. Clair': 98.33, 'Hays': 97.58, 'Chartiers City': 98.41, 'Summer Hill': 98.2, 'Oakwood': 98.01, 'Swisshelm Park': 98.36, 'Mt. Oliver': 58.6, 'Glen Hazel': 96.27, 'Regent Square': 98.11, 'Chateau': 92.76, 'Arlington Heights': 96.28, 'Windgap': 95.67, 'Bon Air': 94.53, 'Polish Hill': 95.51, 'Allegheny West': 95.34, 'Esplen': 95.1, 'Lincoln Place': 94.78, 'Spring Garden': 94.71, 'South Shore': 93.68, 'West End': 94.18, 'Duquesne Heights': 93.96, 'Westwood': 94.09, 'Bedford Dwellings': 85.88, 'Upper Hill': 90.99, 'North Shore': 87.88, 'Friendship': 94.48, 'Arlington': 90.95, 'Homewood West': 85.64, 'Banksville': 92.76, 'Point Breeze North': 91.95, 'California-Kirkbride': 90.65, 'Northview Heights': 85.66, 'Stanton Heights': 92.41, 'Morningside': 91.97, 'West Oakland': 91.44, 'Allegheny Center': 82.59, 'Point Breeze': 90.19, 'Overbrook': 89.7, 'Manchester': 85.93, 'Fineview': 82.6, 'Beltzhoover': 86.5