In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium
import math
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from src import helper_functions as hlp
import censusdata
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
import statsmodels.formula.api as sm
from folium.plugins import HeatMap
pd.set_option('display.max_columns', 59)

# folium heat map (done)
# more shootings per capita // regions (imported census data 2017)
# predict # of people killed, by state, region, poverty (thurs)
# regression model (thurs)
# population of area (rural or urban) compare/contrast (wed)

In [None]:
#read csv
data = pd.read_csv('data/fatal-police-shootings-data.csv')
data

In [None]:
data.age.max()

## Cleaning Data

In [None]:
#check null values
hlp.assess_NA(data)

In [None]:
#replace null values
# Each column gets a fixed placeholder, except age, which is filled with the
# mean of the reported ages.
na_fill_values = {
    'race': 'NR',
    'flee': 'Not Reported',
    'age': data.age.mean(),
    'armed': 'not reported',
    'name': 'No Name',
}
for column_name, fill_value in na_fill_values.items():
    data[column_name] = data[column_name].fillna(fill_value)

In [None]:
hlp.assess_NA(data)

In [None]:
data.dtypes

In [None]:
data.date = pd.to_datetime(data.date)

In [None]:
data.dtypes

In [None]:
data['age'].min()

In [None]:
#create new series with city and state to get coordinates for every point
new = data['state'].copy()
data["location"] = data["city"].str.cat(new, sep =", ")
data

In [None]:
#get county from city, state
locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode(data.location[100])
location[0].split(', ')[1]
location

In [None]:
# Expand the state code only where it is the trailing ", CA" suffix of the
# "city, state" string, so a city name that happens to contain "CA" is untouched.
data['location'] = data['location'].str.replace(r', CA$', ', California', regex=True)

In [None]:
# Expand selected state codes to full names. The pattern is anchored to the
# trailing ", XX" suffix so only the state part of "city, state" is rewritten;
# a plain substring replace would also hit matching letters in a city name.
state_names_to_expand = {
    'AR': 'Arkansas',
    'OH': 'Ohio',
    'CO': 'Colorado',
    'LA': 'Louisiana',
    'ME': 'Maine',
}
for state_code, state_full_name in state_names_to_expand.items():
    data['location'] = data['location'].str.replace(
        rf', {state_code}$', f', {state_full_name}', regex=True)

# Only the Ohio township is redirected to Dayton; an unqualified replace would
# also rewrite any "Harrison Township" in other states.
data['location'] = data['location'].str.replace(
    'Harrison Township, Ohio', 'Dayton, Ohio', regex=False)

# .loc writes into `data` itself; data.location[i] = ... is chained assignment
# and may only modify a temporary copy.
data.loc[2928, 'location'] = 'East Ridge, TN'

In [None]:
# .loc writes into `data` itself; data.location[i] = ... is chained assignment
# and may only modify a temporary copy.
data.loc[4958, 'location'] = "Powhatan Point, OH"

In [None]:
# 1 - convenient function to delay between geocoding calls
# (wraps locator.geocode with a minimum delay of one second between calls)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

geolocator = Nominatim(user_agent='Geo')

In [None]:
#fill latitude and longitude na
# Nominatim's usage policy caps clients at one request per second, so the
# lookups go through a RateLimiter instead of calling geocode() directly.
rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Visit only the rows that actually lack a latitude instead of a hard-coded
# range(5716), so the cell keeps working if the dataset changes size.
for i in data.index[data['latitude'].isna()]:
    match = rate_limited_geocode(data.loc[i, 'location'])
    if match is None:
        # No geocoding result: leave the row as NaN instead of crashing.
        continue
    # .loc assigns into `data` itself; data.latitude[i] = ... is chained
    # assignment and may only modify a temporary copy.
    data.loc[i, 'latitude'] = match.latitude
    data.loc[i, 'longitude'] = match.longitude

In [None]:
point = geolocator.geocode("Dayton, Ohio")

# point[1] is the (latitude, longitude) pair of the geocoding result.
# .loc assigns into `data` itself; data.latitude[i] = ... is chained
# assignment and may only modify a temporary copy.
data.loc[5117, 'latitude'] = point[1][0]
data.loc[5117, 'longitude'] = point[1][1]
data.iloc[5117]

## Generating Maps

In [None]:
#heatmap
def generateBaseMap(default_location=(37.0902, -95.7129), default_zoom_start=4.5):
    """Create an empty folium map to add layers to.

    Parameters
    ----------
    default_location : sequence of two floats
        (latitude, longitude) the map is centred on. The default is an
        immutable tuple so the shared default cannot be modified between calls.
    default_zoom_start : float
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        A map with the scale control enabled and no layers added yet.
    """
    return folium.Map(
        location=default_location,
        control_scale=True,
        zoom_start=default_zoom_start,
    )

In [None]:
#heatmap
# Built from `data`: `shootings_final` is only created further down the
# notebook, so referencing it here fails on a fresh kernel (Restart & Run All).
data_copy = data.copy()
data_copy['count'] = 1
base_map = generateBaseMap()
# One [latitude, longitude, weight] triple per distinct coordinate pair, where
# the weight is the number of shootings recorded at that point.
heat_points = (
    data_copy[['latitude', 'longitude', 'count']]
    .groupby(['latitude', 'longitude'])
    .sum()
    .reset_index()
    .values.tolist()
)
HeatMap(data=heat_points, radius=15, max_zoom=12).add_to(base_map)
base_map

In [None]:
#US coordinates
m = folium.Map(location=[37.0902, -95.7129],zoom_start=4)
# Iterate over the rows that exist rather than a hard-coded range(5716).
for row in data.itertuples():
    is_male = row.gender == 'M'
    folium.CircleMarker(
        [row.latitude, row.longitude],
        # Male victims show the location string; every other row shows its index.
        popup=f'<i>{row.location}</i>' if is_male else f'<i>{row.Index}</i>',
        radius=2,
        color='blue' if is_male else 'red',
    ).add_to(m)

m

In [None]:
data.iloc[4958]

In [None]:
geolocator = Nominatim(user_agent="cop_shooting")
location = geolocator.reverse(f"{data['latitude'][0]},{data['longitude'][0]}")
location.raw

In [None]:
data.age.mean()

In [None]:
data.age.min()

In [None]:
data.age.max()

In [None]:
data.dtypes

In [None]:
data.date = pd.to_datetime(data['date'])

In [None]:
data.dtypes

In [None]:
data

In [None]:
plt.hist(data.date,bins=6)

In [None]:
census = pd.read_csv('data/acs2017_county_data.csv')
census.CountyId = census['CountyId'].astype(str)

In [None]:
census.dtypes

In [None]:
census[(census.values.ravel() == county).reshape(census.shape).any(1)]


In [None]:
#get state name

def full_state(code):
    """Translate a two-letter state/territory code into its full name.

    Codes that are not in the lookup table are returned unchanged.
    """
    names_by_code = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
    }
    # Fall back to the input itself when the code is unknown.
    return names_by_code.get(code, code)

In [None]:
data['state_name'] = data['state'].apply(full_state)

In [None]:
data

In [None]:
#get county
locator = Nominatim(user_agent='tester')
coordinates = f'{data.latitude[0]}, {data.longitude[0]}'
location = locator.reverse(coordinates)
county = location.raw.get('address').get('county')
data

In [None]:
# errors='ignore' lets the cell run on a fresh kernel, where `data` has no
# 'county' column yet, as well as on a re-run after the column was added.
data = data.drop('county', axis=1, errors='ignore')

In [None]:
#coordinates to strings

data['coordinates'] = data['latitude'].round(2).copy().astype(str)
new = data['longitude'].round(2).copy().astype(str)
data['coordinates'] = data["coordinates"].str.cat(new, sep =", ")
data

In [None]:
#get county
def get_county(coordinates):
    """Reverse-geocode a coordinate string and return its county.

    Parameters
    ----------
    coordinates : str
        Query passed to ``locator.reverse`` (the notebook builds these as
        "latitude, longitude" strings).

    Returns
    -------
    str or None
        The 'county' entry of the result's address, or None when the lookup
        returns nothing or the address carries no county.
    """
    location = locator.reverse(coordinates)
    if location is None:
        # reverse() found nothing for this point.
        return None
    address = location.raw.get('address') or {}
    return address.get('county')

In [None]:
#data['county'] = data['coordinates'].apply(get_county)
data

In [None]:
data['county'] = data.city.copy()

In [None]:
#add county
# Nominatim's usage policy caps clients at one request per second.
rate_limited_reverse = RateLimiter(locator.reverse, min_delay_seconds=1)

# Iterate over the real index rather than a hard-coded range(5716).
for position, i in enumerate(data.index):
    location = rate_limited_reverse(data.loc[i, 'coordinates'])
    # A failed lookup yields None; store a missing county instead of crashing.
    address = (location.raw.get('address') or {}) if location is not None else {}
    # .loc assigns into `data` itself; data['county'][i] = ... is chained
    # assignment and may only modify a temporary copy.
    data.loc[i, 'county'] = address.get('county')
    if position % 500 == 0:
        # Sparse progress output instead of one line per row.
        print(position)

data

In [None]:
#lookup data on census table
census[(census['State']=='California') & (census['County']=='San Francisco County')]

In [None]:
census['private/public_work_ratio'] = census['PrivateWork']/census['PublicWork']

In [None]:
census['private/public_work_ratio']

In [None]:
data.to_csv('data/shootings-with-county.csv')

# Data with county

In [None]:
shootings = pd.read_csv('data/shootings-with-county.csv')

In [None]:
test = shootings.merge(census,left_on=['state_name','county'],right_on=['State','County'])
test[['state_name','State','county','County','IncomePerCap']].head()

In [None]:
#populationpercounty
#groupbystate,county
#heatmapbypopulation

In [None]:
test.head()

In [None]:
test.groupby(['State','County']).count()

In [None]:
test = test.rename(columns={"Unnamed: 0": "count"})
test['count'] = 1
test

In [None]:
test.groupby(['State','County']).count()

In [None]:
shootings_final = test

# Graphs

In [None]:
sns.set_style("white")
handles = ["A", "W", "B", "NR", "H", "N", "O"]
labels = ["Asian", "White", "Black", "Not Reported", "Hispanic", "Native", "Other"]

fig, ax = plt.subplots(figsize=(15, 5))
# Translate the race codes to readable names and plot them in a fixed order.
# The code/label lists were previously defined but never applied to the chart.
# NOTE(review): a code outside `handles` maps to NaN and is left off the chart.
race_names = shootings_final['race'].map(dict(zip(handles, labels)))
sns.countplot(x=race_names, order=labels, ax=ax)
ax.set_xlabel('race')
ax.set_title("Total number of people killed, by race", fontsize=17)

fig.savefig('shootingsbyrace.png')

In [None]:
shootings_final.groupby('race').count()

In [None]:
prop_killed_per_race = [87/14674252.0,1160/38929319.0,926/50477594,71/2932248,2467/223553265,40/22579629]
x = ['A','B','H','N','W','O']
fig, ax = plt.subplots(figsize=(12,8))
plt.bar(x,prop_killed_per_race, color=['tab:blue','tab:green','tab:purple','tab:brown','tab:orange','tab:pink'])
plt.title('People killed in proportion to race population',size=18)
plt.savefig('raceproportion.png')

In [None]:
for i in shootings_final.groupby('race').count()['count']:
    print(i)

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
prop_iter = iter(plt.rcParams['axes.prop_cycle'])

# Aggregate only the 'count' column, once. Summing the whole frame is wasted
# work and fails if the frame holds a datetime column.
count_by_race = shootings_final.groupby('race')['count'].sum()
# The label gives legend() an artist to show; without one it only warns.
ax.bar(count_by_race.index, count_by_race, color=next(prop_iter)['color'], label='people killed')
ax.legend()

In [None]:
# Uses shootings_final like the other cells in this section; `df` is only
# created further down the notebook and is undefined here on a fresh kernel.
shootings_final.groupby(['flee','race'])['id'].count()

In [None]:

# Uses shootings_final like the other cells in this section; `df` is only
# created further down the notebook and is undefined here on a fresh kernel.
mental_by_race = (
    shootings_final
    .groupby(['signs_of_mental_illness','race'])['id']
    .count()
    .unstack('signs_of_mental_illness')
)
mental_by_race.plot.bar(figsize=(12,5))
plt.title('Reported Signs of Mental Illness, by race',size=15)
plt.savefig('mentalstats.png')

In [None]:
# 'year' is derived here from 'date' so the cell does not depend on a column
# that is only added to shootings_final by a later cell.
shootings_by_year = shootings_final.assign(
    year=pd.to_datetime(shootings_final['date']).dt.year)
shootings_by_year.groupby(['year','body_camera'])['id'].count().unstack('body_camera').plot.bar(figsize=(12,5),color=['tab:green','tab:purple'])
plt.title("Body Camera, by year", size = 16)
# Saved under its own name: the overall body-camera chart later in the notebook
# also writes 'bodycamera.png' and would overwrite this figure.
plt.savefig('bodycamera_by_year.png')

In [None]:
shootings_final.loc[shootings_final['name'] == 'Breonna Taylor']

In [None]:
# numeric_only=True: since pandas 2.0, mean() over a frame that contains text
# columns raises instead of silently skipping them.
shootings_final.mean(numeric_only=True)

In [None]:

plt.figure(figsize=(15,5))
# sns.color_palette(...) only returns a palette and its result was discarded;
# passing the palette to countplot is what actually colours the bars.
sns.countplot(data=shootings_final, x='state', palette="Paired")

plt.title("Total number of people killed, by state", fontsize=17)
plt.savefig('Shootingsbystate.png')

In [None]:
#shootings per population
# Keyed by (State, County) so the result lines up with the per-county series
# that the following cells concatenate onto it (those are grouped the same way).
# NOTE(review): assumes (State, County) identifies the same groups as CountyId.
state_county_groups = shootings_final.groupby(['State','County'])
# Select the column before aggregating: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
counties_pop = state_county_groups['TotalPop'].mean()
counties_count = state_county_groups['count'].count()
counties_shootpc = counties_count/counties_pop
counties_shootpc.sort_values(ascending=False).head(50)

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
counties_shootpc = pd.concat([counties_shootpc,shootings_final.groupby(['State','County'])['IncomePerCap'].mean()],axis=1)

In [None]:
counties_shootpc.sort_values(0,ascending=False)

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
counties_shootpc = pd.concat([counties_shootpc,shootings_final.groupby(['State','County'])['Poverty'].mean()],axis=1)
counties_shootpc.groupby('State').mean().sort_values(0,ascending=False).head(50)

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
counties_shootpc = pd.concat([counties_shootpc,shootings_final.groupby(['State','County'])['private/public_work_ratio'].mean()],axis=1)
counties_shootpc.sort_values(0,ascending=False)

In [None]:
shootings_final.loc[(shootings_final['County'] == 'De Baca County') & (shootings_final['State'] == 'New Mexico')]

In [None]:
shootings_final.loc[shootings_final['County'] == 'Douglas County']

# Get Fips Value for county map

In [None]:
shootings_final['CountyId']

In [None]:
fips = pd.read_csv('data/fips.csv', dtype=str)
fips.index.astype(int)
fips

In [None]:
# Attach the FIPS table by (state code, county name), then discard the
# duplicated State/County columns from both sides and the FIPS 'Name' column.
redundant_columns = ['County_y', 'State_y', 'Name', 'State_x', 'County_x']
df = (
    shootings_final
    .merge(fips, left_on=['state','county'], right_on=['State','County'])
    .drop(columns=redundant_columns)
)
test = df
df

In [None]:
df

# Shootings per Capita (by County)

In [None]:
fips_groups = df.groupby(['state_name','county','FIPS'])
# Select the column before aggregating: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
spc = fips_groups['count'].count() / fips_groups['TotalPop'].mean()
# Express the rate per 100,000 residents.
spc = spc*100000
spc

In [None]:
# Select the coordinate columns before averaging: mean() over the whole frame
# raises on text columns in pandas >= 2.0.
coords = df.groupby(['state_name','county','FIPS'])[['longitude','latitude']].mean()
coords

In [None]:
# Place the per-capita rate next to the mean coordinates of each county; the
# unnamed rate series becomes column 0.
spc_new = pd.concat([spc, coords], axis=1)
spc_new.sort_values(0, ascending=False)

In [None]:
spc_new[['latitude', 'longitude', 0]].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()

In [None]:
data_copy = shootings_final.copy()
data_copy['count'] = 1
base_map = generateBaseMap()
HeatMap(data=data_copy[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=13, max_zoom=12).add_to(base_map)
base_map

In [None]:
base_map = generateBaseMap()
HeatMap(data=spc_new[['latitude', 'longitude', 0]].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=13, max_zoom=12).add_to(base_map)
base_map

In [None]:
fips.FIPS.astype(int)
fips.dtypes

In [None]:
fips = fips[fips['State']!='AS']
fips

In [None]:
# Move FIPS out of the index into a regular column. Rebinding instead of
# inplace=True, plus the guard, makes the cell safe to run more than once.
if 'FIPS' in spc_new.index.names:
    spc_new = spc_new.reset_index(level='FIPS')
spc_new.head(10)

In [None]:
import os
import json
import geopandas as gpd

In [None]:
source_path = 'gz_2010_us_050_00_500k.json'
# Read inside a `with` block so the file handle is closed; json.load(open(...))
# leaves it open until garbage collection.
with open(source_path, encoding='ISO-8859-1') as source_file:
    cur_json = json.load(source_file)
path,ext = os.path.splitext(source_path)
new_path =path+"_new"+ext
# Re-save the same content as UTF-8 next to the original file.
with open(new_path,"w", encoding='utf-8') as jsonfile:
    json.dump(cur_json,jsonfile,ensure_ascii=False)
us_county = gpd.read_file(new_path, driver='GeoJSON')

In [None]:
m3 = folium.Map(location=[39, -96], zoom_start=4)
high_res_county_geo = os.path.relpath('gz_2010_us_050_00_500k_new.json') # from http://eric.clst.org/Stuff/USGeoJSON


In [None]:
#bins = df['Poverty'].quantile([0, 0.25, 0.5, 0.75, 1])
# Map.choropleth() has been removed from folium; the folium.Choropleth layer
# takes the same arguments. key_on is written with the documented leading
# 'feature.' prefix.
# NOTE(review): confirm that the GeoJSON GEO_ID values use the same format as
# the FIPS column, otherwise no county will be matched.
folium.Choropleth(
    geo_data=high_res_county_geo,
    name='choropleth',
    data=spc_new,
    columns=['FIPS',0],
    key_on='feature.properties.GEO_ID',
    fill_color='YlGnBu',
    fill_opacity=0.5,
    line_opacity=1,
    #legend_name='Poverty(%)',
).add_to(m3)
folium.LayerControl().add_to(m3)
m3

In [None]:
# NOTE(review): this rebinds `df`, which other cells use as the FIPS-merged
# per-shooting frame; consider a separate name.
county_id_groups = shootings_final.groupby('CountyId')
df = county_id_groups.sum(numeric_only=True)
# TotalPop comes from the county-level census merge, so it repeats on every
# shooting row of a county. Summing it multiplies the population by the number
# of shootings and turns count/TotalPop into 1/TotalPop, so take it once.
df['TotalPop'] = county_id_groups['TotalPop'].first()
df['spc'] = df['count']/df['TotalPop']
df = df.reset_index()
df.sort_values('spc',ascending=False)

In [None]:
df.sort_values('spc')

In [None]:
df_copy[['latitude', 'longitude', 'countperpop']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()

In [None]:
data_copy[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()

In [None]:
gender = shootings_final.groupby('gender').count()

In [None]:
#Shootings by Gender
# Aggregate only the 'count' column, once, instead of summing every column of
# the frame twice.
gender_counts = shootings_final.groupby('gender')['count'].sum().sort_values(ascending=False)
labels = gender_counts.index
values = gender_counts
colors = ['darkgreen','salmon','b','b','b']
fig, ax = plt.subplots(1,1,figsize=(12,8))
ax.barh(labels,values,color=colors,linewidth=0)
ax.set_title('Total Number of people killed, by Gender', fontsize=18)
ax.set_ylabel('Gender')
ax.set_xlabel('Count')
fig.savefig('genderstats.png')

In [None]:
#shooting by race percentage
shootings_final.race.value_counts(normalize=True).round(2)

In [None]:
shootings_final.armed.value_counts(normalize=True).round(3)

In [None]:
# Share of each 'armed' category, computed once.
armed_share = shootings_final.armed.value_counts(normalize=True).round(3)
# Keep the four most common categories and fold the remainder into 'other'.
sizes = armed_share.head(4).copy()
sizes['other'] = 1 - sizes.sum()
labels = sizes.index

fig1, ax1 = plt.subplots(figsize=(7,7))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title('Weapon held by Citizen',fontdict = {
        'color':  'Black',
        'weight': 'normal',
        'size': 20,
        })
plt.savefig('weaponheld.png')

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
#Other Weapons
# Every 'armed' entry except the four most common categories.
common_categories = {'gun', 'knife', 'unarmed', 'not reported'}
others = [weapon for weapon in shootings_final.armed if weapon not in common_categories]
others

In [None]:
fig, (ax2) = plt.subplots(1,1,figsize=[17, 10])
wordcloud2 = WordCloud(width=1000,height=400).generate(" ".join(others))
ax2.imshow(wordcloud2,interpolation='bilinear')
ax2.axis('off')
ax2.set_title('Most Used Arms',fontsize=20)

In [None]:
shootings_final['date'] = pd.to_datetime(shootings_final['date'])

In [None]:
shootings_final

In [None]:
shootings_final['year'] = shootings_final.date.dt.year
shootings_final['month'] = shootings_final.date.dt.month
# 'year' and 'month' were just added to shootings_final, so that is the frame
# to group; only 'count' is summed.
dates = shootings_final.groupby(['year','month'])['count'].sum().reset_index()
# Average the monthly totals across years; the result is indexed by month.
datesavg = dates.groupby('month').mean()
datesavg.index

In [None]:
x = datesavg.index
y = datesavg['count']
fig, ax = plt.subplots(1,1,figsize=(12,8))
plt.plot(x,y)
plt.title("Average Shootings per Month",size=20)
plt.savefig('averageshootingspermonth.png')

In [None]:
from matplotlib.dates import DateFormatter
date_form = DateFormatter("%m-%d")

In [None]:
# Number of shootings on each date. Computed here from shootings_final because
# `date_df` is only created by a later cell and is undefined at this point on a
# fresh kernel. The unused copy-pasted labels/values/colors were dropped.
daily_counts = shootings_final.groupby('date')['count'].sum()
fig, ax = plt.subplots(1,1,figsize=(16,8))
ax.plot(daily_counts)

plt.show()

In [None]:
# An explicit MonthEnd offset avoids the deprecated lowercase 'm' alias and
# behaves the same across pandas versions.
span = pd.date_range(shootings_final.date.min(),shootings_final.date.max(),freq=pd.offsets.MonthEnd())
span


# Placeholder values: seeded so the figure is identical on every run.
rng = np.random.default_rng(0)
blah = pd.DataFrame(data = rng.standard_normal(len(span)), index=span)

fig, ax = plt.subplots()

ax.xaxis.set_major_formatter(DateFormatter('%m-%Y'))
ax.bar(blah.index, blah[0], width=25, align='center')

In [None]:
date_df = df.copy()
date_df.groupby('date').sum()

In [None]:
shootings_final[shootings_final['date'] <= '2015-10-10']

In [None]:
shootings_final.groupby('body_camera').count()

In [None]:
labels = list(shootings_final.groupby('body_camera').count().index)
labels[0] = 'No Camera'
labels[1] = 'Camera'
values = shootings_final.groupby('body_camera').count()['count']
colors = ['tab:green','tab:purple','b','b','b']
fig, ax = plt.subplots(1,1,figsize=(12,8))
ax.bar(labels,values,color=colors)
ax.set_title('Body Camera', fontsize=18)
plt.xticks()
plt.ylabel('Count')
plt.savefig('bodycamera.png')

In [None]:
shootings_final['age'].hist(bins=15,figsize=(12,8),color='gray',grid=False)
plt.title('Age of Victim',size=15)
plt.savefig('agehistogram.png')
#shootings_final.hist('age',bins=15,figsize=(12,8),grid=False,color='gray')



In [None]:
Camera_df = shootings_final[shootings_final['body_camera']==True]
Camera_df

In [None]:
noCamera_df = shootings_final[shootings_final['body_camera']==False]
noCamera_df[noCamera_df['age']==6]

In [None]:
labels = shootings_final.groupby('race').count()
labels

In [None]:
df.groupby(['armed','flee']).value_counts()

In [None]:
# Series has no method m(), so the original call raised AttributeError.
# NOTE(review): mean() is assumed to be the intended statistic; confirm.
df['age'].mean()

In [None]:
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [None]:
spc_new

In [None]:
import plotly.figure_factory as ff

df_sample = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/laucnty16.csv')
# Zero-pad the state code to 2 characters and the county code to 3, then join
# them into a single FIPS string.
state_codes = df_sample['State FIPS Code'].astype(str).str.zfill(2)
county_codes = df_sample['County FIPS Code'].astype(str).str.zfill(3)
df_sample['State FIPS Code'] = state_codes
df_sample['County FIPS Code'] = county_codes
df_sample['FIPS'] = state_codes + county_codes
# Left join keeps every county; counties without a rate get 0.
df_sample = df_sample.merge(spc_new, on='FIPS', how='left').fillna(0)

In [None]:
df_sample

In [None]:
colorscale = ["#f7fbff","#ebf3fb","#deebf7","#d2e3f3","#c6dbef","#b3d2e9","#9ecae1",
              "#85bcdb","#6baed6","#57a0ce","#4292c6","#3082be","#2171b5","#1361a9",
              "#08519c","#0b4083","#08306b"]
endpts = list(np.linspace(0,20,(len(colorscale) - 1)))
fips = df_sample['FIPS'].tolist()
values = df_sample[0].tolist()

fig = ff.create_choropleth(
    fips=fips, values=values,
    binning_endpoints=endpts,
    colorscale=colorscale,
    show_state_data=False,
    show_hover=True, centroid_marker={'opacity': 0},
    asp=2.9, title='Fatal Police Shootings by population (2015-Present)',
    legend_title='per 100,000 people'
)

fig.layout.template = None
fig.show()

In [None]:
totalkill = df.groupby('FIPS').count()['count'].reset_index()

In [None]:
df_sample = df_sample.merge(totalkill,on='FIPS',how='left').fillna(0)
df_sample

# Total Killings

In [None]:
colorscale = ["#f7fbff","#ebf3fb","#deebf7","#d2e3f3","#c6dbef","#b3d2e9","#9ecae1",
              "#85bcdb","#6baed6","#57a0ce","#4292c6","#3082be","#2171b5","#1361a9",
              "#08519c","#0b4083","#08306b"]
endpts = list(np.linspace(0,20,(len(colorscale) - 1)))
fips = df_sample['FIPS'].tolist()
values = df_sample['count'].tolist()

fig = ff.create_choropleth(
    fips=fips, values=values,
    binning_endpoints=endpts,
    colorscale=colorscale,
    show_state_data=False,
    show_hover=True, centroid_marker={'opacity': 0},
    asp=2.9, title='Fatal Police Shootings (2015-Present)',
    legend_title='Total'
)

fig.layout.template = None
fig.show()

In [None]:
df_sample['count'].describe()

# Test Rural and Urban rates

In [None]:
df_sample[0].describe()

In [None]:
df_sample

In [None]:
colorscale = ["#f7fbff","#ebf3fb","#deebf7","#d2e3f3","#c6dbef","#b3d2e9","#9ecae1",
              "#85bcdb","#6baed6","#57a0ce","#4292c6","#3082be","#2171b5","#1361a9",
              "#08519c","#0b4083","#08306b"]
# np.percentile expects q in [0, 100]; linspace(0, 1, ...) only sampled the
# 0th to 1st percentile, so every endpoint sat at the bottom of the range.
endpts = list(np.percentile(df_sample[0].tolist(), np.linspace(0,100,(len(colorscale) - 1))))

In [None]:
fips = df_sample['FIPS'].tolist()

In [None]:
values = df_sample[0].tolist()

In [None]:
data.groupby(['flee','armed']).count().sort_values('id',ascending=False)['id'].value_counts(normalize=True)

# Fleeing and Not Armed

In [None]:
193/5716

In [None]:
notflee = data[data['flee']=='Not fleeing']
notflee

In [None]:
notflee[notflee['armed']=='unarmed'].groupby('state').count().sort_values('id')

In [None]:
193/3567

In [None]:
df[df['county']=='Kiowa County']

In [None]:
spc.sort_values(ascending=False)

In [None]:
countycounts = df.groupby(['state','county']).sum()['count']
countycounts

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
poverty=df.groupby(['state','county'])['Poverty'].mean()
poverty

In [None]:
# numeric_only=True: since pandas 2.0, a grouped mean over text columns raises
# instead of silently skipping them.
df.groupby(['FIPS']).mean(numeric_only=True)

In [None]:
# plt.scatter() needs x and y; called bare it raises TypeError.
# NOTE(review): assumed intent is the county poverty rate against the county
# shooting count, both computed just above with the same group keys; confirm.
plt.scatter(poverty, countycounts)
plt.xlabel('Poverty')
plt.ylabel('count')

In [None]:
%matplotlib inline

from math import ceil
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
font = {'weight' : 'bold',
        'size'   : 18}


In [None]:
counts = df.groupby(['state','county']).sum()['count']
counts

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
pop = df.groupby(['state','county'])['TotalPop'].mean()
pop.shape

In [None]:
new = pd.concat([counts, pop],axis=1)
new['reg'] = 1
new

In [None]:
new['count'].hist(bins=100,figsize=(14,8))

In [None]:
plt.scatter(new.TotalPop,new['count'],)

In [None]:
# The formula interface adds an intercept on its own. `reg` is a constant
# column of ones, so including it duplicated the intercept and made the design
# matrix singular; it is left out of the formula.
# NOTE(review): no family is passed, so statsmodels' default is used; consider
# whether a count family (e.g. Poisson) suits this response better.
shootings_model = smf.glm(
    data = new,
    formula='count ~ TotalPop')
mod = shootings_model.fit()

mod.summary()



In [None]:
# Select the columns before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
more_stats = df.groupby(['state','county'])[['IncomePerCap','Poverty','ChildPoverty','Unemployment']].mean()
more_stats
# Drop any copies left by a previous run first, so re-running the cell does
# not produce duplicate columns.
new = pd.concat([new.drop(columns=more_stats.columns, errors='ignore'), more_stats],axis=1)
new

In [None]:
# The formula interface adds an intercept on its own. `reg` is a constant
# column of ones, so including it duplicated the intercept and made the design
# matrix singular; it is left out of the formula.
shootings_model = smf.glm(
    data = new,
    formula='count ~ IncomePerCap + Poverty + ChildPoverty + Unemployment')
mod = shootings_model.fit()

mod.summary()


# Sources

Population (2017) - data.census.gov, Kaggle

Shootings - The Washington Post, Kaggle

In [None]:
noflee = shootings_final[shootings_final['flee']=='Not fleeing']

In [None]:
womannoflee = noflee[noflee['gender']=='F']

In [None]:
womannoflee[womannoflee['armed']=='unarmed']

In [None]:
# numeric_only=True: since pandas 2.0, a grouped mean over text columns raises
# instead of silently skipping them.
shootings_final.groupby('CountyId').mean(numeric_only=True)

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
pop = df.groupby(['state_name','county','FIPS'])['TotalPop'].mean()

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
drive = df.groupby(['state_name','county','FIPS'])['Drive'].mean()

In [None]:
# Select the column before averaging: mean() over the whole frame raises on
# text columns in pandas >= 2.0.
pov = df.groupby(['state_name','county','FIPS'])['Poverty'].mean()

In [None]:
# The cells below read spc_new['TotalPop'], ['IncomePerCap'] and ['Poverty'],
# but only `pop` was being attached, and `pop` is keyed by
# (state_name, county, FIPS) while spc_new carries FIPS as a column once its
# index has been reset. Joining county-level means on the shared index levels
# supplies every column the later cells use.
# NOTE(review): assumes spc_new is indexed by state_name and county; confirm.
county_stats = shootings_final.groupby(['state_name','county'])[
    ['TotalPop','IncomePerCap','Poverty','Drive']].mean()
# Drop copies left by a previous run so the cell can be re-run.
spc_new = spc_new.drop(columns=county_stats.columns, errors='ignore').join(county_stats)

In [None]:
spc_new.corr()

In [None]:
plt.scatter(spc_new['IncomePerCap'],spc_new[0])

In [None]:
plt.scatter(spc_new['Poverty'],spc_new[0])

In [None]:
spc_new['count'] = 1
spc_new = spc_new.rename({0:'ShootPerCap'},axis=1)
spc_new

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# ShootPerCap is the response, so it must not also be a regressor: with it on
# both sides the fit is trivially perfect and says nothing about the other
# variables. The extra *100000 is dropped because the rate was already scaled
# to "per 100,000" when it was first computed. 'count' is a column of ones and
# serves as the intercept. missing='drop' skips rows with missing values.
ols_columns = spc_new[['ShootPerCap','TotalPop','IncomePerCap','count']]
m_log_reg = sm.OLS(ols_columns['ShootPerCap'],
                   ols_columns[['TotalPop','IncomePerCap','count']],
                   missing='drop')
m_log_reg = m_log_reg.fit()
m_log_reg.summary()

In [None]:
# ShootPerCap is the response, so it must not also be a regressor: with it on
# both sides the fit is trivially perfect and says nothing about the other
# variables. The extra *100000 is dropped because the rate was already scaled
# to "per 100,000" when it was first computed. 'count' is a column of ones and
# serves as the intercept. missing='drop' skips rows with missing values.
regression_columns = spc_new[['ShootPerCap','TotalPop','IncomePerCap','count']]
regressor = sm.OLS(regression_columns['ShootPerCap'],
                   regression_columns[['TotalPop','IncomePerCap','count']],
                   missing='drop')
regressor = regressor.fit()
regressor.summary()


In [None]:
X = spc_new[['ShootPerCap','TotalPop','IncomePerCap','count']]
xx = np.linspace(1000, 5500, 100)

In [None]:
fig = sm.graphics.plot_partregress_grid(regressor)
fig.tight_layout(pad=1.0)

In [None]:
fig = sm.graphics.plot_ccpr(regressor, "IncomePerCap")
fig.tight_layout(pad=0.5)

In [None]:
from statsmodels.formula.api import ols


In [None]:
crime_model = ols("ShootPerCap ~ TotalPop + IncomePerCap", data=spc_new).fit()
print(crime_model.summary())



In [None]:
fig = sm.graphics.plot_partregress_grid(crime_model)
fig.tight_layout(pad=2.0)