<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
import json
import re

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wordcloud
from pandas.io.json import json_normalize

# Select Altair's 'notebook' renderer so charts display in the notebook output.
alt.renderers.enable('notebook')
# Show matplotlib figures inline, below the cell that creates them.
%matplotlib inline

**Load the base crime data "crimedata_new.csv"**

In [None]:

# Load the base crime table; the first row is the header and "?" cells are parsed as NaN.
cc = pd.read_csv('crimedata_new.csv', header=0, na_values="?", encoding='latin-1')

In [None]:
# Preview the first rows of the loaded table.
cc.head()

In [None]:
# Summary statistics produced by describe() with its default settings.
cc.describe()

**Load the numeric codes (FIPS) for each state from "us-state-ansi-fips.csv" and merge them with the base crime dataframe 'cc'. FIPS state codes are retrieved from https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code**

In [None]:
# State lookup table; later cells read its 'stusps', 'st' and 'stname' columns.
stfips = pd.read_csv('us-state-ansi-fips.csv', header =0)
stfips.head()

In [None]:
# Combined rate: violent plus non-violent; the sum is NaN when either component is missing.
cc['Total Crimes'] = cc['ViolentCrimesPerPop']+cc['nonViolPerPop']

In [None]:
# Attach the lookup columns by matching cc.state against stfips.stusps.
# The default inner join keeps only rows whose state appears in stfips.
# NOTE(review): cc is overwritten, so re-running this cell without reloading
# cc would merge the lookup columns a second time.
cc = cc.merge(stfips, left_on="state", right_on="stusps")
cc.head()

**Remove the word 'city' appended to the community name**

In [None]:
# Strip only a trailing 'city' suffix: the pattern is anchored with '$' so that
# 'city' occurring inside a community name is left intact. The vectorised
# .str accessor also leaves missing names as NaN instead of raising.
cc['communityname'] = cc['communityname'].str.replace(r'city$', '', regex=True)
cc.head()

**Load latitude and longitude at city level from the "cities.json" file**

In [None]:
# Parse the JSON file and flatten its records into a DataFrame.
with open('cities.json') as city_file:
    dict_city = json.load(city_file)

cities_lat_lon = json_normalize(dict_city)
cities_lat_lon.head()

**Merge the city level lat and lon with the base crime dataframe cc and load it into a new dataframe "crime_latlon_city"**

In [None]:
# Inner join: keep communities whose (name, state) pair also occurs in the
# city coordinates table.
crime_latlon_city = cc.merge(
    cities_lat_lon,
    left_on=["communityname", "state"],
    right_on=["city", "state"],
)
crime_latlon_city.head()

**What is the highest correlation between the violent and non-violent crime rates and each of their categories?**

In [None]:
# Correlation of each overall crime rate with its own component categories.
cc_viol = cc[['murdPerPop','rapesPerPop','robbbPerPop','assaultPerPop','ViolentCrimesPerPop']]
cc_nonviol = cc[['burglPerPop','larcPerPop','autoTheftPerPop','arsonsPerPop','nonViolPerPop']]

cc_viol_corr = cc_viol.corr()
cc_nonviol_corr = cc_nonviol.corr()

# Both panels share the same annotation size, colour range and colour map.
heatmap_options = dict(annot=True, annot_kws={"size": 7}, vmin=-1.0, vmax=1.0, cmap="Spectral")

plt.figure(figsize=(18, 5))

# Left panel: non-violent crime
plt.subplot(1, 2, 1)
plt.title("Correlation b/w ovreall non violent crime rate with its categories")
sns.heatmap(cc_nonviol_corr, **heatmap_options)

# Right panel: violent crime
plt.subplot(1, 2, 2)
plt.title("Correlation b/w ovreall violent crime rate with its categories")
sns.heatmap(cc_viol_corr, **heatmap_options)

Chart to show the pairwise correlation of the 5 attributes most highly correlated with non-violent crime per 100K population

In [None]:
# Columns that are components of (or derived from) the two overall crime rates;
# they are excluded before correlating.
crime_component_cols = ['burglPerPop', 'larcPerPop', 'autoTheftPerPop', 'arsonsPerPop',
                        'murdPerPop', 'rapesPerPop', 'robbbPerPop', 'assaultPerPop',
                        'Total Crimes']
cc_crimecat_removed = cc.drop(crime_component_cols, axis=1)
corr_matrix = cc_crimecat_removed.corr()

# Order the rows by the absolute correlation with the non-violent crime rate.
nonviol_rank = corr_matrix['nonViolPerPop'].abs().sort_values(ascending=False)
corr_matrix = corr_matrix.reindex(nonviol_rank.index)

corr_matrix['nonViolPerPop'].head(5)

In [None]:
# Pairwise plots for the non-violent crime rate and its strongest correlates.

# Attributes of interest: the first five row labels of corr_matrix, taken in its
# current row order.
cols = corr_matrix['nonViolPerPop'].head(5).index

pairplot_options = dict(
    kind="reg",                 # off-diagonal panels: scatter plus regression fit
    diag_kind="kde",            # diagonal panels: kernel density estimate
    diag_kws=dict(shade=True),  # passed through to the diagonal (kde) plots
    # NOTE(review): newer seaborn releases name this argument `height`; confirm
    # against the installed version.
    size=1.8,
    aspect=1.2,
)
pp = sns.pairplot(cc[cols], **pairplot_options)

# Leave room above the grid for the figure-level title.
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
fig.suptitle('Top 5 high co-related attributes for non violent crime Pairwise Plots',
             fontsize=14, fontweight='bold')

Results: what have you found? how do you visualize your results?

-- ViolentCrimesPerPop: total number of violent crimes per 100K population (numeric - decimal) GOAL attribute (to be predicted) 
 -- PctFam2Par: percentage of families (with kids) that are headed by two parents (numeric - decimal) 
-- PctKids2Par: percentage of kids in family housing with two parents (numeric - decimal) 
-- PctTeen2Par: percent of kids age 12-17 in two parent households (numeric - decimal) 
-- PctYoungKids2Par: percent of kids 4 and under in two parent households (numeric - decimal) 
 
 
 1. Non-violent and violent crime rates are strongly positively correlated, i.e. communities with a high violent crime rate tend to have a high non-violent crime rate too. 
 2. Communities with a high percentage of two-parent families (with kids or teens) show a strong negative correlation with crime. 
 
 We visualized the result with a scatter-plot matrix and added a regression line to each panel, which makes the correlations quick to interpret. 
 
Insights: what kinds of insights did you get from the visualizations?

From the visualization, we quickly saw the negative correlation between two-parent family structure and crime. 

# Pairwise comparison for violent crime (same approach as above)

In [None]:
# Pairwise plots for the violent crime rate and its strongest correlates.

# Re-order the correlation matrix rows by |correlation with ViolentCrimesPerPop|.
violent_rank = corr_matrix['ViolentCrimesPerPop'].abs().sort_values(ascending=False)
corr_matrix = corr_matrix.reindex(violent_rank.index)

# Attributes of interest: the five top-ranked row labels.
cols = corr_matrix['ViolentCrimesPerPop'].head(5).index

pairplot_options = dict(
    kind="reg",                 # off-diagonal panels: scatter plus regression fit
    diag_kind="kde",            # diagonal panels: kernel density estimate
    diag_kws=dict(shade=True),  # passed through to the diagonal (kde) plots
    # NOTE(review): newer seaborn releases name this argument `height`; confirm
    # against the installed version.
    size=1.8,
    aspect=1.2,
)
pp = sns.pairplot(cc[cols], **pairplot_options)

# Leave room above the grid for the figure-level title.
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
fig.suptitle('Top 5 high co-related attributes for Violent Crime Pairwise Plots',
             fontsize=14, fontweight='bold')

Results: what have you found? how do you visualize your results?

-- ViolentCrimesPerPop: total number of violent crimes per 100K population (numeric - decimal) GOAL attribute (to be predicted) 
 -- PctFam2Par: percentage of families (with kids) that are headed by two parents (numeric - decimal) 
-- PctKids2Par: percentage of kids in family housing with two parents (numeric - decimal) 
-- PctTeen2Par: percent of kids age 12-17 in two parent households (numeric - decimal) 
-- PctYoungKids2Par: percent of kids 4 and under in two parent households (numeric - decimal) 

 
 
 1. Non-violent and violent crime rates are strongly positively correlated.
 2. Communities with a high percentage of two-parent families (with kids or teens) show a strong negative correlation with violent crime. 
 3. Communities with a high percentage of kids born to never-married parents show a strong positive correlation with violent crime.
 
 We visualized the result with a scatter-plot matrix and added a regression line to each panel, which makes the correlations quick to interpret. 
 
Insights: what kinds of insights did you get from the visualizations?

From the visualization, we quickly saw the negative correlation between two-parent family structure and violent crime. The percentage of kids born to never-married parents is also highly correlated with violent crime.  

**Finding the top contributing factors for non-violent and violent crime. To do this, remove the non-violent crime categories (burglPerPop, larcPerPop, autoTheftPerPop, arsonsPerPop) and the violent crime categories (murdPerPop, rapesPerPop, robbbPerPop, assaultPerPop): they are the individual components of the non-violent and violent crime rates, and we want to find the other factors contributing to crime.**

In [None]:
# Exclude every column that is a component of, or derived from, the two target
# rates before correlating. 'Total Crimes' is the sum of both targets, so
# leaving it in would rank it among the "contributing factors".
derived_crime_cols = ['burglPerPop', 'larcPerPop', 'autoTheftPerPop', 'arsonsPerPop',
                      'murdPerPop', 'rapesPerPop', 'robbbPerPop', 'assaultPerPop',
                      'Total Crimes']
cc_crimecat_removed = cc.drop(derived_crime_cols, axis=1)
corr_matrix = cc_crimecat_removed.corr()

In [None]:
# Bar charts of the ten attributes most strongly correlated (by absolute value)
# with the violent and with the non-violent crime rate per 100K population.


def _top_correlated(corr, target, n=10):
    """Return the n entries of corr[target] with the largest |r|, strongest first.

    The target's own row is dropped by label rather than by position, and
    `corr` itself is left unmodified (no helper column is added to it).
    """
    others = corr[target].drop(target)
    strongest = others.abs().sort_values(ascending=False).index[:n]
    return others.loc[strongest]


top_10_corr_violent_crime = _top_correlated(corr_matrix, 'ViolentCrimesPerPop')
top_10_corr_nonviolent_crime = _top_correlated(corr_matrix, 'nonViolPerPop')

# reset_index() exposes the attribute names as a column called 'index'.
df_violent = top_10_corr_violent_crime.to_frame().reset_index()
df_nonviolent = top_10_corr_nonviolent_crime.to_frame().reset_index()

# The sort aggregate must return the value itself: each attribute has a single
# row, so 'distinct' (a count of distinct values) is 1 for every bar and gives
# no ordering, whereas 'max' orders the bars by their coefficient.
top_10_corr_violent_crime_chart = alt.Chart(df_violent).mark_bar().encode(
    x=alt.X('ViolentCrimesPerPop:Q',
            title='Violent crime per 100K population correlation coefficient'),
    y=alt.Y('index:N',
            title='Attributes',
            sort=alt.SortField(field="ViolentCrimesPerPop", op="max", order='ascending'))
).properties(
    title='Top 10 correlated attributes with Violent Crime Per 100K population'
)

top_10_corr_nonviolent_crime_chart = alt.Chart(df_nonviolent).mark_bar().encode(
    x=alt.X('nonViolPerPop:Q',
            title='Non violent crime per 100K population correlation coefficient'),
    y=alt.Y('index:N',
            title='Attributes',
            sort=alt.SortField(field="nonViolPerPop", op="max", order='ascending'))
).properties(
    title='Top 10 correlated attributes with Non violent crime Per 100K population'
)

# Stack the two charts vertically.
top_10_corr_violent_crime_chart & top_10_corr_nonviolent_crime_chart



Results: what have you found? how do you visualize your results?

-- ViolentCrimesPerPop: total number of violent crimes per 100K population (numeric - decimal) GOAL attribute (to be predicted) 
 -- PctFam2Par: percentage of families (with kids) that are headed by two parents (numeric - decimal) 
-- PctKids2Par: percentage of kids in family housing with two parents (numeric - decimal) 
-- PctTeen2Par: percent of kids age 12-17 in two parent households (numeric - decimal) 
-- PctYoungKids2Par: percent of kids 4 and under in two parent households (numeric - decimal) 
-- PctKidsBornNeverMar: percentage of kids born to never married parents
-- racepctblack: percentage of population that is african american (numeric - decimal) 
-- racePctWhite: percentage of population that is caucasian (numeric - decimal) 
-- racePctAsian: percentage of population that is of asian heritage (numeric - decimal) 
-- racePctHisp: percentage of population that is of hispanic heritage (numeric - decimal) 
-- pctWPubAsst: percentage of households with public assistance income in 1989 (numeric - decimal) 
-- pctWInvInc: percentage of households with investment / rent income in 1989 (numeric - decimal) 
-- MalePctDivorce: percentage of males who are divorced (numeric - decimal) 
-- MalePctNevMarr: percentage of males who have never married (numeric - decimal) 
-- FemalePctDiv: percentage of females who are divorced (numeric - decimal) 
-- TotalPctDiv: percentage of population who are divorced (numeric - decimal) 
-- PctPopUnderPov: percentage of people under the poverty level (numeric - decimal) 

 Violent crime tornado chart 
 
 1. The violent crime rate is strongly positively correlated with the non-violent crime rate, the percentage of kids born to never-married parents, 
 the percentage of black population and the percentage of households with public assistance income.
 
 2. Communities with a high percentage of two-parent families (with kids or teens) show a strong negative correlation
 with violent crime. 
 3. Communities with a high percentage of white population and of households with investment income show a strong negative correlation with violent crime.
 
 Ranking the attributes by the absolute value of their correlation in a tornado chart makes this information quick to read.  
 
Insights: what kinds of insights did you get from the visualizations?

From the visualization, we got a quick, ordered comparison of all the factors associated with violent and non-violent crime.  

In [None]:
# Most correlating factors for non violent crimes:
# re-order the rows by |correlation with nonViolPerPop| and list the first ten.
nonviol_strength = corr_matrix['nonViolPerPop'].abs().sort_values(ascending=False)
corr_matrix = corr_matrix.reindex(nonviol_strength.index)

corr_matrix['nonViolPerPop'].head(10)

In [None]:
# Most correlating factors for violent crimes: rank by absolute correlation so
# that strong negative correlations are not pushed to the bottom, and list only
# the first ten (same form as the non-violent listing).
violent_corr = corr_matrix['ViolentCrimesPerPop']
violent_corr.reindex(violent_corr.abs().sort_values(ascending=False).index).head(10)

**Choropleth map for non violent and violent crime rates at state level**

In [None]:
# Aggregated Non-Violent & Violent Crimes Per 100k Population at each State

# US boundaries TopoJSON from the vega-datasets collection, referenced by URL so
# the cell does not depend on a `data` object that this notebook never imports.
US_10M_URL = 'https://vega.github.io/vega-datasets/data/us-10m.json'
counties = alt.topo_feature(US_10M_URL, 'counties')
states = alt.topo_feature(US_10M_URL, 'states')

variable_list = ['nonViolPerPop', 'ViolentCrimesPerPop']

# The lookup joins one data row to each state shape, so collapse cc to a single
# row per state code by averaging the rates within each state.
state_crime_rates = cc.groupby('st', as_index=False)[variable_list].mean()

# One map per variable (stacked as rows), each with its own colour scale.
chart = alt.Chart(states).mark_geoshape().project(
    type='albersUsa'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(state_crime_rates, 'st', variable_list)
).encode(
    alt.Color(alt.repeat('row'), type='quantitative')
).properties(
    title='Non-Violent & Violent Crimes Per 100k Population at each State',
    width=400,
    height=400
).repeat(
    row=variable_list
).resolve_scale(
    color='independent'
)

chart

**Dot maps for the top 20 cities/communities with the highest non-violent and violent crime rates**

**Explanation:** For violent crimes, cities such as Chester, Atlanta, Newark, Alexandria and Miami report the highest violent crime rates. For non-violent crimes, cities such as East Longmeadow, Atlantic, Myrtle Beach, Palatka and Miami Beach report the highest non-violent crime rates.

In terms of total crime (violent plus non-violent), the top reported cities are Atlantic, East Longmeadow, Palatka, Dania and Miami.


In [None]:
# The 20 communities with the highest value of each crime measure
# (sorted descending; sort_values places missing values last by default).
top_20_cities_violent = cc.sort_values(by='ViolentCrimesPerPop', ascending=False).iloc[:20]
top_20_cities_Non_violent = cc.sort_values(by='nonViolPerPop', ascending=False).iloc[:20]
top_20_cities_total_crimes = cc.sort_values(by='Total Crimes', ascending=False).iloc[:20]


In [None]:
# Dot map of the top 20 communities by violent crime rate.

# Base layer: US state outlines.
background = (
    alt.Chart(states)
    .mark_geoshape(fill='white', stroke='black', strokeWidth=0.5)
    .properties(title='Top 20 Cities with highest Violent Crime', width=800, height=500)
    .project('albersUsa')
)

# One dot per community: position from its coordinates, dot size from the rate.
violentChart = (
    alt.Chart(top_20_cities_violent)
    .mark_circle(size=4)
    .encode(
        longitude='City_Long:Q',
        latitude='City_Lat:Q',
        size='ViolentCrimesPerPop:Q',
        color='CityNew:N',
        tooltip='CityNew:N',
    )
    .project(type='albersUsa')
    .properties(width=800, height=500)
)

background + violentChart

In [None]:
# Dot map of the top 20 communities by non-violent crime rate.

# Base layer: US state outlines.
background = (
    alt.Chart(states)
    .mark_geoshape(fill='white', stroke='black', strokeWidth=0.5)
    .properties(title='Top 20 Cities with highest Non Violent Crimes', width=800, height=500)
    .project('albersUsa')
)

# One dot per community: position from its coordinates, dot size from the rate.
violentChart = (
    alt.Chart(top_20_cities_Non_violent)
    .mark_circle(size=4)
    .encode(
        longitude='City_Long:Q',
        latitude='City_Lat:Q',
        size='nonViolPerPop:Q',
        color='CityNew:N',
        tooltip='CityNew:N',
    )
    .project(type='albersUsa')
    .properties(width=800, height=500)
)

background + violentChart

**Dot map for the top 20 cities/communities with the highest total (violent plus non-violent) crime rates**

In [None]:
# Dot map of the top 20 communities by total (violent + non-violent) crime.

# Base layer: US state outlines.
background = (
    alt.Chart(states)
    .mark_geoshape(fill='white', stroke='black', strokeWidth=0.5)
    .properties(
        title='Top 20 Cities with highest total no. of (violent and non-voilent) Crimes ',
        width=800,
        height=500,
    )
    .project('albersUsa')
)

# One dot per community: position from its coordinates, dot size from the total.
violentChart = (
    alt.Chart(top_20_cities_total_crimes)
    .mark_circle(size=4)
    .encode(
        longitude='City_Long:Q',
        latitude='City_Lat:Q',
        size='Total Crimes:Q',
        color='CityNew:N',
        tooltip='CityNew:N',
    )
    .project(type='albersUsa')
    .properties(width=800, height=500)
)

background + violentChart

**One-dot map for all communities with non-violent and violent crime rates**

In [None]:
#One Dot map for all communities with total no. of crimes

# Size and colour encode the combined 'Total Crimes' column, which is what the
# title announces. altair is already imported as `alt` in the notebook's
# import cell, so it is not imported again here.
alt.Chart(cc).mark_circle(size=4).encode(
    longitude='City_Long:Q',
    latitude='City_Lat:Q',
    size='Total Crimes:Q',
    color='Total Crimes:Q',
    tooltip='CityNew:N'
).project(
    type='albersUsa'
).properties(
    title='all communities with  total no. of (violent and non-voilent) crimes ',
    width=800,
    height=500)

In [None]:
# Per-state mean of each crime rate; these frames feed the state bar charts and
# the word clouds. States whose mean is NaN are removed.
crimedata_state_violent = (
    cc.groupby('stname')['ViolentCrimesPerPop']
    .mean()
    .reset_index()
    .dropna()
)
crimedata_state_nonviolent = (
    cc.groupby('stname')['nonViolPerPop']
    .mean()
    .reset_index()
    .dropna()
)



In [None]:
# Map each state name to its average rate truncated to an int; these dicts are
# the frequency tables for the word clouds. Built by zipping the two columns
# instead of iterating over the frame row by row.
state_avg_violent_crime = {
    state: int(rate)
    for state, rate in zip(crimedata_state_violent['stname'],
                           crimedata_state_violent['ViolentCrimesPerPop'])
}

state_avg_nonviolent_crime = {
    state: int(rate)
    for state, rate in zip(crimedata_state_nonviolent['stname'],
                           crimedata_state_nonviolent['nonViolPerPop'])
}



In [None]:
# Bar chart: one bar per state, height = average violent crime rate.
bars = alt.Chart(crimedata_state_violent).mark_bar().encode(
    x=alt.X('stname', title='States'),
    y=alt.Y('ViolentCrimesPerPop', title='Average violent crime (per 100K population)')
).properties(
    title='State and average violent crime (per 100K population)'
)
bars


In [None]:
# Build one 1000x500 word cloud per crime type from the state -> integer average
# rate dictionaries. `wordcloud` is imported in the notebook's import cell rather
# than here, so all dependencies are declared in one place.
wc_violent = wordcloud.WordCloud(width=1000, height=500)
wc_violent.generate_from_frequencies(state_avg_violent_crime)

wc_nonviolent = wordcloud.WordCloud(width=1000, height=500)
wc_nonviolent.generate_from_frequencies(state_avg_nonviolent_crime)

In [None]:
# Render the violent-crime word cloud as an image without axis ticks.
cloud_fig, cloud_ax = plt.subplots(figsize=(20, 10))
cloud_ax.imshow(wc_violent, interpolation="bilinear")
cloud_ax.axis("off")
cloud_ax.set_title("Word Cloud for State name based on average Violent Crime(per 100K population)")

Results: what have you found? how do you visualize your results?


Word Cloud  

The word cloud makes it quick to spot the states with high and low average violent crime. 

Insights: what kinds of insights did you get from the visualizations?


In [None]:
# Bar chart: one bar per state, height = average non-violent crime rate.
bars = alt.Chart(crimedata_state_nonviolent).mark_bar().encode(
    x=alt.X('stname', title='States'),
    y=alt.Y('nonViolPerPop', title='Average non violent crime (per 100K population)'),
    order='nonViolPerPop'
).properties(
    title='State and average non violent crime (per 100K population)'
)

bars


In [None]:
# Render the non-violent-crime word cloud as an image without axis ticks.
cloud_fig, cloud_ax = plt.subplots(figsize=(20, 10))
cloud_ax.imshow(wc_nonviolent, interpolation="bilinear")
cloud_ax.axis("off")

cloud_ax.set_title("Word Cloud for State name based on average non violent Crime(per 100K population)")

In [None]:
# State-wise crime category breakdown for the five states with the highest
# average rate; the violent breakdown is charted at the end of this cell.

violent_categories = ['murdPerPop', 'rapesPerPop', 'robbbPerPop', 'assaultPerPop']
nonviolent_categories = ['burglPerPop', 'larcPerPop', 'autoTheftPerPop', 'arsonsPerPop']

cc_viol_cat = cc[violent_categories + ['ViolentCrimesPerPop', 'state']]
cc_nonviol_cat = cc[nonviolent_categories + ['nonViolPerPop', 'state']]

# Per-state means, highest overall rate first.
cc_viol_cat_by_st = (
    cc_viol_cat.groupby('state').mean()
    .sort_values('ViolentCrimesPerPop', ascending=False)
)
cc_nonviol_cat_by_st = (
    cc_nonviol_cat.groupby('state').mean()
    .sort_values('nonViolPerPop', ascending=False)
)

# Keep the first five states (the names say "10", but five rows are taken).
cc_viol_cat_by_st10 = cc_viol_cat_by_st.head(5).reset_index()
cc_nonviol_cat_by_st10 = cc_nonviol_cat_by_st.head(5).reset_index()

# Long format: one row per (state, crime category).
cc_viol_cat_by_st_tidy = cc_viol_cat_by_st10.melt(
    id_vars=['state', 'ViolentCrimesPerPop'],
    value_vars=violent_categories,
    var_name='violCrimeCat',
    value_name='Violent crime',
)
cc_nonviol_cat_by_st_tidy = cc_nonviol_cat_by_st10.melt(
    id_vars=['state', 'nonViolPerPop'],
    value_vars=nonviolent_categories,
    var_name='nonviolCrimeCat',
    value_name='Non violent crime',
)

# Stacked bars: one bar per state, one colour segment per violent crime category.
violent_category_stack_chart = alt.Chart(cc_viol_cat_by_st_tidy).mark_bar().encode(
    alt.X('state:O', axis=alt.Axis(labelAngle=0)),
    alt.Y('Violent crime:Q'),
    color='violCrimeCat',
    order='ViolentCrimesPerPop:Q'
).properties(
    width=400,
    height=350,
    title="Violent crime category stacked chart for top 5 states"
)

violent_category_stack_chart

In [None]:
# Inspect the long-format violent-category table behind the stacked chart.
cc_viol_cat_by_st_tidy

In [None]:


# Stacked bars: one bar per state, one colour segment per non-violent crime category.
non_violent_category_stack_chart = alt.Chart(cc_nonviol_cat_by_st_tidy).mark_bar().encode(
    alt.X('state:O', axis=alt.Axis(labelAngle=0)),
    alt.Y('Non violent crime:Q'),
    order='nonViolPerPop:Q',
    color='nonviolCrimeCat'
).properties(
    width=400,
    height=350,
    title="Non violent crime category stacked chart for top 5 states"
)

non_violent_category_stack_chart



In [None]:
# Parallel-coordinates style chart of highly correlated attributes for five
# states, plus the tidy frames shared with the violent-crime chart.

# NOTE(review): the five states are hard-coded; confirm they are the intended selection.
selected_states = ['DC', 'DE', 'FL', 'GA', 'IA']

# Attribute list shared by both tidy frames (declared once instead of four times).
correlated_attributes = ['PctKidsBornNeverMar', 'racepctblack', 'pctWPubAsst', 'TotalPctDiv',
                         'FemalePctDiv', 'MalePctDivorce', 'PctPopUnderPov', 'PctUnemployed',
                         'PctVacantBoarded', 'PctHousNoPhone', 'PctNotHSGrad', 'PctHousLess3BR']

cc_top10st = cc[cc.state.isin(selected_states)]

cc_viol_cor = cc_top10st[['state'] + correlated_attributes]
cc_nonviol_cor = cc_top10st[['state'] + correlated_attributes]

# Long format: one row per (source row, attribute).
cc_viol_cor_by_st_tidy = pd.melt(cc_viol_cor, id_vars=['state'],
                                 value_vars=correlated_attributes,
                                 var_name='violCrimeCor', value_name='violcrimeRate')

cc_nonviol_cor_by_st_tidy = pd.melt(cc_nonviol_cor, id_vars=['state'],
                                    value_vars=correlated_attributes,
                                    var_name='nonviolCrimeCor', value_name='nonviolcrimeRate')

# A state can contribute several rows per attribute, so y is aggregated to the
# state mean: each state is then drawn as a single line with one value per
# attribute instead of a line passing through every row's value.
top_5_state_non_violent_high_co_related = alt.Chart(cc_nonviol_cor_by_st_tidy).mark_line().encode(
    x='nonviolCrimeCor:N',
    y='mean(nonviolcrimeRate):Q',
    color='state:N',
    detail='state:N'
).properties(
    width=850,
    height=600,
    title="Parallel co-ordinate for 5 states for non violent crime 10 high co-related attributes"
)

top_5_state_non_violent_high_co_related

In [None]:

# Parallel-coordinates style chart for violent crime. The tidy frame can hold
# several rows per state and attribute, so y is aggregated to the state mean and
# each state is drawn as a single line with one value per attribute.
top_5_state_violent_high_co_related = alt.Chart(cc_viol_cor_by_st_tidy).mark_line().encode(
    x='violCrimeCor:N',
    y='mean(violcrimeRate):Q',
    color='state:N',
    detail='state:N'
).properties(
    width=850,
    height=600,
    title="Parallel co-ordinate for 5 states for violent crime 10 high co-related attributes"
)

top_5_state_violent_high_co_related