# INTRODUCTION

This was created by following a tutorial. 

Using the given data, we will explore the plotting tools that seaborn provides:
* [Bar Plot](#1)
* [Point Plot](#2)
* [Joint Plot](#3)
* [Pie Chart](#4)
* [Lm Plot](#5)
* [Kde Plot](#6)
* [Violin Plot](#7)
* [Heatmap](#8)
* [Box Plot](#9)
* [Swarm Plot](#10)
* [Pair Plot](#11)
* [Count Plot](#12)
    


We will try to find the answers to:

1. How do we read the given data
1. Poverty rate of each state
1. Most common first names/surnames among the people who were shot
1. High school graduation rate
1. Percentage of each state's population by race
1. High school graduation rate vs. Poverty rate of each state
1. Police shooting properties:
    *  Manner of death
    *  Murder weapon
    *  Age of victim
    *  Race of victim
    *  Most dangerous cities
    *  Most dangerous states
    *  Victim's mental state
    *  Types of threat
    *  Types of fleeing
    *  Police had bodycam or not
1. Race rates by state in shooting data
1. Number of shootings per state

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
%matplotlib inline

from subprocess import check_output

# Print the names of the files found in the dataset's input directory.
dataset_listing = check_output(["ls", "../input/fatal-police-shootings-in-the-us"])
print(dataset_listing.decode("utf8"))

In [None]:

# Load the five CSV files of the dataset; each one is read with the
# windows-1252 encoding.
median_household_income = pd.read_csv(
    '../input/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv',
    encoding="windows-1252",
)
percentage_ppl_below_poverty = pd.read_csv(
    '../input/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv',
    encoding="windows-1252",
)
percent_over_25_completed_highSchool = pd.read_csv(
    '../input/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv',
    encoding="windows-1252",
)
police_killings = pd.read_csv(
    '../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv',
    encoding="windows-1252",
)
share_race_city = pd.read_csv(
    '../input/fatal-police-shootings-in-the-us/ShareRaceByCity.csv',
    encoding="windows-1252",
)

In [None]:
# Preview the first rows of the poverty table.
percentage_ppl_below_poverty.head()

In [None]:
# Column names, dtypes and non-null counts of the poverty table.
percentage_ppl_below_poverty.info()

In [None]:
# How often each raw poverty_rate value occurs (useful to spot placeholders).
percentage_ppl_below_poverty["poverty_rate"].value_counts()

In [None]:
# We do not know what the "-" entries (201 of them) stand for, so they are
# replaced with zero: the purpose of this notebook is visualization, not
# data cleaning.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df.poverty_rate`: an in-place call on a
# column pulled out of the frame is not guaranteed to modify the frame itself.
percentage_ppl_below_poverty["poverty_rate"] = (
    percentage_ppl_below_poverty["poverty_rate"].replace(["-"], 0)
)
percentage_ppl_below_poverty["poverty_rate"].value_counts()

In [None]:
# Convert the poverty_rate column from strings to floats.
percentage_ppl_below_poverty["poverty_rate"] = (
    percentage_ppl_below_poverty["poverty_rate"].astype(float)
)

In [None]:
# Distinct values of the "Geographic Area" column.
percentage_ppl_below_poverty["Geographic Area"].unique()

In [None]:
# Distinct geographic areas, in order of first appearance, as a plain list.
area_list = percentage_ppl_below_poverty["Geographic Area"].unique().tolist()

In [None]:
# Will hold one average poverty rate per entry of area_list.
area_poverty_ratio = []

In [None]:
# Average poverty rate of every geographic area, in the order of area_list.
# The list is rebuilt from scratch here (rather than appended to), so running
# this cell more than once cannot leave duplicate entries behind.
area_poverty_ratio = [
    sum(area_rows.poverty_rate) / len(area_rows)
    for area_rows in (
        percentage_ppl_below_poverty[percentage_ppl_below_poverty["Geographic Area"] == area]
        for area in area_list
    )
]

In [None]:
# One row per area with its average poverty rate ...
data = pd.DataFrame(
    {
        "area_list": area_list,
        "area_poverty_ratio": area_poverty_ratio,
    }
)
# ... reordered from the highest rate to the lowest.
descending_rates = data["area_poverty_ratio"].sort_values(ascending=False)
new_index = descending_rates.index.values
sorted_data = data.reindex(new_index)

In [None]:
# Bar plot of the average poverty rate per state, highest first.

plt.figure(figsize=(15, 10))
ax = sns.barplot(
    x=sorted_data["area_list"],
    y=sorted_data["area_poverty_ratio"],
)
plt.xticks(rotation=45)
ax.set_xlabel("States")
ax.set_ylabel("Poverty Rate")
ax.set_title("Poverty Rate by State")
plt.show()

In [None]:
# Preview the first rows of the shootings table.
police_killings.head()

In [None]:
# How often each full name occurs in the shootings table.
police_killings["name"].value_counts()

In [None]:
# Split every name into whitespace-separated tokens. Rows whose name is
# missing or equal to "TK TK" are left out ("TK TK" is presumably a
# placeholder for an unknown name -- confirm against the dataset notes).
known_names = police_killings["name"].dropna()
seperate = known_names[known_names != "TK TK"].str.split()

# Collect the first and the second token of each name. Each name is handled
# on its own, so a name with a single token (or an empty one) no longer makes
# the cell fail; names that have at least two tokens contribute exactly the
# same two tokens as before.
a = tuple(tokens[0] for tokens in seperate if tokens)
b = tuple(tokens[1] for tokens in seperate if len(tokens) > 1)
name_list = a + b

# The 15 most frequent tokens together with their counts.
name_count = Counter(name_list)
most_common_names = name_count.most_common(15)

In [None]:
# The most common name tokens (with their counts) that we want to visualize

most_common_names

In [None]:
# Unzip the (token, count) pairs into two parallel lists.
x, y = (list(column) for column in zip(*most_common_names))

# One bar per token, each with its own color from an HLS palette.
plt.figure(figsize=(15, 10))
bar_colors = sns.hls_palette(len(x))
ax = sns.barplot(x=x, y=y, palette=bar_colors)
plt.xlabel('Name or Surname of killed people')
plt.ylabel('Frequency')
plt.title('Most common 15 Name or Surname of killed people')

In [None]:
# Column names, dtypes and non-null counts of the high school table.
percent_over_25_completed_highSchool.info()

In [None]:
# How often each raw percent_completed_hs value occurs.
percent_over_25_completed_highSchool["percent_completed_hs"].value_counts()

In [None]:
# High school graduates of the population older than 25, by state

# Replace the "-" entries with 0.0 and convert the column to float. The result
# is assigned back to the column: calling `replace(..., inplace=True)` on
# `df.percent_completed_hs` is not guaranteed to modify the frame itself.
percent_over_25_completed_highSchool["percent_completed_hs"] = (
    percent_over_25_completed_highSchool["percent_completed_hs"]
    .replace(['-'], 0.0)
    .astype(float)
)

# Average graduation rate over all rows of each geographic area.
area_list = list(percent_over_25_completed_highSchool["Geographic Area"].unique())
area_highschool = []
for i in area_list:
    x = percent_over_25_completed_highSchool[percent_over_25_completed_highSchool["Geographic Area"]==i]
    area_highschool_rate = sum(x.percent_completed_hs)/len(x)
    area_highschool.append(area_highschool_rate)

# sorting our data (highest graduation rate first):

data = pd.DataFrame({"area_list": area_list, "area_highschool_ratio":area_highschool})
new_index = (data["area_highschool_ratio"].sort_values(ascending=False)).index.values
sorted_data2 = data.reindex(new_index)

# plotting our data:

plt.figure(figsize=(15,10))
sns.barplot(x=sorted_data2["area_list"],y=sorted_data2["area_highschool_ratio"])
plt.xticks(rotation = 45)
plt.xlabel("State")
plt.ylabel("High school graduation rate")
plt.title("High school graduates of the population older than 25, by state")

In [None]:
# Percentage of each state's population by race: preview of the raw table

share_race_city.head()

In [None]:
# Column names, dtypes and non-null counts of the race-share table.
share_race_city.info()

In [None]:
# Frequency of each distinct row. The method has to be called: without the
# parentheses the cell only displays the bound method object.
share_race_city.value_counts()

In [None]:
# Element-wise mask of the race-share table: True where a cell is null.
share_race_city.isnull()

In [None]:
# Look at the raw share_hispanic column.
share_race_city["share_hispanic"]

In [None]:
# Races of the USA, by state

# The "-" and "(X)" entries are replaced with 0.0 so the share columns can be
# converted to float.
share_race_city.replace(['-'],0.0,inplace = True)
share_race_city.replace(['(X)'],0.0,inplace = True)

# Assign the converted columns with df[cols] = ... rather than
# df.loc[:, cols] = ...: the .loc form writes the values into the existing
# columns, so they can keep their original object dtype.
race_columns = ['share_white','share_black','share_native_american','share_asian','share_hispanic']
share_race_city[race_columns] = share_race_city[race_columns].astype(float)

# Average share of every race over all rows of each geographic area.
area_list = list(share_race_city['Geographic area'].unique())
share_white = []
share_black = []
share_native_american = []
share_asian = []
share_hispanic = []
for i in area_list:
    x = share_race_city[share_race_city['Geographic area']==i]
    share_white.append(sum(x.share_white)/len(x))
    share_black.append(sum(x.share_black) / len(x))
    share_native_american.append(sum(x.share_native_american) / len(x))
    share_asian.append(sum(x.share_asian) / len(x))
    share_hispanic.append(sum(x.share_hispanic) / len(x))

In [None]:
# Visualization: average share of each race by state.
# The five horizontal bar plots are drawn on the same axes, one after another.

f, ax = plt.subplots(figsize=(9, 15))
race_layers = [
    (share_white, 'green', 0.5, 'White'),
    (share_black, 'blue', 0.7, 'African American'),
    (share_native_american, 'cyan', 0.6, 'Native American'),
    (share_asian, 'yellow', 0.6, 'Asian'),
    (share_hispanic, 'red', 0.6, 'Hispanic'),
]
for shares, bar_color, opacity, race_label in race_layers:
    sns.barplot(x=shares, y=area_list, color=bar_color, alpha=opacity, label=race_label)

# Legend in the lower right corner, drawn with a frame.
ax.legend(loc='lower right', frameon=True)
ax.set(
    xlabel='Percentage of Races',
    ylabel='States',
    title="Percentage of State's Population According to Races ",
)

In [None]:
# First rows of the per-area poverty table (sorted by poverty rate).
sorted_data.head()

In [None]:
# First rows of the per-area high school graduation table.
sorted_data2.head()

In [None]:
# High school graduation rate vs. poverty rate of each state

# Divide each series by its maximum so that both peak at 1 and can share a
# single y axis.
sorted_data['area_poverty_ratio'] = sorted_data['area_poverty_ratio']/max( sorted_data['area_poverty_ratio'])
sorted_data2['area_highschool_ratio'] = sorted_data2['area_highschool_ratio']/max( sorted_data2['area_highschool_ratio'])

# Join the two tables on the state column. The tables come from different
# files, so pairing rows by their positional index (as an axis=1 concat does)
# is only correct if both files list the states in exactly the same order.
# how='outer' keeps states that appear in only one of the tables.
data = sorted_data.merge(
    sorted_data2[['area_list', 'area_highschool_ratio']],
    on='area_list',
    how='outer',
)
data.sort_values('area_poverty_ratio',inplace=True)

# visualize

f,ax1 = plt.subplots(figsize =(20,10))

sns.pointplot(x='area_list',y='area_poverty_ratio',data=data,color='black',alpha=0.8)
sns.pointplot(x='area_list',y='area_highschool_ratio',data=data,color='red',alpha=0.8)

# Hand-placed text labels that serve as the legend of the two curves.
plt.text(40,0.6,'high school graduation ratio',color='red',fontsize = 17,style = 'italic')
plt.text(40,0.55,'poverty ratio',color='black',fontsize = 18,style = 'italic')
plt.xlabel('States',fontsize = 15,color='blue')
plt.ylabel('Values',fontsize = 15,color='blue')
plt.title('High School Graduate  VS  Poverty Rate',fontsize = 20,color='blue')
plt.grid()

# note: passing data=data lets seaborn look up the columns by name

In [None]:
# Same two variables, different seaborn plot: their joint distribution drawn
# with kernel density estimation.
# Reading a Pearson correlation: close to 1 means positive correlation, close
# to -1 negative correlation, close to 0 no linear correlation.
# x and y are passed by keyword and the figure size via `height`: current
# seaborn rejects positional x/y and the old `size` argument.
g = sns.jointplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, kind="kde", height=7)

plt.show()

In [None]:
# The joint plot takes further parameters.
# kind : { "scatter" | "reg" | "resid" | "kde" | "hex" }
# Same two variables as in the KDE joint plot, this time referenced by column
# name through data=data; `kind` is left at its default.
# x/y are passed by keyword and the size via `height` (the old `size` argument
# is not accepted by current seaborn).
g = sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, height=5, ratio=3, color="r")

In [None]:
# The joint plot takes further parameters.
# kind : { "scatter" | "reg" | "resid" | "kde" | "hex" }
# Columns are referenced by name through data=data; `kind` is left at its
# default, so try one of the values listed above to compare the styles.
# x/y are passed by keyword and the size via `height` (the old `size` argument
# is not accepted by current seaborn).
g = sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, height=5, ratio=3, color="r")

In [None]:
# Display the shootings table.
police_killings

In [None]:
# Frequency of each distinct row of the shootings table.
police_killings.value_counts()

In [None]:
# Number of rows per value of the race column.
police_killings["race"].value_counts()

In [None]:
# Count the rows per race once. Missing races are dropped on a copy of the
# column: an in-place dropna on `police_killings.race` cannot remove rows from
# the DataFrame and may leave the column out of sync with it.
race_counts = police_killings["race"].dropna().value_counts()
labels = race_counts.index
sizes = race_counts.values
colors = ['grey','blue','red','yellow','green','brown']
# One explode offset per wedge; plt.pie requires the lengths to match, so the
# list is sized from the data instead of being hard-coded to six entries.
explode = [0] * len(labels)

# visual
plt.figure(figsize = (7,7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%')
plt.title('Killed People According to Races',color = 'blue',fontsize = 15)

In [None]:
# Once again: high school graduation rate vs. poverty rate,
# this time shown with the result of a linear regression.
sns.lmplot(
    data=data,
    x="area_poverty_ratio",
    y="area_highschool_ratio",
)
plt.show()

In [None]:
# First rows of the combined poverty / graduation table.
data.head()

In [None]:
# Two-dimensional kernel density estimate of poverty ratio vs. high school
# graduation ratio. Both variables are passed by keyword and the contours are
# filled with `fill=True`: current seaborn accepts neither a positional second
# variable nor the deprecated `shade` argument.
sns.kdeplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, fill=True, cut=3)
plt.show()

In [None]:
# Violin plot of `data`, passed whole (wide form) and colored with a
# two-color cubehelix palette; inner="points" draws the individual
# observations inside the violins.
pal = sns.cubehelix_palette(2, rot=-.5, dark=.3)
sns.violinplot(
    data=data,
    palette=pal,
    inner="points",
)
plt.show()

In [None]:
# Correlation matrix of the two ratio columns. The string column `area_list`
# is left out explicitly: recent pandas raises instead of silently skipping
# non-numeric columns.
data[["area_poverty_ratio", "area_highschool_ratio"]].corr()

In [None]:
# Annotated heatmap of the correlation between the two ratio columns. Only the
# numeric columns are correlated: recent pandas raises on the string column
# `area_list` instead of silently skipping it.
f,ax = plt.subplots(figsize=(5, 5))
ratio_corr = data[["area_poverty_ratio", "area_highschool_ratio"]].corr()
sns.heatmap(ratio_corr, annot=True, linewidths=0.5,linecolor="red", fmt= '.1f',ax=ax)
plt.show()

In [None]:
# Distinct values of the manner_of_death column.
police_killings["manner_of_death"].unique()

In [None]:
# Box plot of age per gender, split by manner of death.
sns.boxplot(
    data=police_killings,
    x="gender",
    y="age",
    hue="manner_of_death",
    palette="PRGn",
)
plt.show()

In [None]:
# Swarm plot of age per gender, colored by manner of death.
sns.swarmplot(
    data=police_killings,
    x="gender",
    y="age",
    hue="manner_of_death",
)
plt.show()

In [None]:
# Pairwise relationships between the columns of the combined table.
sns.pairplot(data=data)
plt.show()

In [None]:
# Number of rows per value of the gender column.
police_killings["gender"].value_counts()

In [None]:
# Number of shootings per gender. The column is passed as `x=`: current
# seaborn treats the first positional argument as `data`, not as the variable
# to count.
sns.countplot(x=police_killings.gender)
plt.title("gender",color = 'blue',fontsize=15)
plt.show()

In [None]:
# Bar plot of the seven most frequent values of the `armed` column.
armed = police_killings["armed"].value_counts()
top_armed = armed.head(7)
plt.figure(figsize=(10, 7))
sns.barplot(x=top_armed.index, y=top_armed.values)
plt.ylabel('Number of Weapon')
plt.xlabel('Weapon Types')
plt.title('Kill weapon', color='blue', fontsize=15)

In [None]:
# Split the victims into two groups: 25 and older vs. younger than 25.
# Rows with a missing age are dropped first: NaN >= 25 evaluates to False, so
# unknown ages would otherwise be counted as 'below25'.
known_ages = police_killings["age"].dropna()
above25 = ['above25' if age >= 25 else 'below25' for age in known_ages]
df = pd.DataFrame({'age':above25})
sns.countplot(x=df.age)
plt.ylabel('Number of Killed People')
plt.title('Age of killed people',color = 'blue',fontsize=15)

In [None]:
# Number of shootings per value of the race column.
sns.countplot(x='race', data=police_killings)
plt.title('Race of killed people', fontsize=15, color='blue')

In [None]:
# Bar plot of the twelve cities with the most shootings.
city = police_killings["city"].value_counts()
top_cities = city.head(12)
plt.figure(figsize=(10, 7))
sns.barplot(x=top_cities.index, y=top_cities.values)
plt.xticks(rotation=45)
plt.title('Most dangerous cities', color='blue', fontsize=15)

In [None]:
# Bar plot of the twenty states with the most shootings.
state = police_killings["state"].value_counts()
top_states = state.head(20)
plt.figure(figsize=(10, 7))
sns.barplot(x=top_states.index, y=top_states.values)
plt.title('Most dangerous state', color='blue', fontsize=15)