In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the five CSV files of the "fatal-police-shootings-in-the-us" dataset.
# The directory is named once so the notebook can be pointed at another copy
# of the data by editing a single line.
# NOTE(review): if read_csv raises UnicodeDecodeError on any of these files,
# an explicit encoding= argument is needed -- confirm against the actual data.
DATA_DIR = "../input/fatal-police-shootings-in-the-us"

income = pd.read_csv(DATA_DIR + "/MedianHouseholdIncome2015.csv")
highschool = pd.read_csv(DATA_DIR + "/PercentOver25CompletedHighSchool.csv")
poverty = pd.read_csv(DATA_DIR + "/PercentagePeopleBelowPovertyLevel.csv")
race = pd.read_csv(DATA_DIR + "/ShareRaceByCity.csv")
kill = pd.read_csv(DATA_DIR + "/PoliceKillingsUS.csv")

In [None]:
# Display the distinct values found in the "Geographic Area" column of `poverty`.
poverty["Geographic Area"].unique()

In [None]:
# Preview the first rows of the poverty table.
poverty.head()

In [None]:
# Average poverty rate per geographic area, plotted from highest to lowest.

# The raw column is text and uses "-" for missing values. The cleaned column is
# assigned back through the frame: calling replace(..., inplace=True) on
# `poverty.poverty_rate` acts on an intermediate Series and is not guaranteed
# to update `poverty` (it does not under pandas Copy-on-Write).
# NOTE(review): missing values are still counted as 0.0, as before, which pulls
# the per-area averages down -- consider NaN + mean() if that is not intended.
poverty["poverty_rate"] = poverty["poverty_rate"].replace(["-"], 0.0).astype(float)

# One row per area. sort=False keeps the areas in order of first appearance,
# so the integer index equals the position of the area in `area_list`.
data = (
    poverty.groupby("Geographic Area", sort=False)["poverty_rate"]
    .mean()
    .rename("area_poverty_ratio")
    .rename_axis("area_list")
    .reset_index()
)
area_list = list(data["area_list"])                   # unique areas
area_poverty_ratio = list(data["area_poverty_ratio"]) # mean poverty rate per area

# Highest poverty rate first; the original integer index labels are kept.
sorted_data = data.sort_values("area_poverty_ratio", ascending=False)

# Visualization
plt.figure(figsize=(15,10))
sns.barplot(x=sorted_data["area_list"], y=sorted_data["area_poverty_ratio"])
plt.xticks(rotation=45)
plt.xlabel("States", fontsize=20)
plt.ylabel("Poverty Rate", fontsize=20)
plt.title("Poverty Rate Given States", fontsize=20)
plt.show()

In [None]:
# Preview the first rows of the police-killings table.
kill.head()

In [None]:
# 15 most frequent name tokens (first or second word of a name) in `kill`.

# Drop the "TK TK" entries and missing names, then split every name on whitespace.
# NOTE(review): "TK TK" is presumably a placeholder for an unknown name -- confirm.
separate = kill.name[kill.name != "TK TK"].dropna().str.split()

# Take the first and the second token of each name explicitly. The previous
# `a, b = zip(*separate)` only worked when the shortest name had exactly two
# tokens and raised on a missing or single-word name.
# NOTE(review): for names with more than two words the second token is a
# middle name, not the surname -- same as before.
a = tuple(separate.str[0].dropna())                     # first tokens
b = tuple(separate.str[1].dropna())                     # second tokens
name_list = a + b                                       # all tokens, firsts then seconds
name_count = Counter(name_list)                         # token -> number of occurrences
most_common_names = name_count.most_common(15)          # [(token, count), ...]
x, y = zip(*most_common_names)                          # tokens / counts
x, y = list(x), list(y)                                 # names on X axis, frequency on Y axis

# Visualisation

plt.figure(figsize=(15,10))
sns.barplot(x=x, y=y)
plt.ylabel("Frequency", fontsize=20)
plt.xlabel("Name or Surname of killed people", fontsize=20)
plt.title('15 Most common Name/Surname of killed people', fontsize=30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Quick look at the high-school table: first rows, schema, counts of
# identical rows, and number of missing values per column.
print(highschool.head())
print("")
print("")
# info() writes its report itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
highschool.info()
print("")
print("")
print(highschool.value_counts())
print("")
print("")
print(highschool.isnull().sum())


In [None]:
# Average high-school completion rate per geographic area, lowest first.

# "-" marks a missing value in the raw text column. Assign the cleaned column
# back through the frame: replace(..., inplace=True) on
# `highschool.percent_completed_hs` acts on an intermediate Series and is not
# guaranteed to update `highschool` (it does not under pandas Copy-on-Write).
# NOTE(review): missing values are still counted as 0.0, as before, which
# lowers the per-area averages.
highschool["percent_completed_hs"] = (
    highschool["percent_completed_hs"].replace(["-"], 0.0).astype(float)
)

# One row per area, in order of first appearance (sort=False), so the integer
# index equals the position of the area in `area_list`.
data = (
    highschool.groupby("Geographic Area", sort=False)["percent_completed_hs"]
    .mean()
    .rename("area_highschool_ratio")
    .rename_axis("area_list")
    .reset_index()
)
area_list = list(data["area_list"])            # unique areas
empty = list(data["area_highschool_ratio"])    # mean completion rate per area

# Sorting: lowest completion rate first; original index labels are kept.
sorted_data2 = data.sort_values("area_highschool_ratio", ascending=True)

# Visualization

plt.figure(figsize=(15,10))
ax = sns.barplot(x=sorted_data2.area_list, y=sorted_data2.area_highschool_ratio)
plt.ylabel("High School Graduate Rate", fontsize=20)
plt.xlabel("States", fontsize=20)
plt.title("Percentage of Given State's Population Above 25 that Has Graduated High School", fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Quick look at the race-share table: first rows, schema, how often each
# city name occurs, and number of missing values per column.
print(race.head())
print("")
print("")
print("")
# info() writes its report itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
race.info()
print("")
print("")
print("")
print(race.City.value_counts())
print("")
print("")
print("")
print(race.isnull().sum())


In [None]:
# Average share of each race per geographic area, drawn as overlaid bars.

share_columns = ['share_white','share_black','share_native_american','share_asian','share_hispanic']

# "-" and "(X)" mark missing values in the raw text columns; count them as 0.
race.replace(["-", "(X)"], 0.0, inplace=True)

# Assign whole columns so the dtype really becomes float. The previous
# `race.loc[:, cols] = ...` writes into the existing (object) columns and can
# leave their dtype unchanged.
race[share_columns] = race[share_columns].astype(float)

# Mean share per area; sort=False keeps the areas in order of first appearance.
area_means = race.groupby("Geographic area", sort=False)[share_columns].mean()
x = list(area_means.index)                              # unique areas

# per-area averages, one list per race (same order as `x`)
share_white = list(area_means["share_white"])
share_black = list(area_means["share_black"])
share_native_american = list(area_means["share_native_american"])
share_asian = list(area_means["share_asian"])
share_hispanic = list(area_means["share_hispanic"])

#Visualization

f, ax = plt.subplots(figsize=(12, 15))
# The bars are drawn on top of each other (semi-transparent), in this order.
layers = [
    (share_white, 'White', 'red'),
    (share_black, 'Black', 'blue'),
    (share_native_american, 'Native American', 'yellow'),
    (share_asian, 'Asian', 'purple'),
    (share_hispanic, 'Hispanic', 'green'),
]
for shares, label, color in layers:
    sns.barplot(x=shares, y=x, label=label, color=color, alpha=0.5)
ax.legend(loc='lower right', frameon=True)
plt.xlabel("Percentage of Races", fontsize=20)
plt.ylabel("States", fontsize=20)
plt.title("Percentage of State's Population According to Races", fontsize=20)
plt.show()

In [None]:
# Highschool rate vs poverty rate

# Scale each per-area average by its maximum so both series lie in [0, 1].
# This works on copies, so re-running the cell or the earlier plots always
# sees the unscaled `sorted_data` / `sorted_data2`.
poverty_scaled = sorted_data[["area_list", "area_poverty_ratio"]].copy()
poverty_scaled["area_poverty_ratio"] = (
    poverty_scaled["area_poverty_ratio"] / poverty_scaled["area_poverty_ratio"].max()
)
highschool_scaled = sorted_data2[["area_list", "area_highschool_ratio"]].copy()
highschool_scaled["area_highschool_ratio"] = (
    highschool_scaled["area_highschool_ratio"] / highschool_scaled["area_highschool_ratio"].max()
)

# Combine on the area name. The previous pd.concat(axis=1) paired rows by their
# integer index, which is only correct when both source files list the areas in
# exactly the same order.
data = pd.merge(poverty_scaled, highschool_scaled, on="area_list", how="outer")
# sort data
data = data.sort_values("area_poverty_ratio")

# Visualization
f,ax1 = plt.subplots(figsize =(20,10))
sns.pointplot(x= data.area_list, y = data.area_poverty_ratio, color="green", alpha=0.8)
# Hand-placed text standing in for a legend (x is the category position).
plt.text(40,0.6,'High School Graduate Ratio',color='purple',fontsize = 17,style = 'italic')
plt.text(40,0.55,'Poverty Ratio',color='green',fontsize = 18,style = 'italic')
sns.pointplot(x= data.area_list, y = data.area_highschool_ratio, color="purple", alpha=0.8)
plt.title('High School Graduate rate vs Poverty Rate',fontsize = 30,color='black')
plt.xlabel("Area", fontsize = 20)
plt.ylabel("Rate", fontsize =20)
plt.grid()
plt.show()


In [None]:
# High school graduation rate vs poverty rate per area, as three joint plots:
# hexbin, kernel density estimate, and the default scatter.
# Reading aid for a Pearson coefficient: 1 = positive correlation,
# -1 = negative correlation, 0 = no correlation between the variables.
# NOTE(review): whether the coefficient is drawn on the plot depends on the
# installed seaborn version -- confirm.

# x/y are passed by keyword and the figure size via `height`, matching the
# other seaborn calls in this notebook. No plt.figure() beforehand: jointplot
# builds its own figure, so a preceding plt.figure() only added an empty one.
ax = sns.jointplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, kind="hex", height=10, color="red")
ax = sns.jointplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, kind="kde", height=10, color="green")
ax = sns.jointplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, height=10)
plt.show()

In [None]:

#Pie chart: share of each race among the rows of `kill`

# value_counts() already ignores missing values, so no dropna is needed (the
# previous in-place dropna on `kill.race` did not reliably change `kill`).
race_counts = kill.race.value_counts()
labels = race_counts.index
sizes = race_counts.values
colors = ['purple','blue','red','yellow','green','brown']
# Pull out the third-largest slice. Built from the actual number of slices,
# because plt.pie requires len(explode) == len(sizes).
explode = [0.1 if position == 2 else 0 for position in range(len(sizes))]

# Visualization
plt.figure(figsize = (10,10))
plt.pie(sizes, labels= labels, colors=colors, autopct='%1.1f%%', explode=explode)
plt.title('Killed People According to Races',color = 'blue',fontsize = 25)
plt.show()


In [None]:
# Linear regression of high school graduation rate on poverty rate per area,
# drawn twice: once with regplot and once with lmplot.

regression_args = dict(x="area_poverty_ratio", y="area_highschool_ratio", data=data)
for draw_regression in (sns.regplot, sns.lmplot):
    draw_regression(**regression_args)
plt.show()



In [None]:
# High school graduation rate vs poverty rate per area as a joint plot
# with kernel density estimates (kind="kde").

sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, kind="kde")


In [None]:
# Filled bivariate kernel density of poverty ratio vs high school ratio.
# x/y are passed by keyword (positional data vectors are not accepted by
# current seaborn) and `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, fill=True, cut=3)
plt.show()

In [None]:
# Show each distribution with both violins and points.
# `data` is passed in wide form, so seaborn draws one violin per column.
# NOTE(review): `data` also holds the text column `area_list`; how that column
# is treated depends on seaborn's wide-form handling -- confirm.
sns.violinplot(data=data, inner="points")
plt.show()

In [None]:
#correlation map
# Correlation between the poverty ratio and the high school ratio per area.
# Only the two numeric columns are selected: `data` also contains the text
# column `area_list`, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0.
x = data[["area_poverty_ratio", "area_highschool_ratio"]].corr()
f,ax = plt.subplots(figsize=(10, 8))
# `linewidths` is the documented heatmap parameter for the cell borders.
sns.heatmap(x, annot=True, linewidths=0.5, linecolor="red", fmt= '.1f', ax=ax)
plt.show()

In [None]:
# Box plot: age by gender, split by manner of death.
sns.boxplot(data=kill, x="gender", y="age", hue="manner_of_death", palette="PRGn")

In [None]:
# Swarm plot: one point per row of `kill`, age by gender, coloured by race.
plt.figure(figsize=(10, 10))
sns.swarmplot(data=kill, x="gender", y="age", hue="race")

In [None]:
# Pair plot of the columns of `data` (per-area poverty and high school ratios);
# height=5 sets the size of each facet.
sns.pairplot(data, height = 5)
plt.show()

In [None]:
# Count plot: number of rows of `kill` per race.
# The column is passed as x= (as in the mental-illness count plot); a bare
# positional argument is interpreted as `data` by current seaborn.
ax = sns.countplot(x=kill.race)
plt.title("Race",color = 'black',fontsize=20)
plt.show()


In [None]:
# Count plot: number of rows of `kill` per gender.
# x= keyword and a closing plt.show(), consistent with the other count plots.
sns.countplot(x=kill.gender)
plt.title("Gender",color = 'black',fontsize=20)
plt.show()

In [None]:
# Count plot: number of rows of `kill` per manner of death.
# x= keyword and a closing plt.show(), consistent with the other count plots.
sns.countplot(x=kill.manner_of_death)
plt.title("Manner of Death",color = 'black',fontsize=20)
plt.show()

In [None]:
# Bar plot of the 7 most frequent values of the `armed` column.
armed = kill.armed.value_counts()     # sorted by count, most frequent first
top_armed = armed.head(7)

plt.figure(figsize=(7,5))
# Drawn once; the previous cell issued the identical barplot call twice.
sns.barplot(x=top_armed.index, y=top_armed.values)
plt.ylabel('Number of Weapon')
plt.xlabel('Weapon Types')
plt.title('Kill weapon',color = 'black',fontsize=20)
plt.show()


In [None]:
# Bar plot of the 15 city names that occur most often in `kill`.
cities = kill.city.value_counts()
top_cities = cities.head(15)

plt.figure(figsize=(16, 8))
sns.barplot(x=top_cities.index, y=top_cities.values)
plt.title("Cities With Highest Number Of Killing", fontsize=30)
plt.ylabel("Number of Killing", fontsize=20)
plt.xlabel("Cities", fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar plot of the 15 states that occur most often in `kill`.
states = kill.state.value_counts()
top_states = states.head(15)

plt.figure(figsize=(16, 8))
sns.barplot(x=top_states.index, y=top_states.values)
plt.title("States With Highest Number Of Killing", fontsize=30)
plt.ylabel("Number of Killing", fontsize=20)
plt.xlabel("States", fontsize=20)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count plot: number of rows of `kill` per value of signs_of_mental_illness.
mental_illness = kill.signs_of_mental_illness
plt.figure(figsize=(12,6))
sns.countplot(x = mental_illness)
plt.xlabel("Mental Illness Status" , fontsize = 20)
plt.ylabel("Number of Killing", fontsize = 20)
# Title typo fixed: "of Not" -> "or Not".
plt.title("Having Mental Illness or Not", fontsize= 30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count plot: number of rows of `kill` per threat level.
threat = kill.threat_level
plt.figure(figsize =(12,6))
# Passed as x= (as in the mental-illness count plot); a bare positional
# argument is interpreted as `data` by current seaborn.
sns.countplot(x=threat)
plt.xlabel("Attack Type" , fontsize = 20)
plt.ylabel("Number of Attack", fontsize = 20)
plt.title("Number of Attack by type", fontsize= 30)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count plot: number of rows of `kill` per value of the `flee` column.
flee = kill.flee
plt.figure(figsize=(12,6))
# Passed as x= (as in the mental-illness count plot); a bare positional
# argument is interpreted as `data` by current seaborn.
sns.countplot(x=flee)
plt.title("Flee Type", fontsize= 30)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count plot: number of rows of `kill` per value of the `body_camera` column.
body_cam = kill.body_camera
plt.figure(figsize=(12,6))
# Passed as x= (as in the mental-illness count plot); a bare positional
# argument is interpreted as `data` by current seaborn.
sns.countplot(x=body_cam)
plt.title('Having body cameras or not on Police',color = 'black',fontsize = 30)
plt.xticks(rotation=45)
plt.show()


