# INTRODUCTION
The goal of this notebook is to practice seaborn as a visualization tool. It is the third step in the DATAI TEAM data science learning path.

<font color="black">

* [Bar Plot](#1)
* [Horizontal Bar Plot](#2)
* [Point Plot](#3)
* [Joint Plot](#4)
* [Pie Plot](#5)
* [Lm Plot](#6)
* [Kde Plot](#7)
* [Violin Plot](#8)
* [Heatmap](#9)
* [Box Plot](#10)
* [Swarm Plot](#11)
* [Pair Plot](#12)
* [Count Plot](#13)

</font>



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the data.
# Every CSV of the dataset is read with the windows-1252 encoding.
DATA_DIR = '../input/fatal-police-shootings-in-the-us'

# The median household income frame used to load the poverty-level CSV a second
# time (copy-paste slip); it now reads its own file.
# NOTE(review): confirm the file name against the dataset's file listing.
median_house_hold_in_come = pd.read_csv(DATA_DIR + '/MedianHouseholdIncome2015.csv', encoding="windows-1252")
percentage_people_below_poverty_level = pd.read_csv(DATA_DIR + '/PercentagePeopleBelowPovertyLevel.csv', encoding="windows-1252")
percent_over_25_completed_highSchool = pd.read_csv(DATA_DIR + '/PercentOver25CompletedHighSchool.csv', encoding="windows-1252")
share_race_city = pd.read_csv(DATA_DIR + '/ShareRaceByCity.csv', encoding="windows-1252")
kill = pd.read_csv(DATA_DIR + '/PoliceKillingsUS.csv', encoding="windows-1252")

<a id="1"></a>
### Bar Plot

In [None]:
# Poverty rate of each state.

# "-" entries in poverty_rate are replaced with 0.0 and the column is converted
# from object to float. The result is assigned back to the DataFrame: calling
# replace(..., inplace=True) on the attribute-accessed column works on an
# intermediate Series and is not guaranteed to update the frame itself.
# NOTE(review): counting "-" as 0.0 pulls the per-state averages down.
percentage_people_below_poverty_level["poverty_rate"] = (
    percentage_people_below_poverty_level["poverty_rate"]
    .replace(["-"], 0.0)
    .astype(float)
)

# Mean poverty rate per geographic area. sort=False keeps the areas in order of
# first appearance, so the 0..n-1 index matches the order of the unique areas.
data = (
    percentage_people_below_poverty_level
    .groupby("Geographic Area", sort=False)["poverty_rate"]
    .mean()
    .rename("area_poverty_ratio")
    .rename_axis("area_list")
    .reset_index()
)
area_list = list(data["area_list"])
area_poverty_ratio = list(data["area_poverty_ratio"])

# Highest poverty rate first; the original index labels are kept on purpose
# because sorted_data is reused by later cells.
sorted_data = data.sort_values("area_poverty_ratio", ascending=False)

# visualization
plt.figure(figsize=(15, 10))
sns.barplot(x=sorted_data["area_list"], y=sorted_data["area_poverty_ratio"])
plt.xticks(rotation=45)
plt.xlabel("States")
plt.ylabel("Poverty Rate")
plt.title("Poverty Rate Given States")
plt.show()

In [None]:
# Most common 15 names or surnames of killed people.
# "TK TK" entries are skipped (presumably a placeholder for an unknown name).
# Each name is split on whitespace; the first and the second token of every name
# are counted. Taking them with .str[0] / .str[1] instead of zip(*...) keeps the
# cell working when a name is missing or consists of a single token.
separate = kill.name[kill.name != "TK TK"].dropna().str.split()
a = tuple(separate.str[0].dropna())
b = tuple(separate.str[1].dropna())
name_list = a + b
name_count = Counter(name_list)
most_common_names = name_count.most_common(15)
x = [name for name, _ in most_common_names]
y = [count for _, count in most_common_names]

# visualization
plt.figure(figsize=(15, 10))
sns.barplot(x=x, y=y, palette=sns.cubehelix_palette(len(x)))
plt.xlabel("Name or Surname of killed people")
plt.ylabel("Frequency")
plt.title("Most common 15 Name or Surname of killed people")
plt.show()

In [None]:
# High school graduation rate of the population that is older than 25, per state.

# "-" entries in percent_completed_hs are replaced with 0.0 and the column is
# converted to float. The result is assigned back to the DataFrame: calling
# replace(..., inplace=True) on the attribute-accessed column works on an
# intermediate Series and is not guaranteed to update the frame itself.
# NOTE(review): counting "-" as 0.0 pulls the per-state averages down.
percent_over_25_completed_highSchool["percent_completed_hs"] = (
    percent_over_25_completed_highSchool["percent_completed_hs"]
    .replace(["-"], 0.0)
    .astype(float)
)

# Mean graduation rate per geographic area. sort=False keeps the areas in order
# of first appearance, so the 0..n-1 index matches the order of the unique areas.
data = (
    percent_over_25_completed_highSchool
    .groupby("Geographic Area", sort=False)["percent_completed_hs"]
    .mean()
    .rename("area_highschool_ratio")
    .rename_axis("area_list")
    .reset_index()
)
area_list = list(data["area_list"])
area_highschool = list(data["area_highschool_ratio"])

# sorting: lowest graduation rate first; the original index labels are kept
# because sorted_data2 is reused by later cells.
sorted_data2 = data.sort_values("area_highschool_ratio", ascending=True)

# visualization
plt.figure(figsize=(15, 10))
sns.barplot(x=sorted_data2["area_list"], y=sorted_data2["area_highschool_ratio"], palette=sns.cubehelix_palette(len(area_list)))
plt.xticks(rotation=45)
plt.xlabel("States")
plt.ylabel("High School Graduate Rate")
plt.title("Percentage of given state's population above 25 that has graduated from high school")
plt.show()

<a id="2"></a>
### Horizontal Bar Plot 

In [None]:
# Percentage of each state's population by race: white, black, native american,
# asian and hispanic.
race_columns = ["share_white", "share_black", "share_native_american", "share_asian", "share_hispanic"]

# "-" and "(X)" entries are replaced with 0.0 across the frame.
share_race_city = share_race_city.replace(["-", "(X)"], 0.0)
# Assign the converted columns by name: `.loc[:, cols] = ...` writes into the
# existing object-dtype columns and can leave them as object instead of float.
share_race_city[race_columns] = share_race_city[race_columns].astype(float)

# Mean share per state for all five columns at once; sort=False keeps the states
# in order of first appearance.
race_means = share_race_city.groupby("Geographic area", sort=False)[race_columns].mean()
area_list = list(race_means.index)
share_white, share_black, share_native_american, share_asian, share_hispanic = (
    list(race_means[column]) for column in race_columns
)

# visualization: the five series are drawn as overlapping semi-transparent bars
f, ax = plt.subplots(figsize=(9, 15))
bar_specs = [
    (share_white, "green", "White"),
    (share_black, "blue", "African American"),
    (share_native_american, "cyan", "Native American"),
    (share_asian, "yellow", "Asian"),
    (share_hispanic, "red", "Hispanic"),
]
for shares, bar_color, bar_label in bar_specs:
    sns.barplot(x=shares, y=area_list, color=bar_color, alpha=0.5, label=bar_label)

ax.legend(loc="lower right", frameon=True)  # frameon=True draws a frame (box) around the legend
ax.set(xlabel="Percentage of Races", ylabel="States", title="Percentage of State's Population According to Races")
plt.show()


<a id="3"></a>
### Point Plot

In [None]:
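# Normalization divides every element of a list by the list's maximum, so that
# (positive) values are scaled into the range (0, 1] and can share one plot axis.
# e.g. [1,2,3,4,5] / 5                 ->  0 < values <= 1
# e.g. [1000,900,800,700,600] / 1000   ->  0 < values <= 1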

In [None]:
# High school graduation rate vs. poverty rate of each state.

# Scale each ratio by its own maximum so both series lie between 0 and 1.
# New frames are built with assign() so sorted_data / sorted_data2 from the
# earlier cells are left untouched.
poverty_scaled = sorted_data.assign(
    area_poverty_ratio=sorted_data["area_poverty_ratio"] / sorted_data["area_poverty_ratio"].max()
)
highschool_scaled = sorted_data2.assign(
    area_highschool_ratio=sorted_data2["area_highschool_ratio"] / sorted_data2["area_highschool_ratio"].max()
)

# Pair the two ratios by state name. Concatenating on the positional index would
# only be correct if both source files listed the states in the same order.
# validate="one_to_one" raises if a state appears twice on either side.
data = poverty_scaled.merge(
    highschool_scaled[["area_list", "area_highschool_ratio"]],
    on="area_list",
    how="inner",
    validate="one_to_one",
).sort_values("area_poverty_ratio")

# visualization
f, ax1 = plt.subplots(figsize=(20, 10))
# data=data lets x and y be given as column names
sns.pointplot(x="area_list", y="area_poverty_ratio", data=data, color="lime", alpha=0.8, ax=ax1)
sns.pointplot(x="area_list", y="area_highschool_ratio", data=data, color="red", alpha=0.8, ax=ax1)
# hand-placed legend text, positioned in data coordinates
ax1.text(40, 0.6, "high school graduate ratio", color="red", fontsize=17, style="italic")
ax1.text(40, 0.55, "poverty ratio", color="lime", fontsize=18, style="italic")
ax1.set_xlabel("States", fontsize=15, color="blue")
ax1.set_ylabel("Values", fontsize=15, color="blue")
ax1.set_title("High School Graduate VS Poverty Rate", fontsize=20, color="blue")
ax1.grid()
plt.show()

<a id="4"></a>
### Joint Plot

In [None]:
# High school graduation rate vs. poverty rate of each state, drawn as a joint
# plot: the joint distribution in the middle (kernel density estimation, kde)
# and the two marginal distributions on the sides.
# Pearson r is 1 for a perfect positive linear correlation, -1 for a perfect
# negative one, and 0 when there is no linear correlation between the variables.
# x and y are passed as keywords: recent seaborn releases no longer accept them
# positionally.
g = sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, kind="kde", height=7)
plt.savefig("graph.png")  # writes the current figure to the working directory
plt.show()

In [None]:
# The parameters of the joint plot can be changed.
# kind: { "scatter" | "reg" | "resid" | "kde" | "hex" }
# Same two columns as the previous plot, this time with the default kind and
# with the columns given by name together with data=data.
# x and y are passed as keywords: recent seaborn releases no longer accept them
# positionally.
g = sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, height=6, ratio=3, color="r")
plt.show()

<a id="5"></a>
### Pie Plot

In [None]:
# Share of each race in the kill data.
# value_counts() leaves missing values out by default, so no separate dropna()
# is needed (an inplace dropna on the attribute-accessed column would not update
# the DataFrame anyway).
race_counts = kill.race.value_counts()
labels = race_counts.index
sizes = race_counts.values
colors = ["grey", "blue", "red", "yellow", "green", "brown"]
# one offset per slice, sized from the data so the lengths always match
explode = [0] * len(sizes)

# visualization
# autopct sets the number format of the percentage printed on each slice
# explode moves selected slices outward (with six slices, try
# explode = [0.1,0,0,0.2,0,0.1] to see the difference)
plt.figure(figsize=(7, 7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%")
plt.title("Killed People According to Races", color="blue", fontsize=15)
plt.show()

<a id="6"></a>
### Lm Plot

In [None]:
# Poverty rate (x) against high school graduation rate (y) per state, drawn as a
# scatter plot with a fitted linear regression line.
regression_grid = sns.lmplot(
    data=data,
    x="area_poverty_ratio",
    y="area_highschool_ratio",
)
plt.show()

<a id="7"></a>
### Kde Plot

In [None]:
# High school graduation rate vs. poverty rate of each state as a 2D kernel
# density estimate.
# fill=True shades the area between the contour lines (formerly spelled
# shade=True); set fill=False to see only the contour lines.
# cut controls how far the density is evaluated beyond the extreme data points
# (as a multiple of the smoothing bandwidth), so a smaller cut gives a tighter view.
# x and y are passed as keywords: recent seaborn releases no longer accept the
# second variable positionally.
sns.kdeplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, fill=True, cut=2)
plt.show()

<a id="8"></a>
### Violin Plot

In [None]:
# Distribution of the high school graduation ratio and the poverty ratio.
# One violin is drawn per numeric column of `data` (the categorical column is
# not plotted); a violin is wider where values occur more often.
# inner="points" shows every observation as a dot inside its violin.
# The palette is a custom sequential cubehelix palette:
#   n_colors=2 -> number of colors
#   rot        -> rotations around the hue wheel over the range of the palette
#   dark       -> intensity of the darkest color in the palette
pal = sns.cubehelix_palette(n_colors=2, rot=0.5, dark=0.3)
sns.violinplot(
    data=data,
    inner="points",
    palette=pal,
)
plt.show()

<a id="9"></a>
### Heatmap

In [None]:
# Correlation map of high school graduation rate vs. poverty rate of each state.
# annot=True: write the correlation value into each cell
# linewidths / linecolor: width and color of the lines separating the cells
# fmt: number format of the annotations (".1f" = one digit after the dot)
# ax=ax: the axes the heatmap is drawn on
f, ax = plt.subplots(figsize=(5, 5))
# Only numeric columns can be correlated; the state name column is excluded
# explicitly because corr() on a frame with a string column raises in recent
# pandas versions.
correlations = data.select_dtypes(include="number").corr()
sns.heatmap(correlations, annot=True, linecolor="red", linewidths=.5, fmt=".1f", ax=ax)
plt.show()

<a id="10"></a>
### Box Plot

In [None]:
# Age distribution of killed people as box plots.
# x axis -> one group per value of kill.gender
# hue    -> within each gender, one colored box per manner_of_death category
sns.boxplot(
    data=kill,
    x="gender",
    y="age",
    hue="manner_of_death",
    palette="PRGn",
)
plt.show()

<a id="11"></a>
### Swarm Plot

In [None]:
# Age of killed people per gender, one point per person, colored by manner of death.
# A swarm plot becomes hard to use on huge data (more than about 10000 rows).
sns.swarmplot(
    data=kill,
    x="gender",
    y="age",
    hue="manner_of_death",
)
plt.show()

<a id="12"></a>
### Pair Plot

In [None]:
# High school graduation rate vs. poverty rate of each state.
# The pair plot draws a scatter plot for each pair of columns and a histogram
# for each single column.
sns.pairplot(data=data)
plt.show()

<a id="13"></a>
### Count Plot


In [None]:
# Gender of killed people.
# A count plot shows how often each value of a column occurs.
# The column is passed as x=...: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="gender", data=kill)
plt.title("gender", color="blue", fontsize=15)
plt.show()

In [None]:
# Age group of killed people: younger than 25 vs. 25 and older.
# Rows without an age are left out; a missing age would otherwise fail the
# `< 25` comparison and be counted in the older group.
known_ages = kill.age.dropna()
# The labels used to be swapped (ages below 25 were labelled "above 25").
above25 = ["below 25" if age < 25 else "25 and above" for age in known_ages]
df = pd.DataFrame({"age": above25})
# x=... is passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x="age", data=df)
plt.ylabel("Number of Killed People")
plt.title("Age of killed people", color="blue", fontsize=15)
plt.show()

In [None]:
# Race of killed people; bars are ordered from the most to the least frequent race.
race_order = kill.race.value_counts().index
sns.countplot(data=kill, x="race", order=race_order)
plt.title("Race of killed people", color="blue", fontsize=15)
plt.show()