## Importing Libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide default canvas: 15 in wide, 5 in tall (later cells override it).
plt.rc("figure", figsize=(15, 5))

## Import the CSV file

In [None]:
# Load the raw records; the path is relative to the notebook's working directory.
DATA_PATH = "../input/database.csv"
df = pd.read_csv(DATA_PATH)

## Basic Exploration

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the raw frame, before "Record ID" is moved into the index.
df.shape

In [None]:
# Use "Record ID" as the row index.
# The guard plus reassignment (instead of inplace=True) keeps this cell re-runnable:
# an in-place call raises KeyError on a second execution because the column has
# already been moved into the index.
if "Record ID" in df.columns:
    df = df.set_index(["Record ID"])

In [None]:
# Preview again to confirm "Record ID" now labels the rows.
df.head()

In [None]:
# Number of missing (null/NaN) entries per column.
# Vectorised form: the built-in sum() inside apply() iterated every cell at Python
# speed; isnull().sum() yields the same per-column counts without that loop.
df.isnull().sum()

Our dataset seems really clean, without any missing values, which is wonderful!

In [None]:
# Summary statistics for the frame's numeric columns.
df.describe()

## Weapons Used

In [None]:
# Peek at the first few values of the "Weapon" column.
df["Weapon"].head()

Can the scale we choose for a plot affect our perception of the data? Let's compare a few different configurations.

In [None]:
# Configuration 1: wide, short canvas (12 x 4 in) with a linear y-axis.
plt.rcParams["figure.figsize"] = (12,4)
weapon_counts = df["Weapon"].value_counts()
ax = weapon_counts.plot.bar()
ax.set_title('Deaths Attributable to Various Weapons')

In [None]:
# Configuration 2: square canvas (10 x 10 in), same data, linear y-axis.
plt.rcParams["figure.figsize"] = (10,10)
weapon_counts = df["Weapon"].value_counts()
ax = weapon_counts.plot.bar()
ax.set_title('Deaths Attributable to Various Weapons')

In [None]:
# Configuration 3: wide canvas with a logarithmic y-axis, so rarely used weapons
# remain visible next to the dominant categories.
plt.rcParams["figure.figsize"] = (12,4)
# pandas' `logy=True` replaces `plt.yscale('log', nonposy='clip')`: the `nonposy`
# keyword was deprecated and then removed from Matplotlib, so the old call fails
# on current versions. It also ties the scale to the Axes actually being drawn
# instead of relying on the pyplot "current axes" set up before the plot existed.
ax = df["Weapon"].value_counts().plot(kind="bar", logy=True)
ax.set_title('Deaths Attributable to Various Weapons')

## Unsolved Crimes

Let's pay some attention to unsolved crimes. What are their characteristics?

In [None]:
# Size of the subset whose "Crime Solved" flag is anything other than "Yes".
df.loc[df["Crime Solved"].ne("Yes")].shape

In [None]:
# Keep every record that is not marked as solved; reused by all cells below.
not_solved_mask = df["Crime Solved"].ne("Yes")
unsolved = df.loc[not_solved_mask]

In [None]:
# Preview the first rows of the unsolved subset.
unsolved.head()

In [None]:
# Summary statistics for the numeric columns of the unsolved subset.
unsolved.describe()

In [None]:
# Yearly number of unsolved records, drawn as a line in chronological order.
plt.rcParams["figure.figsize"] = (12,4)
unsolved_per_year = unsolved['Year'].value_counts().sort_index()
ax = unsolved_per_year.plot.line()
ax.set_title('Number of Unsolved Homicides: 1980 to 2014')

In [None]:
# Full name -> two-letter postal abbreviation, listed in alphabetical order of the
# abbreviation. Insertion order matters: later cells read the values in this order.
_STATE_ABBREVIATION_PAIRS = (
    ('Alaska', 'AK'), ('Alabama', 'AL'), ('Arkansas', 'AR'), ('Arizona', 'AZ'),
    ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'),
    ('District of Columbia', 'DC'), ('Delaware', 'DE'), ('Florida', 'FL'),
    ('Georgia', 'GA'), ('Hawaii', 'HI'), ('Iowa', 'IA'), ('Idaho', 'ID'),
    ('Illinois', 'IL'), ('Indiana', 'IN'), ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'), ('Massachusetts', 'MA'), ('Maryland', 'MD'), ('Maine', 'ME'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Missouri', 'MO'), ('Mississippi', 'MS'),
    ('Montana', 'MT'), ('North Carolina', 'NC'), ('North Dakota', 'ND'),
    ('Nebraska', 'NE'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'), ('Nevada', 'NV'), ('New York', 'NY'), ('Ohio', 'OH'),
    ('Oklahoma', 'OK'), ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Puerto Rico', 'PR'),
    ('Rhode Island', 'RI'), ('South Carolina', 'SC'), ('South Dakota', 'SD'),
    ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'), ('Virginia', 'VA'),
    ('Vermont', 'VT'), ('Washington', 'WA'), ('Wisconsin', 'WI'),
    ('West Virginia', 'WV'), ('Wyoming', 'WY'),
)
dict_states = dict(_STATE_ABBREVIATION_PAIRS)

In [None]:
# Abbreviations in the dictionary's insertion order, then how many there are.
abb_st = list(dict_states.values())
len(abb_st)

In [None]:
# Unsolved handgun homicides per state, with abbreviated state names on the x-axis.
plt.rcParams["figure.figsize"] = (12,4)
handgun_unsolved = unsolved[unsolved["Weapon"]=="Handgun"]

# Pin the bar order explicitly and derive the tick labels from that same order.
# Previously the labels came from a separately ordered abbreviation list and were
# applied by position, so a bar could be labelled with another state's code (and
# the label count need not match the number of states actually present).
state_order = sorted(handgun_unsolved["State"].dropna().unique())
ax = sns.countplot(x="State", hue="Weapon", data=handgun_unsolved, order=state_order)
ax.set_xticks(range(len(state_order)))
# Fall back to the full name for any state spelling missing from dict_states.
ax.set_xticklabels([dict_states.get(state, state) for state in state_order])
plt.title("Unsolved Homicides Caused By Handguns")

In [None]:
# Unsolved records per weapon type, most frequent first.
unsolved['Weapon'].value_counts()

In [None]:
# Wide, short canvas (12 x 4 in) for the figures that follow.
plt.rc("figure", figsize=(12, 4))

In [None]:
# Bar chart of unsolved records per weapon type, most frequent first.
unsolved['Weapon'].value_counts().plot.bar()

In [None]:
# "Weapon" values of the unsolved subset, grouped by the victim's sex.
rel = unsolved.groupby('Victim Sex')['Weapon']

In [None]:
# Number of unsolved records in each "Victim Sex" group, as a bar chart.
rel.size().plot.bar()

A significant majority of victims in unsolved homicides are male.

## Month

In [None]:
# Unsolved records per month; bars are ordered by frequency, not by calendar order.
unsolved["Month"].value_counts().plot.bar()

## Agency Type
What kinds of agencies contribute to the unsolved homicide statistics?

In [None]:
# Unsolved records per agency type, most frequent first (linear y-axis).
unsolved["Agency Type"].value_counts().plot.bar()

## Removing Death by Negligence

Let's remove accidental deaths.

In [None]:
# List the distinct "Crime Type" labels present in the unsolved subset.
unsolved["Crime Type"].unique()

Where are potential serial killers hiding? In plain sight in large cities, or in small towns?

In [None]:
# Keep only unsolved records labelled "Murder or Manslaughter"; every other
# crime type is dropped from the candidate set.
is_murder_or_manslaughter = unsolved["Crime Type"].eq("Murder or Manslaughter")
pot_sk = unsolved.loc[is_murder_or_manslaughter]
pot_sk.head()

In [None]:
# (rows, columns) of the murder-or-manslaughter subset.
pot_sk.shape

In [None]:
# Ten "City" values with the most unsolved murder/manslaughter records.
top_cities = pot_sk["City"].value_counts().head(10)
ax = top_cities.plot.bar()
ax.set_title("Top 10 Cities: Unsolved Murders or Manslaughters")

Some of the smaller cities have just 1 unsolved homicide. Serial killers are defined as those having at least 3 [victims](https://books.google.com/books?id=0yfoJz6jHwkC&pg=PA1#v=onepage&q&f=false). Let's keep only the cities with more than 5 unsolved cases.

In [None]:
# Ten "City" values at the tail of the frequency ranking (fewest unsolved records).
bottom_cities = pot_sk["City"].value_counts().tail(10)
ax = bottom_cities.plot.bar()
ax.set_title("Bottom 10 Cities: Unsolved Murders or Manslaughters")

# Rows belonging to cities that have more than 5 records in pot_sk.
pot_sk.groupby("City").filter(lambda city_rows: len(city_rows) > 5)

In [None]:
# Keep only rows whose city has MORE THAN 5 unsolved murder/manslaughter records.
# The variable name `two_or_more` is historical and kept so later cells still work;
# the actual cut-off is the `> 5` below.
# A vectorised count lookup replaces groupby(...).filter(lambda ...), which called a
# Python function once per city; the selected rows and their order are the same.
# NOTE(review): cities are matched on the "City" value alone, so identically named
# places in different states are pooled together - confirm this is intended.
city_case_counts = pot_sk["City"].value_counts()
two_or_more = pot_sk[pot_sk["City"].map(city_case_counts) > 5]

# Among the retained cities, show the ten with the fewest cases.
ax = two_or_more["City"].value_counts().tail(10).plot(kind="bar")
ax.set_title("Cities With More Than 5 Unsolved Cases: 10 Lowest Counts")

## Exploring Relationship Between Victims and Perpetrators

In [None]:
# List the distinct "Relationship" labels across the whole dataset.
df["Relationship"].unique()

In [None]:
# Drop records whose relationship is recorded as "Unknown", then preview the rest.
has_known_relationship = df["Relationship"].ne("Unknown")
known = df.loc[has_known_relationship]
known.head()

In [None]:
# Records per relationship label (excluding "Unknown"), most frequent first.
known["Relationship"].value_counts()

In [None]:
# Relationship frequencies on a logarithmic y-axis so rare categories stay readable.
plt.rcParams["figure.figsize"] = (12,4)
# `logy=True` replaces `plt.yscale('log', nonposy='clip')`: the `nonposy` keyword was
# deprecated and then removed from Matplotlib, so the old call fails on current
# versions. The title typo ("Relationsip") is corrected as well.
ax = known["Relationship"].value_counts().plot(kind="bar", logy=True)
ax.set_title("Relationship of Victim to Perpetrator");

## Race
Is there a racial angle to these homicides?

In [None]:
# Two-row reminder of the available columns before looking at race.
df.head(2)

In [None]:
# List the distinct "Perpetrator Race" labels across the whole dataset.
df["Perpetrator Race"].unique()

In [None]:
# Column labels of the frame, for reference when building the pivot table below.
df.columns

In [None]:
# Total "Victim Count" for every (victim race, perpetrator race) pair, restricted to
# records with a known relationship.
# The aggregation is given as the string "sum" rather than np.sum: recent pandas
# emits a FutureWarning when a NumPy reducer is passed to aggfunc. The resulting
# column label ("sum") is unchanged.
# NOTE(review): this sums the "Victim Count" column rather than counting records -
# confirm the column holds the total number of victims per record.
pd.pivot_table(
    known,
    index=["Victim Race", "Perpetrator Race"],
    values=["Victim Count"],
    aggfunc=["sum"],
)

It looks like most people are killed by people from their own racial background.