# Demographic analysis of Suicide Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw suicide records from the CSV file. Later cells read the State,
# Year, Gender, Age_group, Type_code, Type and Total columns of this frame.
data = pd.read_csv("../input/suicides-in-india/Suicides in India 2001-2012.csv")


In [None]:
# Total suicides per state over all years.
# NOTE(review): rows of every Type_code are summed together; if each Type_code
# is a separate breakdown of the same cases, every case is counted once per
# Type_code -- confirm against the dataset description.
aggregate_rows = ["Total (All India)", "Total (States)", "Total (Uts)"]
merged_uts = ["Daman & Diu", "D & N Haveli"]

data_states_grouped = data.groupby("State", as_index=False)["Total"].sum()
# Drop the pre-aggregated "Total (...)" rows so they are not plotted as states.
data_states_grouped = data_states_grouped[
    ~data_states_grouped["State"].isin(aggregate_rows)
].sort_values(by="Total", ascending=False)

# Fold the two union territories into a single "D & N Haveli & Daman" row.
added_value = data_states_grouped.loc[
    data_states_grouped["State"].isin(merged_uts), "Total"
].sum()
added_data = pd.DataFrame(
    [["D & N Haveli & Daman", added_value]], columns=["State", "Total"]
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and
# yields the same rows, order and index as the old append(ignore_index=True).
data_states_grouped = pd.concat([data_states_grouped, added_data], ignore_index=True)
data_states_grouped = data_states_grouped[
    ~data_states_grouped["State"].isin(merged_uts)
]

# Draw on an explicit Axes so the title and labels are attached to this chart.
fig, ax = plt.subplots(figsize=(100, 30))
sns.barplot(data=data_states_grouped, x="State", y="Total", ax=ax)
ax.set_title("Suicide count statewise from 2001 - 2012")
ax.set_xlabel("States")
ax.set_ylabel("Total")
# pyplot.show() does not take a figure argument; it shows all open figures.
plt.show()

This chart shows the absolute suicide count per state. The counts are highest in the states with larger populations, which is the expected trend. To compare states without the effect of population size, we normalise the count using population density. This will help us focus on the actual suicide rate of a state.

$\mathbf{Pd}$ (Population Density) = $\mathbf{P}$ (Population) / $\mathbf{A}$ (Area)

Since $\mathbf{S}$ (Number of Suicides) $\propto$ $\mathbf{P}$

Hence $\mathbf{S}$ $\propto$ $\mathbf{Pd}$

Therefore $\mathbf{S}$ / $\mathbf{Pd}$ = $\mathbf{Sn}$ (Normalised Suicide Rate)

We will source the population density from the following URL:

https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population

In [None]:
# Download the Wikipedia page and parse its HTML tables (needs network access).
# header=0 uses the first row of each table as the column names. The result is
# indexed by table position in the next cell, so if the page layout changes the
# table index / column names used there may need adjusting.
states_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population",header = 0)

Let's prepare this data and merge it with our existing table.

In [None]:
# Keep only the name and density columns of the scraped table and drop its
# last row (presumably the all-India total -- TODO confirm against the page).
# The explicit copy avoids pandas' SettingWithCopyWarning when the density
# column is rewritten below.
pop_density = (
    states_data[1]
    .loc[:, ["State or union territory", "Density[a]"]]
    .iloc[0:-1, :]
    .copy()
)

# Each density cell is text: keep what precedes the first "/" and strip the
# thousands separators before converting to float.
pop_density["Density[a]"] = (
    pop_density["Density[a]"]
    .str.split("/")
    .str[0]
    .str.replace(",", "", regex=False)
    .astype(float)
)

# NOTE: the original cell summed the densities of Ladakh + Jammu and Kashmir
# and of Telangana + Andhra Pradesh and passed them to DataFrame.append, but
# the returned frames were discarded, so the table never changed (and
# DataFrame.append no longer exists in pandas >= 2.0). The sums are
# deliberately not written back: densities are not additive -- a merged
# density is (P1 + P2) / (A1 + A2), which needs population and area figures.
# Jammu and Kashmir and Andhra Pradesh therefore keep their listed densities,
# exactly as the original cell effectively left them.

# Ladakh and Telangana presumably have no row of their own in the 2001-2012
# suicide data, so they are removed from the density table.
pop_density = pop_density[
    ~pop_density["State or union territory"].isin(["Ladakh", "Telangana"])
]

In [None]:
# Wikipedia spelling of a state / union territory -> spelling used in the
# "State" column of the suicide dataset. Names not listed here are left as is.
states_map = dict(
    [
        ("Andaman and Nicobar Islands", "A & N Islands"),
        ("Dadra and Nagar Haveli and Daman and Diu", "D & N Haveli & Daman"),
        ("Manipur[d]", "Manipur"),
        ("Jammu and Kashmir", "Jammu & Kashmir"),
        ("NCT of Delhi", "Delhi (Ut)"),
    ]
)

In [None]:
name_col = "State or union territory"

# Rename the Wikipedia spellings to the ones used in the suicide data; names
# that are not in states_map are kept unchanged.
pop_density = pop_density.assign(
    **{name_col: pop_density[name_col].map(lambda name: states_map.get(name, name))}
)
pop_density = pop_density.sort_values(by=name_col, ascending=True)
data_states_grouped = data_states_grouped.sort_values(by="State", ascending=True)

# Look the density up by state NAME. Assigning one frame's column to another
# aligns on the index labels, not on row position, so the original direct
# assignment ignored both sorts and paired densities with unrelated states.
density_by_state = (
    pop_density.drop_duplicates(subset=name_col).set_index(name_col)["Density[a]"]
)
data_states_grouped["Density"] = data_states_grouped["State"].map(density_by_state)

# States whose name has no match get NaN density; report them instead of
# letting them silently vanish from the normalised chart.
unmatched = data_states_grouped.loc[
    data_states_grouped["Density"].isna(), "State"
].tolist()
if unmatched:
    print("No population density found for:", unmatched)

data_states_grouped["Normal"] = (
    data_states_grouped["Total"] / data_states_grouped["Density"]
)

In [None]:
# Re-order the rows by absolute suicide count, largest first, before plotting.
data_states_grouped = data_states_grouped.sort_values("Total", ascending=False)

In [None]:
# Bar chart of the density-normalised count per state, drawn on an explicit
# Axes so the title and labels are attached to this chart.
fig, ax = plt.subplots(figsize=(100, 30))
sns.barplot(data=data_states_grouped, x="State", y="Normal", ax=ax)
ax.set_title("Normalised Suicide count statewise from 2001 - 2012")
ax.set_xlabel("States")
ax.set_ylabel("Normalised Total")
# pyplot.show() does not take a figure argument; it shows all open figures.
plt.show()

We can see that after normalising by population density we get a better insight into the rate of suicides in a state. There is some discrepancy in this data because the population density figures and the suicide counts come from different time frames, but it gives us the general intuition that the higher the population density, the lower the normalised suicide rate. Hence the suicide rate is somewhat constant and does not depend on the region but on other factors, which we will explore now.

# Gender Analysis of the suicide cases 

In [None]:
# Total suicides per year, split by gender.
# NOTE(review): all Type_code rows are summed together here -- confirm this
# does not count the same cases once per Type_code.
totals_by_year_gender = data.groupby(["Year", "Gender"], as_index=False)["Total"].sum()

fig, ax = plt.subplots(figsize=(100, 30))
sns.barplot(data=totals_by_year_gender, x="Year", y="Total", hue="Gender", ax=ax)
ax.set_title("Year wise gender analysis of Suicides from 2001 - 2012")
ax.set_xlabel("Year")
ax.set_ylabel("Total")
# pyplot.show() does not take a figure argument; it shows all open figures.
plt.show()

The above chart tells us that, on average, the number of suicides has gradually increased over the years. Female suicides have remained roughly constant over the years, decreasing slightly. However, male suicides have gradually increased.

# Age wise analysis of suicide cases

In [None]:
# Total suicides per year, split by age group.
# NOTE(review): all Type_code rows are summed together here -- confirm this
# does not count the same cases once per Type_code.
totals_by_year_age = data.groupby(["Year", "Age_group"], as_index=False)["Total"].sum()

fig, ax = plt.subplots(figsize=(100, 30))
sns.barplot(data=totals_by_year_age, x="Year", y="Total", hue="Age_group", ax=ax)
ax.set_title("Year wise Age group analysis of Suicides from 2001 - 2012")
ax.set_xlabel("Year")
ax.set_ylabel("Total")
# pyplot.show() does not take a figure argument; it shows all open figures.
plt.show()

The above chart shows the age groups of the recorded cases; suicides are most common among people aged 15 to 29 and 30 to 40.

# Suicide type analysis 

The general data has been categorised into the following types: 'Causes', 'Education_Status', 'Means_adopted', 'Professional_Profile' and 'Social_Status'. Each of these types has several associated subtypes, which will let us analyse the trends in these suicide cases in more depth.

Let us begin with the causes of suicides over time among men, women and different age groups

In [None]:
# List the distinct category codes present in the Type_code column.
pd.unique(data["Type_code"])

In [None]:
# Keep only the records whose Type_code is "Causes".
is_cause = data["Type_code"].eq("Causes")
data_causes = data.loc[is_cause]

In [None]:
# Yearly totals per cause, one facet per gender.
causes_by_gender = data_causes.groupby(
    ["Year", "Type", "Gender"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=causes_by_gender, x="Year", y="Total", hue="Type", col="Gender")
# Set the title on the figure and the labels through the grid: plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Cause analysis of Suicides considering gender from 2001 - 2012", y=1.03
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

In [None]:
# Yearly totals per cause, one facet per age group.
causes_by_age = data_causes.groupby(
    ["Year", "Type", "Age_group"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=causes_by_age, x="Year", y="Total", hue="Type", col="Age_group")
# Set the title on the figure and the labels through the grid: plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Cause analysis of Suicides considering age group from 2001 - 2012", y=1.03
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

The above charts highlight the most common causes of suicide as grouped by gender and age

In [None]:
# Keep only the records whose Type_code is "Education_Status".
is_education = data["Type_code"].eq("Education_Status")
data_ed = data.loc[is_education]

In [None]:
# Yearly totals per education status, one facet per gender.
education_by_gender = data_ed.groupby(
    ["Year", "Type", "Gender"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=education_by_gender, x="Year", y="Total", hue="Type", col="Gender")
# Set the title on the figure and the labels through the grid: plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Education analysis of Suicides considering gender from 2001 - 2012", y=1.03
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

This chart captures a really important trend: it shows that the less educated a person is, the more prone to suicide they are.

In [None]:
# Keep only the records whose Type_code is "Means_adopted".
is_means = data["Type_code"].eq("Means_adopted")
data_means = data.loc[is_means]

In [None]:
# Yearly totals per means adopted, one facet per gender.
means_by_gender = data_means.groupby(
    ["Year", "Type", "Gender"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=means_by_gender, x="Year", y="Total", hue="Type", col="Gender")
# Set the title on the figure and the labels through the grid: plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Means analysis of Suicides considering Gender from 2001 - 2012", y=1.03
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

In [None]:
# Yearly totals per means adopted, one facet per age group.
means_by_age = data_means.groupby(
    ["Year", "Type", "Age_group"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=means_by_age, x="Year", y="Total", hue="Type", col="Age_group")
# Set the title on the figure and the labels through the grid: plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Means analysis of Suicides considering Age group from 2001 - 2012", y=1.03
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

The above graphs clearly show that hanging is by far the most common means of suicide, followed by consuming insecticides and drowning.

In [None]:
# Keep only the records whose Type_code is "Professional_Profile".
is_profession = data["Type_code"].eq("Professional_Profile")
data_prod = data.loc[is_profession]

In [None]:
# Yearly totals per professional profile, one facet per gender.
profession_by_gender = data_prod.groupby(
    ["Year", "Type", "Gender"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=profession_by_gender, x="Year", y="Total", hue="Type", col="Gender")
# The title used to read "Means analysis" (copied from the means-adopted
# chart); it now names the professional-profile data actually plotted. It is
# set on the figure, and the labels through the grid, because plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Professional profile analysis of Suicides considering Gender from 2001 - 2012",
    y=1.03,
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

In [None]:
# Yearly totals per professional profile, one facet per age group.
profession_by_age = data_prod.groupby(
    ["Year", "Type", "Age_group"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=profession_by_age, x="Year", y="Total", hue="Type", col="Age_group")
# The title used to read "Means analysis" (copied from the means-adopted
# chart); it now names the professional-profile data actually plotted. It is
# set on the figure, and the labels through the grid, because plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Professional profile analysis of Suicides considering Age group from 2001 - 2012",
    y=1.03,
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

The above charts clearly show that housewives are the most common group for suicide among women, and farmers among men. People with a decent livelihood are less prone to suicide.

In [None]:
# Keep only the records whose Type_code is "Social_Status".
is_social = data["Type_code"].eq("Social_Status")
data_social = data.loc[is_social]

In [None]:
# Yearly totals per social status, one facet per gender.
social_by_gender = data_social.groupby(
    ["Year", "Type", "Gender"], as_index=False
)["Total"].sum()

# catplot is figure-level and creates its own figure, so the plt.figure() call
# that used to precede it (and only produced an extra empty figure) is gone.
grid = sns.catplot(data=social_by_gender, x="Year", y="Total", hue="Type", col="Gender")
# The title used to read "Means analysis" (copied from the means-adopted
# chart); it now names the social-status data actually plotted. It is set on
# the figure, and the labels through the grid, because plt.title /
# plt.xlabel / plt.ylabel would only touch the current facet.
grid.figure.suptitle(
    "Year wise Social status analysis of Suicides considering Gender from 2001 - 2012",
    y=1.03,
)
grid.set_axis_labels("Year", "Total")
grid.set_xticklabels(rotation=45)
# pyplot.show() does not take a figure/grid argument.
plt.show()

This chart clearly shows that married people account for the most suicides, followed by people who never married.