In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/master.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# Column dtypes and non-null counts: the baseline before any missing-value handling.
data.info()

In [None]:
# HDI was reported only every 5 years at first; the last 5 years have yearly values.
# Keep one non-null (year, HDI) row per year to list the years that have data.
(
    data.loc[:, ['year', 'HDI for year']]
    .dropna()
    .drop_duplicates(subset='year')
    .sort_values(by='year')
)

In [None]:
# Correlate the numeric columns only. DataFrame.corr() raises on string columns
# (country, sex, ...) in pandas >= 2.0, where numeric_only defaults to False;
# select_dtypes works on every pandas version.
sns.heatmap(data.select_dtypes(include='number').corr())

Let's fill the missing values in the HDI column country by country: first backward (from later rows to earlier ones), then forward.

In [None]:
# Fill HDI gaps within each country: backward fill first (later values flow to
# earlier rows), then forward fill whatever is still missing at the end of a group.
# Series.bfill()/ffill() replace fillna(method=...), which is deprecated since pandas 2.1.
# NOTE(review): fill direction follows row order, so this assumes rows are
# chronological within each country — confirm against the input file.
data['HDI for year'] = (
    data.groupby('country')['HDI for year']
    .transform(lambda hdi: hdi.bfill().ffill())
)

Now we have far fewer NA values in the HDI column.

In [None]:
# Re-check the non-null counts to see the effect of the fill on 'HDI for year'.
data.info()

We still have NAs because several countries don't have a single HDI record, so there is nothing to fill from within those countries.

In [None]:
# Countries whose HDI column contains no non-null value at all.
(
    data.groupby('country')['HDI for year']
    .count()
    .loc[lambda n_obs: n_obs == 0]
)

Let's see what the suicide rate looks like by country, year and generation, split by sex.

In [None]:
# Mean suicide rate per (country-year, generation), with one column per sex.
# Computed once and reused for both panels instead of repeating the groupby.
rate_by_sex = (
    data.groupby(['country-year', 'sex', 'generation'])['suicides/100k pop']
    .mean()
    .unstack('sex')
)

# One panel per sex: the 10 rows with the highest rate for that sex. Both sexes'
# bars are drawn in each panel, as nlargest only selects the rows.
fig, axes = plt.subplots(1, 2, figsize=(15, 7))
panels = [
    ('male', 'Top Suicide Rate for Males'),
    ('female', 'Top Suicide Rate for Females'),
]
for ax, (sex, title) in zip(axes, panels):
    rate_by_sex.nlargest(10, sex).plot.barh(ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Mean suicides/100k pop')
fig.tight_layout()

It looks like Hungary is one of the worst countries regarding suicides. Let's dive a little deeper into that. 

In [None]:
# Work on an independent copy of the Hungary rows so that later column
# assignments on `hung` never act on a slice of `data` (SettingWithCopyWarning).
hung = data.query('country == "Hungary"').copy()

In [None]:
# Pairwise plots of the Hungary subset for a first visual overview.
sns.pairplot(hung)

In [None]:
# Preview the Hungary subset.
hung.head()

In [None]:
# Make `age` an ordered categorical so plots show the age bands in a fixed order.
# Assign a new column rather than `hung.loc[:, 'age'] = ...`: since pandas 2.0 that
# form writes into the existing object column and silently keeps the object dtype.
# NOTE(review): the permutation is applied to the labels' first-appearance order,
# so it is tied to this dataset's row order — confirm the resulting category order.
age_levels = hung['age'].unique()[[0, 2, 1, 3, 4, 5]]
hung = hung.assign(
    age=pd.Categorical(hung['age'], categories=age_levels, ordered=True)
)

In [None]:
# Inspect dtypes and non-null counts of the Hungary subset (including `age`).
hung.info()

In [None]:
# Suicide rate per sex, one bar per age band (seaborn's default estimator: the mean).
sns.barplot(
    x='sex',
    y='suicides/100k pop',
    hue='age',
    data=hung,
)

Interestingly, the suicide rate per 100k is highest in the oldest age groups and decreases steadily toward the younger ones. This holds for both females and males. It may be partly because, in the younger groups, suicides are diluted in a larger population: as people get older, more of them die, so fewer are left in the oldest groups. Combined with a tendency to commit suicide, this is reflected in the rate. Let's see how the population behaves in raw numbers.

In [None]:
# Total population per sex and age band (rows summed instead of averaged).
sns.barplot(
    x='sex',
    y='population',
    hue='age',
    estimator=np.sum,
    data=hung,
)

In [None]:
# Total number of suicides per sex and age band (rows summed instead of averaged).
sns.barplot(
    x='sex',
    y='suicides_no',
    hue='age',
    estimator=np.sum,
    data=hung,
)

This is indeed the case: the oldest age group is one of the smallest. Next, let's check whether the pattern holds through the years.

In [None]:
# Faceted variant of the suicides_no bar plot, one panel per year (3 per row).
# It is left disabled; only the summary message below is emitted.
# NOTE(review): presumably disabled because of render size/time — confirm before re-enabling.
# sns.catplot(kind = 'bar', 
#             data = hung, 
#             x = 'sex', 
#             y = 'suicides_no', 
#             hue = 'age', 
#             col = 'year', col_wrap = 3)
print('The above plot summarizes the data pretty well. The pattern is present throughout the years')

Let's summarize GDP/capita by year and plot it against suicides/100k.

In [None]:
# GDP per capita should not vary within a year. Check every year rather than only
# the first five: the largest within-year standard deviation is 0 when that holds,
# in which case the first value of every year can be used.
hung.groupby('year')['gdp_per_capita ($)'].std().max()

In [None]:
# One GDP-per-capita value per year (first non-null entry), kept as a one-column frame.
gdp_capita = hung.groupby('year')['gdp_per_capita ($)'].first().to_frame()

In [None]:
# Add the yearly suicide totals. The GDP column is selected explicitly so that
# re-running this cell does not fail on an already-joined `suicides_no` column.
yearly_suicides = hung.groupby('year')['suicides_no'].sum()
gdp_capita = gdp_capita[['gdp_per_capita ($)']].join(yearly_suicides)

In [None]:
# GDP per capita vs. total suicides, one point per year, each labelled with its year.
scatter_ax = gdp_capita.plot.scatter(x='gdp_per_capita ($)', y='suicides_no')
for year, gdp, suicides in zip(
    gdp_capita.index,
    gdp_capita['gdp_per_capita ($)'],
    gdp_capita['suicides_no'],
):
    scatter_ax.annotate(year, (gdp, suicides))

In [None]:
# Correlation matrix (pandas default method) of the yearly summary frame.
gdp_capita.corr()