In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("whitegrid")

import os

# Collect every file below the Kaggle input directory. The paths are sorted
# because os.walk yields directory entries in arbitrary order.
files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

# Tell the two tables apart by a column that only the HDI table is expected to
# carry, instead of trusting the order in which the files were listed.
frames = [pd.read_csv(file) for file in files]
hdi_frames = [frame for frame in frames if 'HDI Rank (2018)' in frame.columns]
suicide_frames = [frame for frame in frames if 'HDI Rank (2018)' not in frame.columns]
if len(hdi_frames) != 1 or len(suicide_frames) != 1:
    raise ValueError(
        "expected exactly one HDI table and one suicide table under /kaggle/input, "
        f"found {len(hdi_frames)} and {len(suicide_frames)}"
    )
hdi, df = hdi_frames[0], suicide_frames[0]

# HDI was shared by the United Nations Development Programme.
# Getting rid of unnecessary columns.
# NOTE(review): only the first 187 rows are kept; presumably the remaining rows
# are not per-country records — confirm against the file.
hdi = hdi[:187].drop(columns=['HDI Rank (2018)'])

In [None]:
# Number of missing values per column.
# (Indexing df with the boolean isna() frame keeps only the cells that are
# already NaN, so counting the result always reports 0.)
df.isna().sum()

### Suicides by age groups ###

In [None]:
# Total number of suicides for every (age group, sex) pair, largest first.
by_age = (
    df.groupby(by=['age', 'sex'])['suicides_no']
      .sum()
      .reset_index(name='count')
      .sort_values(by='count', ascending=False)
)

plt.figure(figsize=(15,10))
sns.barplot(data=by_age, x='age', y='count', hue='sex')

# Axis captions and tick styling, one setting per line.
plt.xlabel('Age', size=14)
plt.ylabel('Count', size=14)
plt.xticks(size=15)
plt.yticks(size=15, rotation=90)
plt.show()

### What is the age distribution in countries with the highest suicide rates? ###

In [None]:
# Mean rate for every (country, age group) pair over all other rows of the pair.
subset = df[['country', 'age', 'suicides/100k pop']]
grouped = (
    subset.groupby(['country', 'age'])['suicides/100k pop']
          .mean()
          .reset_index(name='rates')
)

# Rank the countries by the average of their age-group rates. The 'rates'
# column is selected explicitly: averaging the whole frame would also try to
# average the string-valued 'age' column, which raises a TypeError on
# pandas >= 2.0.
top = (
    grouped.groupby('country')['rates']
           .mean()
           .sort_values(ascending=False)
)
# Names of the ten highest-ranked countries, in ranking order.
top = top.head(10).index

grouped = grouped[grouped['country'].isin(top)]

plt.figure(figsize=(20,10))
sns.barplot(x='country', y='rates', order=top, hue='age', data=grouped)
plt.xticks(size=14, rotation=90)
plt.xlabel('Country', size=14)
plt.ylabel('Suicides/100k pop(mean over the years)', size=14)
plt.show()

### How has the suicide rate changed for different age groups over the years? ###

In [None]:
# Mean rate for every (year, age group) pair.
grouped = (
    df.groupby(['year', 'age'])['suicides/100k pop']
      .mean()
      .reset_index()
)

plt.figure(figsize=(15,10))
sns.lineplot(data=grouped, x='year', y='suicides/100k pop', hue='age')
# One tick for each year present in the data, written vertically.
plt.xticks(grouped['year'].unique(), rotation=90)
plt.show()

### How have suicide rates changed over the years in the 10 countries with the highest suicide rates? ###

In [None]:
# Countries ordered by their mean rate, highest first; the first ten are kept
# in `top_10` as a frame with a 'country' column.
top = df.groupby('country')['suicides/100k pop'].mean().sort_values(ascending=False)
top_10 = top.iloc[:10].reset_index()

top_countries = df[df['country'].isin(top_10['country'].tolist())]
grouped = (
    top_countries.groupby(['country', 'year', 'age'])['suicides/100k pop']
                 .mean()
                 .reset_index()
)

# 5 x 2 grid of panels sharing the x axis, flattened row by row into a list.
fig, axs = plt.subplots(5, 2, figsize=(15, 30), sharex=True, dpi=100)
axs = list(axs.flat)
fig.autofmt_xdate(rotation=90)

# One panel per country: yearly mean rate of every age group.
for i, country in enumerate(grouped.country.unique()):
    filtered = grouped[grouped['country'] == country].drop(columns=['country'])
    panel = axs[i]
    sns.scatterplot(data=filtered, x='year', y='suicides/100k pop', hue='age', ax=panel)
    panel.set_title(country)

### How has the suicide rate changed in contrast to HDI? ###

In [None]:
def get_hdi(input, hdi_table=None):
    """Look up the HDI value of one (country, year) row.

    Parameters
    ----------
    input : row-like object
        Must expose ``country`` and ``year`` attributes (for example a row
        produced by ``DataFrame.apply(..., axis=1)``).
    hdi_table : pandas.DataFrame, optional
        Table with a ``'Country'`` column and one column per year whose name is
        the year as a string. Defaults to the notebook-level ``hdi`` frame.

    Returns
    -------
    numpy.float64
        The HDI value, or NaN when the year is before 1990, the year cannot be
        read as an integer, the table has no column for the year, the country
        is not in the table, or the stored cell is not numeric.
    """
    table = hdi if hdi_table is None else hdi_table

    try:
        year = int(input.year)
    except (TypeError, ValueError):
        return np.nan

    # Years before 1990 are reported as missing.
    # NOTE(review): presumably the HDI table has no earlier columns — confirm.
    if year < 1990:
        return np.nan

    column = str(year)
    if column not in table.columns:
        return np.nan

    matches = table.loc[table['Country'] == input.country, column]
    if matches.empty:
        return np.nan

    # errors='coerce' turns placeholder text in the cell into NaN instead of raising.
    return np.float64(pd.to_numeric(matches.iloc[0], errors='coerce'))

In [None]:
# Rows of the ten highest-rate countries, built as an independent frame with a
# fresh 0..n-1 index, so adding the HDI column below does not write to a
# filtered slice of `df` and no in-place mutation is needed.
in_top_10 = df['country'].isin(top_10['country'].tolist())
subset = (
    df.loc[in_top_10, ['country', 'year', 'suicides/100k pop']]
      .reset_index(drop=True)
)

# Row-wise lookup of the HDI for each (country, year) pair.
subset['HDI'] = subset[['country', 'year']].apply(get_hdi, axis=1)

# Yearly mean rate and mean HDI per country.
grouped = (
    subset.groupby(['country', 'year'])[['suicides/100k pop', 'HDI']]
          .mean()
          .reset_index()
)
grouped.head()

In [None]:
# 5 x 2 grid with independent x axes, flattened row by row into a list.
fig, axs = plt.subplots(5, 2, figsize=(20, 30), sharex=False, sharey=True, dpi=100)
axs = list(axs.flat)

for i, country in enumerate(subset.country.unique()):
    filtered = grouped[grouped['country'] == country]

    # Suicide rate on the left y axis, HDI on a twin right y axis.
    suicide_lines = axs[i].plot(filtered['year'], filtered['suicides/100k pop'], label='suicides')
    ax2 = axs[i].twinx()
    hdi_lines = ax2.plot(filtered['year'], filtered['HDI'], label='HDI', color='orange')

    # Rotate the year labels on every panel. Figure.autofmt_xdate is not used
    # because it hides the x tick labels of all rows but the last, while the
    # panels here do not share their x axis.
    axs[i].tick_params(axis='x', labelrotation=90)

    axs[i].set_ylabel('suicides/100k pop')
    ax2.set_ylabel('HDI')

    # A single legend for both lines, so two separately placed legends cannot
    # overlap; it is attached to the twin axes, which is drawn on top.
    handles = suicide_lines + hdi_lines
    ax2.legend(handles, [handle.get_label() for handle in handles])
    axs[i].set_title(country)

### How has the suicide rate changed in contrast to GDP per capita? ###

In [None]:
# Rows of the ten highest-rate countries.
subset = df[['country', 'year', 'suicides/100k pop', 'gdp_per_capita ($)']]
subset = subset[subset['country'].isin(top_10['country'].tolist())]

# Yearly mean rate and mean GDP per capita per country.
grouped = (
    subset.groupby(['country', 'year'])[['suicides/100k pop', 'gdp_per_capita ($)']]
          .mean()
          .reset_index()
)

# 5 x 2 grid with independent x axes, flattened row by row into a list.
fig, axs = plt.subplots(5, 2, figsize=(20, 30), sharey=True, dpi=100)
axs = list(axs.flat)

for i, country in enumerate(subset.country.unique()):
    filtered = grouped[grouped['country'] == country]

    # Suicide rate on the left y axis, GDP per capita on a twin right y axis.
    ax2 = axs[i].twinx()
    suicide_lines = axs[i].plot(filtered['year'], filtered['suicides/100k pop'], label='suicides')
    gdp_lines = ax2.plot(filtered['year'], filtered['gdp_per_capita ($)'],
                         label='GDP per capita', color='orange')

    # Rotate the year labels on every panel. Figure.autofmt_xdate is not used
    # because it hides the x tick labels of all rows but the last, while the
    # panels here do not share their x axis.
    axs[i].tick_params(axis='x', labelrotation=90)

    axs[i].set_ylabel('suicides/100k pop')
    ax2.set_ylabel('GDP per capita ($)')

    # A single legend for both lines, so two separately placed legends cannot
    # overlap; it is attached to the twin axes, which is drawn on top.
    handles = suicide_lines + gdp_lines
    ax2.legend(handles, [handle.get_label() for handle in handles])
    axs[i].set_title(country)

### The highest suicide rate spikes over the years ###

In [None]:
def join(input):
    """Concatenate the string form of every element of `input`, with no separator."""
    return ''.join(map(str, input))

In [None]:
# Mean rate per (country, year); groupby returns the rows ordered by country
# and then by year.
subset = (
    df[['country', 'year', 'suicides/100k pop']]
      .groupby(['country', 'year'])
      .mean()
      .reset_index()
)

# Change relative to the previous row of the same country. Taking the
# difference per country keeps values of different countries from being
# subtracted from each other.
# NOTE(review): the previous row is the previous *recorded* year, which is not
# necessarily the previous calendar year if a country has gaps — confirm.
subset['diff'] = subset.groupby('country')['suicides/100k pop'].diff()

# Removing the first occurrence of each country (it has no previous row):
# duplicated() is False exactly for the first row of every country.
subset = subset[subset['country'].duplicated()]

subset = subset.sort_values(by='diff', ascending=False).reset_index(drop=True)

# Label every row as "<country><year>" with no separator, e.g. "Albania1988".
names = subset['country'] + subset['year'].astype(str)
subset = subset.drop(columns=['country', 'year'])
subset.insert(0, 'country-year', names)

In [None]:
# Ignoring the first two spikes,
# assuming the increase from 0 to 118 in 2005
# and from 0 to 121 in 2007 in Montenegro is because of missing data.
# The next twenty rows are plotted.
top_peaks = subset[2:22]

# 'country-year' ends with the four-digit year; display it as "Country (YYYY)".
labels = [f'{val[:-4]} ({val[-4:]})' for val in top_peaks['country-year']]

fig, ax = plt.subplots(figsize=(15,10))
sns.barplot(x='country-year', y='diff', data=top_peaks, ax=ax)

# The rotation is set together with the labels, after the bars (and therefore
# the categorical ticks) exist, instead of through the date-formatting helper
# before anything was drawn.
ax.set_xticklabels(labels, rotation=90, fontdict={'horizontalalignment': 'center'})
ax.set_xlabel('Country (year)')
ax.set_ylabel('Change in suicides/100k pop from the previous recorded year')
plt.show()


### Top countries in female suicides ###

In [None]:
# Mean rate for every (country, sex, age group) combination.
grouped = (
    df.groupby(['country', 'sex', 'age'])['suicides/100k pop']
      .mean()
      .reset_index(name='suicides/100k pop')
)
female = grouped[grouped['sex'] == 'female']

# Countries ordered by the average of their female age-group rates, highest
# first; the first twenty are kept.
top_female = (
    female.groupby(['country', 'sex'])['suicides/100k pop']
          .mean()
          .sort_values(ascending=False)
          .reset_index()
)
top_female = top_female.head(20)
top_female.head(3)

In [None]:
# Country names from the first twenty rows of the ranking, in ranking order.
countries = top_female['country'].head(20).tolist()
comp = female[female['country'].isin(countries)]

# Female rate per age group, with the countries laid out in ranking order.
plt.figure(figsize=(20,10))
sns.barplot(data=comp, x='country', y='suicides/100k pop', hue='age', order=countries)
plt.xticks(rotation=90)
plt.show()

### Suicide rates amongst generations ###

In [None]:
# Mean rate for every (generation, sex) pair.
data = (
    df.groupby(['generation', 'sex'])['suicides/100k pop']
      .mean()
      .reset_index(name='rates')
)
# Leave the 'G.I. Generation' rows out of the comparison.
data = data[data['generation'] != 'G.I. Generation']

plt.figure(figsize=(15, 10))
sns.barplot(data=data, x='generation', y='rates', hue='sex', hue_order=['male', 'female'])