In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import sklearn
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to all subsequent matplotlib plots.
sns.set()

In [None]:
# Load the WHO suicide statistics CSV from the Kaggle input directory and display it.
csv_path = '../input/who-suicide-statistics/who_suicide_statistics.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df

In [None]:
# Shorten the count column's name from 'suicides_no' to 'suicides'.
# The frame is rebound to the renamed copy instead of being mutated with
# inplace=True, so the cell is a plain assignment.
df = df.rename(columns={'suicides_no': 'suicides'})
df

### Observing the Data

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()

### Data Cleaning

In [None]:
# Preview the first rows of the frame before cleaning.
df.head()

In [None]:
# Keep only complete rows: any row with at least one missing value is discarded.
df = df.dropna(axis=0, how='any')

### Exploratory Data Analysis

In [None]:
# Number of distinct countries present in the 'country' column.
df['country'].nunique()

In [None]:
# Grand total of the 'suicides' column over all rows.
df['suicides'].sum()

#### Suicides per Year

In [None]:
# Total suicides per year (Series indexed by year).
# The built-in groupby sum replaces `.agg(np.sum)`: passing a NumPy callable to
# `agg` is deprecated in recent pandas releases and emits a FutureWarning.
TotalsbyYear = df.groupby('year')['suicides'].sum()

# Line chart of the yearly totals.
fig, ax = plt.subplots()
ax.plot(TotalsbyYear)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Suicides')
ax.set_title('Total Suicide Count 1979 - 2015')
plt.tight_layout()

#### Suicides per Year across Genders

In [None]:
# Yearly suicide totals split by sex: one row per year, one column per sex.
# The aggregation is named by the string 'sum' because passing np.sum as a
# callable is deprecated in recent pandas releases (FutureWarning).
year_x_gender = df.pivot_table(index='year', columns='sex', values='suicides', aggfunc='sum')
year_x_gender.head()

In [None]:
# One line per sex, drawn on an explicitly created axes.
fig, ax = plt.subplots()
year_x_gender.plot(ax=ax)
ax.set_title('Suicides by Gender from 1979 - 2015')
ax.set_xlabel('Year')
ax.set_ylabel('Suicides')
plt.show()

#### Suicides by Gender across Age Groups

In [None]:
# Suicide totals per age group (rows) and sex (columns).
# The aggregation is named by the string 'sum' because passing np.sum as a
# callable is deprecated in recent pandas releases (FutureWarning).
age_x_gender = df.pivot_table(index='age', columns='sex', values='suicides', aggfunc='sum')
age_x_gender

In [None]:
# Grouped bar chart: one female/male pair of bars per age group.
# NOTE(review): the labels come from the pivot index, which pandas sorts as
# strings — confirm the resulting left-to-right order of age groups is intended.
labels = age_x_gender.index
x = np.arange(len(labels))
width = 0.35  # width of each bar; each pair is centred on its tick

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, age_x_gender['female'], width, label='Female')
rects2 = ax.bar(x + width/2, age_x_gender['male'], width, label='Male')
ax.set_title("Total Suicides by Age Group and Gender")
ax.set_xlabel('Age Group')
ax.set_ylabel('Suicides')
ax.set_xticks(x)
# The rotation must be a number (or 'vertical'/'horizontal'); the string '40'
# is rejected with a ValueError by current Matplotlib releases.
ax.set_xticklabels(labels, rotation=40)
ax.legend()

fig.tight_layout()
plt.show()

#### Total Suicides by Year across Age Groups

In [None]:
# Yearly suicide totals split by age group: one row per year, one column per age group.
# The aggregation is named by the string 'sum' because passing np.sum as a
# callable is deprecated in recent pandas releases (FutureWarning).
year_x_age = df.pivot_table(index='year', columns='age', values='suicides', aggfunc='sum')
year_x_age.head()

In [None]:
# Descriptive statistics of the yearly totals, one column per age group.
year_x_age.describe()

In [None]:
# One line per age group; the legend is placed outside the plot area on the right.
fig, ax = plt.subplots()
year_x_age.plot(ax=ax)
ax.set_title('Total Suicides by Age Group')
ax.set_xlabel('Year')
ax.set_ylabel('Total Suicides')
ax.legend(bbox_to_anchor=(1.025, 1), loc='upper left', borderaxespad=0.)
plt.show()

#### Total Suicides: Top 10 Countries

In [None]:
# Total suicides per country, largest first.
# The 'suicides' column is selected before summing so that only that column is
# aggregated; summing the whole frame would also process the string columns.
CountryTotals = df.groupby('country')['suicides'].sum().sort_values(ascending=False)
High = CountryTotals.head(10)  # the ten countries with the highest totals

plt.bar(High.index, High)
plt.xticks(rotation='vertical')
plt.title("Top 10 Countries with Highest Suicides")
plt.xlabel('Countries')
plt.ylabel('Suicides')
plt.show()

#### Top 10 Countries: Suicide count over time

In [None]:
# Total suicides per country (rows) and year (columns).
# aggfunc='sum' is essential: pivot_table defaults to the mean, which would
# average over the rows of each country/year pair instead of counting suicides.
countryPivot = df.pivot_table(columns='year', index='country', values='suicides', aggfunc='sum')
countryPivot['TotalSuicides'] = countryPivot.sum(axis=1)

# Ten countries with the highest overall total; the helper column is dropped
# again so that only the per-year columns remain.
top10byYear = (
    countryPivot
    .sort_values(by='TotalSuicides', ascending=False)
    .head(10)
    .drop(columns='TotalSuicides')
)

# Year-by-country layout for plotting; years in which none of the ten
# countries has data are removed, and the years are kept in ascending order.
toGraph = top10byYear.T.dropna(how='all').sort_index()

In [None]:
# One line per country. Plotting through pandas labels each line with its
# column name, so the legend identifies the countries.
fig, ax = plt.subplots()
toGraph.plot(ax=ax)
ax.set_title("Top 10 Countries: Suicide Count from 1979 - 2015")
ax.set_xlabel("Year")
ax.set_ylabel("Suicides")
# Legend placed outside the axes so it does not cover the lines.
ax.legend(title='Country', bbox_to_anchor=(1.025, 1), loc='upper left', borderaxespad=0.)
plt.tight_layout()

#### Top 10 Countries: Weighted Average Suicide Rates

In [None]:
# Summed suicides and population for every (country, year) pair.
suicideRates = df.groupby(['country', 'year'])[['suicides', 'population']].sum()
# Suicides expressed as a percentage of the population.
suicideRates['suicidepercentage'] = (
    suicideRates['suicides'].div(suicideRates['population']).mul(100)
)

# Wide layout: years as rows, one column per country.
percentage = suicideRates[['suicidepercentage']].unstack(level=0)
# Years a country has no value for are filled with that country's own mean.
country_means = percentage.mean()
percentagefilled = percentage.fillna(country_means)

# Flip to one row per country with the years as columns, then average across years.
Top10 = percentagefilled.stack().unstack(level=0)
Top10['avgPerc'] = Top10.mean(axis=1)

# Ten countries with the highest average rate, shown as a bar chart.
Top10WASR = Top10.sort_values(by='avgPerc', ascending=False).head(10)
Top10WASR['avgPerc'].plot.bar(
    title='Top 10 Countries by Weighted Average Suicide Rate',
    xlabel='Country',
    ylabel='Average suicide rate from 1979 - 2015',
)