In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair rather than the raw strings: string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
# The regex keeps only the leading digits, which tolerates suffixes such as
# "0.20rc1" or "1.3.dev0".
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version is not None and tuple(map(int, _sk_version.groups())) >= (0, 20)

# Common imports
import numpy as np
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns #for better and easier plots

#plotting directly without needing to call plot.show()
%matplotlib inline 

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
# Load the dataset from the CSV file into the `data` frame used by every later cell.
# NOTE(review): per the pandas docs, parse_dates=True only asks read_csv to try
# parsing the *index* as dates; no index_col is given here, so confirm it is needed.
data = pd.read_csv("../input/master.csv", parse_dates=True)

In [None]:
data.head()  # preview the first rows of the loaded frame

In [None]:
print(data.shape)  # (number of rows, number of columns) of the loaded frame

In [None]:
# Sum the suicide counts per year and draw them as a line chart so that
# trends over time are easy to spot.
yearly_totals = data.groupby("year")["suicides_no"].sum()
yearly_totals.plot()
plt.title("Distribution of suicides by year", fontsize=20)

In [None]:
# Use a countplot grouped by country to see how many records each country
# contributes to the dataset.
# NOTE: countplot tallies rows; it does not sum `suicides_no`. The sort below
# only influences the order in which countries appear on the axis
# (presumably order of first appearance; verify against the seaborn docs).

sns.set(rc={'figure.figsize':(10,20)})  # tall canvas: one bar per country
ax = sns.countplot(y="country", data=data.sort_values(ascending=False, by="suicides_no"))
# `country` is mapped to y so the bars are horizontal and the names stay readable
plt.yticks(fontsize=13)  # enlarge the country labels (no rotation is applied)
# The title states what is actually plotted: a row count, not a suicide rate.
plt.title("Number of records by country", fontsize=20)

**This gives a rough idea of the distribution. Let's plot a bar chart sorted by value to see the countries with the highest number of suicides.**

In [None]:
sns.set(rc={'figure.figsize':(22,4)})  # wide, short canvas: one bar per country
# Total number of suicides per country over the whole dataset.
by_country = data.groupby("country")['suicides_no'].sum()
# Keep only the 50 largest totals so the chart stays legible.
top_countries = by_country.sort_values(ascending=False).head(50)
top_countries.plot(kind='bar')
plt.xticks(fontsize=15)
plt.title("Distribution of suicides by country", fontsize=20)



**As can be seen, the top 3 countries by total number of suicides are Russia, the United States and Japan. It would be informative to explore this in more detail.**

Before going further, let's keep looking for correlations in our data to build some intuition.

In [None]:
# Configure the canvas BEFORE drawing: a figure size set after the plot exists
# would only affect later figures. The result of sns.set() is not assigned to
# `ax`, so `ax` keeps pointing at the Axes returned by countplot.
sns.set(rc={'figure.figsize':(10,4)})
# Count the records in each age group, split by generation (hue).
ax = sns.countplot(x="age", hue="generation", data=data)
plt.xticks(rotation=45, fontsize=13)  # rotate the labels to keep them readable
# countplot tallies rows, so the title names a count rather than a rate.
plt.title("Number of records by age group and generation", fontsize=15)

In [None]:
# One scatter panel per generation, points coloured by sex:
# population on x against number of suicides on y.
g = sns.FacetGrid(data=data, col="generation", hue="sex")
g.map(plt.scatter, "population", "suicides_no", alpha=0.7)
g.add_legend()

In [None]:
# One scatter panel per generation, points coloured by sex:
# HDI for the year on x against number of suicides on y.
g = sns.FacetGrid(data=data, col="generation", hue="sex")
g.map(plt.scatter, "HDI for year", "suicides_no", alpha=0.7)
g.add_legend()

**As the charts above suggest, age and generation, as well as HDI, appear to be correlated with the number of suicides.**

# Pearson's correlation might give us some more insights
Let's take a look.

In [None]:
# Pairwise correlation matrix of the numeric columns (DataFrame.corr defaults to Pearson).
# Non-numeric columns are filtered out explicitly: older pandas dropped them
# silently inside DataFrame.corr(), but pandas >= 2.0 raises on them instead.
# select_dtypes works the same way across pandas versions.
corr = data.select_dtypes(include=["number", "bool"]).corr()
# Correlation of every numeric column with the suicide counts, strongest positive first.
corr["suicides_no"].sort_values(ascending=False)

**Population has the highest correlation with the number of suicides according to Pearson's correlation. GDP is weakly correlated, and year is negatively correlated.**

In [None]:
corr['HDI for year'].sort_values(ascending=False)  # correlations of each column in `corr` with HDI, highest first

**GDP per capita has the highest correlation**

### The following function will help us whenever we need to check for null entries

In [None]:
# Reports only the columns that contain null values,
# expressed as the percentage of null entries per column.
def missing_values_calculate(trainset: pd.DataFrame, top_n: int = 30) -> pd.DataFrame:
    """Summarise the share of missing values per column.

    Parameters
    ----------
    trainset : pd.DataFrame
        Frame to inspect.
    top_n : int, default 30
        Maximum number of columns to report.

    Returns
    -------
    pd.DataFrame
        One row per column that has at least one null entry, indexed by the
        column label, with a single ``Ratio`` column holding the percentage
        (0-100) of null entries. Rows are sorted by ``Ratio`` in descending
        order and truncated to ``top_n`` rows. An empty frame yields an empty
        result.
    """
    # Mean of the boolean null mask == fraction of nulls per column.
    nulldata = trainset.isnull().mean() * 100
    # Positional boolean filter instead of a label-based drop: a label-based
    # drop would also discard a column that HAS nulls whenever it shares its
    # label with a null-free column. `> 0` also discards the NaN ratios that
    # a zero-row frame produces.
    nulldata = nulldata[nulldata > 0].sort_values(ascending=False)
    return nulldata.to_frame(name="Ratio").head(top_n)

In [None]:
# Show the percentage of missing values for each column of `data` that has any.
missing_values_calculate(data)

# As expected, the Human Development Index (HDI) is the only column with null values. We are going to create a simple model to fill them in
Let's plot suicides_no by HDI for year and see what it shows us.

In [None]:
# Total number of suicides for each distinct HDI value, drawn as a line chart.
data.groupby("HDI for year")['suicides_no'].sum().plot()
# Label the figure so that it stands on its own, like the other plots here.
plt.ylabel("suicides_no (sum)")
plt.title("Total suicides by HDI for year", fontsize=20)

**I wasn't expecting this at all: the closer a country's HDI is to 1, the higher the number of suicides — remember that HDI is the index of human development. Notice, though, that after the peak around 0.94 it decreases steeply. Let's plot which countries have the highest HDI.**

In [None]:
sns.set(rc={'figure.figsize':(22,4)})  # wide, short canvas for the bar chart
# Average HDI of each country over the available years.
mean_hdi = data.groupby('country')['HDI for year'].mean()
# Bar chart of the 20 countries with the highest average HDI.
mean_hdi.sort_values(ascending=False).head(20).plot(kind='bar')
plt.xticks(fontsize=15)
plt.title("TOP 20 countries by Human development index", fontsize=20)

**Now, let's group by country and HDI and look at the number of suicides.**

In [None]:
sns.set(rc={'figure.figsize':(22,4)})  # wide, short canvas for the bar chart
# Round HDI to 2 decimals only for the grouping key, which keeps the tick
# labels readable. The rounded values are NOT written back into `data`, so the
# original HDI precision stays available to later cells and this cell can be
# re-run without changing any shared state.
hdi_rounded = data["HDI for year"].round(2)
country_hdi_totals = data.groupby(["country", hdi_rounded])["suicides_no"].sum()
# Show the 80 (country, HDI) pairs with the largest suicide totals.
country_hdi_totals.sort_values(ascending=False).head(80).plot(kind="bar", stacked=True)
plt.xticks(fontsize=15)
plt.title("Distribution of suicides by country and HDI", fontsize=20)

**Some countries, despite having a higher HDI, are shown to have a higher number of suicides.**

In [None]:
##to be continued