In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

## Intro

Here is my first attempt at a Kaggle Notebook, while figuring out the features of the platform.
Below is an analysis of the suicide data set, along with a "% of happiness" score, where I try to find some correlations (if any).

#### [1. Cleaning the data](#clean)
#### [2. Plotting statistics](#plot)
#### [3. Analyzing data](#analyze)
#### [4. Happiness dataset](#happy)

<a name="clean"></a>
## 1. Cleaning the data 
We start with a basic look over the dataset, to see what we have to work with.

In [None]:
# Load the raw suicide statistics from the Kaggle input directory.
suicide_csv_path = '../input/suicide-rates-overview-1985-to-2016/master.csv'
data = pd.read_csv(suicide_csv_path)

In [None]:
# First look: preview the first five rows to get a feel for the columns.
data.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
data.info()

First, we will begin data cleaning by examining the columns, filling in missing values, transforming values, etc

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Drop 'HDI for year' (too many missing values) and 'country-year' (redundant info).
# `data` is rebound instead of mutated with inplace=True, and errors='ignore' is
# passed, so the cell is idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
data = data.drop(columns=['HDI for year', 'country-year'], errors='ignore')

In [None]:
# Re-count the missing values per column after the clean-up step.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

The data seems OK, so we can start plotting some of it.


<a name="plot"></a>
## 2. Plotting statistics

Here I will try to use some seaborn style graphs in order to analyze the data.

In [None]:
# Total suicides per country over the whole dataset, keeping the five largest.
# The aggregation is computed once and reused for both the bar chart and the
# `top_5` list (it was previously evaluated twice with identical arguments).
top_countries = (
    data.groupby('country')['suicides_no']
    .sum()
    .sort_values(ascending=False)
    .head()
    .reset_index()
)
# Names of the five countries with the highest totals; a later cell uses this
# list to filter the per-year trend plot.
top_5 = top_countries['country'].tolist()

plt.figure(figsize=(20,10))
sns.barplot(x='country', y='suicides_no', data=top_countries)
plt.ylabel('Total suicides')
plt.xlabel('Country')
plt.title('Countries with the most suicides from 1985 to 2016')
sns.despine()

In [None]:
# Total suicides per sex, largest first. No .head() here: this is a full
# distribution, not a top-N chart, so every category must be shown.
totals_by_sex = (
    data.groupby('sex')['suicides_no']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

plt.figure(figsize=(20,10))
sns.barplot(x='sex', y='suicides_no', data=totals_by_sex)
plt.ylabel('Total suicides')
# The axis shows the 'sex' column, so label it accordingly (was 'Genre').
plt.xlabel('Sex')
plt.title('Distribution by sex from 1985 to 2016')
sns.despine()

In [None]:
# Total suicides per age group, largest first. The former .head() kept only the
# five largest groups, silently dropping any further age group from a chart that
# is meant to show the whole distribution.
totals_by_age = (
    data.groupby('age')['suicides_no']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

plt.figure(figsize=(20,10))
sns.barplot(x='age', y='suicides_no', data=totals_by_age)
plt.ylabel('Total suicides')
plt.xlabel('Age group')
# Year range aligned with the other charts (1985, not 1986).
plt.title('Distribution by age group from 1985 to 2016')
sns.despine()

In [None]:
# Yearly totals across all countries. The frame stays in groupby (year) order:
# sorting it by total first was dead work, because lineplot orders the points
# along the x axis itself (sort=True by default).
yearly_totals = data.groupby('year')['suicides_no'].sum().reset_index()

plt.figure(figsize=(20,10))
sns.lineplot(x='year', y='suicides_no', data=yearly_totals)
plt.ylabel('Total suicides')
plt.xlabel('Year')
plt.title('Evolution of total suicides by year')
sns.despine()

In [None]:

# Per-year suicide totals for the five countries held in `top_5`, one line per country.
top5_by_year = (
    data[data['country'].isin(top_5)]
    .groupby(['country','year'])['suicides_no']
    .sum()
    .reset_index()
)

plt.figure(figsize=(20,10))
sns.lineplot(x='year', y='suicides_no', hue='country', data=top5_by_year)
plt.ylabel('Total suicides')
plt.xlabel('Year')
sns.despine()
# The y axis is the absolute count (suicides_no), not a rate, so the title says so.
plt.title('Evolution of the number of suicides by year for the 5 countries with most suicides');

The countries with the highest number of suicides are a mix of first-world countries (which you wouldn't expect) — France, the USA and Japan — and two Slavic countries, Russia and Ukraine. These results are somewhat counter-intuitive, because one assumes that a higher standard of living means fewer reasons to commit suicide. Let's take a closer look at the data.

<a name="analyze"></a>
## 3. Analyzing data

The plots left us wondering what the connection (if any) is between the countries with the most suicides. Because our dataset provides some information on the GDP per capita of each nation, we can look into that further.

In [None]:

# One row per (country, year): mean GDP per capita and total suicides.
# A single groupby with per-column aggregations replaces the former
# "aggregate twice, then merge on the group keys" construction; the resulting
# columns and row order are the same.
d = (
    data.groupby(['country','year'])
    .agg({'gdp_per_capita ($)': 'mean', 'suicides_no': 'sum'})
    .reset_index()
)

d.head()

In [None]:
# Correlation matrix between GDP per capita and the yearly suicide count.
d.loc[:, ['gdp_per_capita ($)', 'suicides_no']].corr()

In [None]:
# Cannot use sns.lmplot here: there seem to be too many hue levels for the
# regression to work.

plt.figure(figsize=(20,10))
# One colour per country; there are too many countries for a readable legend,
# so it is switched off with legend=False instead of being overwritten
# afterwards with plt.legend('').
sns.scatterplot(x='gdp_per_capita ($)', y='suicides_no', hue='country', data=d, legend=False)
sns.despine()


Money doesn't bring happiness, and this is what the data shows us too: there is no clear link between an increase in GDP per capita and a decrease in the number of suicides.
However, above a value of approx. 65,000 $ per capita, the total suicides are no longer that spread out; they almost form a straight line. Let's find those countries.


In [None]:
# Zoom in on the country-years whose GDP per capita exceeds 65,000 $.
high_gdp = d.loc[d['gdp_per_capita ($)'] > 65000]

plt.figure(figsize=(20, 10))
sns.scatterplot(
    data=high_gdp,
    x='gdp_per_capita ($)',
    y='suicides_no',
    hue='country',
    s=50,
)
# Fixed y range so the picture can be compared with the full scatter plot.
plt.ylim(0, 60000)
sns.despine()


In [None]:
# Correlation restricted to the high-GDP subset (GDP per capita above 65,000 $).
above_65k = d['gdp_per_capita ($)'] > 65000
d.loc[above_65k, ['gdp_per_capita ($)', 'suicides_no']].corr()


Keeping the same y-axis limit as the first plot, it is clear that these countries have some of the lowest suicide counts. Luxembourg and San Marino are micro-nations, so their populations are too small a sample to draw conclusions about the relationship between GDP and total suicides; the other countries in this group also have low suicide numbers.

Having realised this, it is obviously better to look at the correlation with the number of suicides per 100k citizens (suicides/100k pop) than with the raw number of suicides.

In [None]:
# One row per (country, year): mean GDP per capita and the summed
# 'suicides/100k pop' column, built with a single groupby instead of merging
# two separate aggregations on the group keys.
# NOTE(review): each row of `data` appears to be one sex/age group, so summing
# the per-group rates is not a population-weighted national rate - TODO confirm
# and consider recomputing the rate from suicide counts and population figures.
d_100k = (
    data.groupby(['country','year'])
    .agg({'gdp_per_capita ($)': 'mean', 'suicides/100k pop': 'sum'})
    .reset_index()
)

plt.figure(figsize=(20,10))
# Too many countries for a readable legend: disable it with legend=False
# instead of overwriting it afterwards with plt.legend('').
sns.scatterplot(x='gdp_per_capita ($)', y='suicides/100k pop', hue='country', data=d_100k, legend=False)
sns.despine()

In [None]:
# Correlation matrix between GDP per capita and the aggregated per-100k figure.
d_100k.loc[:, ['gdp_per_capita ($)', 'suicides/100k pop']].corr()

Even though this paints a different picture, and there are obviously more cases in under-developed countries, the correlation is almost 0, so, again, no conclusion can be drawn.


<a name="happy"></a>
## 4. Happiness dataset

But does happiness mean money? Or does unhappiness mean suicide? We will load a new data set (the original can be found here: https://ourworldindata.org/happiness-and-life-satisfaction).

In [None]:
# Load the "share of people who say they are happy" table from the Kaggle input directory.
happy_csv_path = '../input/share-of-people-who-say-they-are-happy/share-of-people-who-say-they-are-happy.csv'
happy = pd.read_csv(happy_csv_path)

In [None]:
# Preview the first five rows of the happiness table.
happy.head(n=5)

In [None]:
# Show the raw column labels; the cleaning step below renames them.
happy.columns

In [None]:
# Data cleaning: drop the 'Code' column and rename the remaining columns so the
# frame can be merged with the suicide data on ['country', 'year'].
# `happy` is rebound instead of mutated with inplace=True, and errors='ignore'
# is passed, so re-running the cell no longer raises KeyError once 'Code' is gone.
happy = (
    happy
    .drop(columns='Code', errors='ignore')
    .rename(columns={'Entity':'country','Year':'year',' (%)':'happy'})
)
# Use the long form of the country name for Russia - presumably the spelling
# used in the suicide data; verify that no other country names differ.
happy.loc[happy['country']=='Russia','country'] = "Russian Federation"

In [None]:
# Inner-join the suicide/GDP frame with the happiness scores on country and
# year; only country-years present in both frames are kept.
new_d = d.merge(happy, how='inner', on=['country', 'year'])

The happiness data, however, is missing a lot of years. We will not impute the missing values for now; we'll just see how well happiness and the number of suicides correlate.

In [None]:
# Correlation matrix between the yearly suicide count and the happiness share.
new_d.loc[:, ['suicides_no', 'happy']].corr()

Surprisingly, there is no correlation here either. Let's check whether this still holds by taking the countries with the most suicides and looking at their happiness data.

In [None]:
# Names of the five countries with the highest total suicide count.
country_totals = data.groupby('country')['suicides_no'].sum()
most_suicides = country_totals.sort_values(ascending=False).head().index.tolist()

In [None]:
# Happiness share vs. yearly suicide count, restricted to the countries in
# `most_suicides`.
most_suicides_happy = new_d[new_d['country'].isin(most_suicides)]

plt.figure(figsize=(20,10))
# Only a handful of countries are plotted, so the legend is kept: without it
# the hue colours cannot be mapped back to a country (it was previously
# blanked with plt.legend('')).
sns.scatterplot(x='happy', y='suicides_no', hue='country', data=most_suicides_happy, s=50)
plt.xlabel('Share of people who say they are happy (%)')
plt.ylabel('Total suicides')
plt.title('Happiness vs. number of suicides for the countries with the most suicides')
sns.despine()


So neither happiness nor money seems to be related to the number of suicides. Are those two related to each other, though?

In [None]:
# Correlation matrix between GDP per capita and the happiness share.
new_d.loc[:, ['gdp_per_capita ($)', 'happy']].corr()

In [None]:
# It appears so, more or less... let's plot it.

plt.figure(figsize=(20,10))
# Too many countries for a readable legend: disable it with legend=False
# instead of overwriting it afterwards with plt.legend('').
sns.scatterplot(x='happy', y='gdp_per_capita ($)', hue='country', data=new_d, legend=False)
sns.despine()


## Conclusion

From this data we cannot determine if a factor such as happiness or wealth has any relation with the tendency to suicide, so further analysis with other data-sets is needed.