In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the suicide-rates CSV into the main DataFrame `sd`
master_csv = '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'
sd = pd.read_csv(master_csv)

In [None]:
# Preview the first five rows of the table
sd.head(n=5)

In [None]:
# Drop the 2016 rows (the author considers that year's information wrong)
# .copy() makes the result an independent frame, so later in-place edits
# do not trigger pandas' SettingWithCopyWarning
sd = sd[sd.year != 2016].copy()

In [None]:
# List the column names to check whether any of them contain stray spaces
sd.columns.tolist()

As you can see, the column ` gdp_for_year ($) ` has two extra spaces (one leading and one trailing). These extra characters cause problems when the column is referenced by name.

In [None]:
# Rename columns to shorter names that are easier to use in code
# (the GDP-per-year header carries a leading and a trailing space)
# Reassign instead of inplace=True so the cell does not mutate, in place,
# a frame that was derived by filtering another one
sd = sd.rename(columns={"suicides_no": "suicides", "suicides/100k pop": "s/100k",
                        " gdp_for_year ($) ": "gdp_for_year", "gdp_per_capita ($)": "gdp"})

In [None]:
# Column names as a plain list
sd.columns.tolist()

In [None]:
# Drop the columns that are not used in the rest of the analysis
# Reassign instead of inplace=True to avoid mutating the frame in place
sd = sd.drop(columns=['country-year', 'HDI for year', 'gdp_for_year'])

In [None]:
# Display the frame as it stands at this point in the notebook
sd

In [None]:
# Number of distinct countries in the data
sd['country'].drop_duplicates().shape[0]

In [None]:
# Summary statistics of the frame's columns
sd.describe()

In [None]:
# Total suicides by sex (male vs. female)
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t1 = pd.pivot_table(sd, values='suicides', index=['country'], columns=['sex'], aggfunc='sum')
plt.figure(figsize=(8, 7))
splot = t1[['male','female']].sum().plot.bar(color=['red', 'blue'])
# Write each bar's value just above its top edge
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')

In [None]:
# Share of suicides by sex, drawn as a pie chart with percentage labels
sex_totals = t1[['male','female']].sum()
splot = sex_totals.plot.pie(autopct='%1.1f%%')
plt.ylabel("Sex")

In [None]:
# Distinct values of the generation column
sd['generation'].unique()

In [None]:
# Total suicides by generation
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t2 = pd.pivot_table(sd, values='suicides', index=['country'], columns=['generation'], aggfunc='sum')
plt.figure(figsize=(8, 7))
# Six generations need six colors; with only five the last bar reuses the first color
splot = t2[['G.I. Generation', 'Silent','Boomers', 'Generation X', 'Millenials', 'Generation Z']].sum().plot.bar(color=['green','yellow', 'red', 'blue', 'brown', 'purple'])
# Write each bar's value just above its top edge
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')

In [None]:
# Distinct values of the age-range column
sd['age'].unique()

In [None]:
# Total suicides by age range
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t3 = pd.pivot_table(sd, values='suicides', index=['country'], columns=['age'], aggfunc='sum')
plt.figure(figsize=(8, 7))
# Six age ranges need six colors; with only five the last bar reuses the first color
splot = t3[['5-14 years', '15-24 years','25-34 years', '35-54 years', '55-74 years', '75+ years']].sum().plot.bar(color=['green','yellow', 'red', 'blue', 'brown', 'purple'])
# Write each bar's value just above its top edge
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')

In [None]:
# Share of suicides by age range, as a pie chart
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t3 = pd.pivot_table(sd, values='suicides', index=['country'], columns=['age'], aggfunc='sum')
plt.figure(figsize=(8, 7))
# explode pulls the '35-54 years' slice (fourth entry) out of the pie
splot = t3[['5-14 years', '15-24 years','25-34 years', '35-54 years', '55-74 years', '75+ years']]\
.sum().plot.pie(autopct='%1.1f%%', explode=(0, 0, 0, 0.1, 0, 0) )
plt.ylabel("Age")

In [None]:
# Which country has the most suicides in total?
# Sum the suicides of every row per country first; comparing single rows
# only finds the largest individual record, not the largest country total
sd.groupby('country')['suicides'].sum().nlargest(1)

In [None]:
# Top 20 countries by summed suicide rate (suicides per 100k population)
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
# NOTE(review): this adds up per-100k rates across all rows of a country, so a
# country with more rows accumulates more terms — consider a mean instead
t1 = pd.pivot_table(sd, values='s/100k', index=['country'], columns=['sex'], aggfunc='sum')
t1['Total'] = t1['female'] + t1['male']
t1.sort_values(by = 'Total',ascending = False).head(20)

In [None]:
# New table with the number of suicides per year (rows) and age range (columns)
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t2 = pd.pivot_table(sd, values='suicides', index=['year'], columns=['age'], aggfunc='sum')

In [None]:
# Display the table currently bound to t2
t2

In [None]:
# Studying young people: keep only the age ranges below 35 years
# Selecting the columns (instead of an in-place drop) lets this cell be re-run
# without a KeyError; the order matches what the drop left behind
t2 = t2[['15-24 years', '25-34 years', '5-14 years']]

In [None]:
# Horizontal bar chart of every row of t2, used to spot the top years
t2.plot.barh(figsize=(20, 10))

In [None]:
# Keep the rows at positions 10-18 of t2, the range the author picked as having the most suicides
# NOTE(review): presumably the years 1995-2003 if the year index starts at 1985 — confirm
sub_t2 = t2.iloc[10:19]

In [None]:
# Horizontal bar chart of the selected rows only
sub_t2.plot.barh(figsize=(20, 10))

In [None]:
# Number of suicides per year and sex, drawn as a grouped bar chart
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
pd.pivot_table(sd, values='suicides', index=['year'], columns=['sex'], aggfunc='sum').plot.bar(figsize=(20,10))


In [None]:
# Suicides per year (rows) and sex (columns)
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
ct = pd.pivot_table(sd, values='suicides', index=['year'], columns=['sex'], aggfunc='sum')
# Male suicides in 1987 as a percentage of male suicides in 2000
# Label-based lookup replaces the positional iloc[2][1] / iloc[15][1]: it states the
# intended years and sex, and avoids the deprecated integer-position access on a
# Series whose index holds string labels
100 * (ct.loc[1987, 'male'] / ct.loc[2000, 'male'])

In [None]:
# Distinct values of the generation column
sd['generation'].unique()

In [None]:
# Top 20 countries by summed suicide rate among Millenials
# Values are suicides per 100k population, added up per country and generation
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t3 = pd.pivot_table(sd, values='s/100k', index=['country'], columns=['generation'], aggfunc='sum')
t3.sort_values(by = 'Millenials',ascending = False).head(20)

In [None]:
# Top 20 countries by summed suicide rate in Generation X
# Values are suicides per 100k population, added up per country and generation
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t4 = pd.pivot_table(sd, values='s/100k', index=['country'], columns=['generation'], aggfunc='sum')
t4.sort_values(by = 'Generation X',ascending = False).head(20)

In [None]:
# Top 20 countries by summed suicide rate in Generation Z
# Values are suicides per 100k population, added up per country and generation
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t5 = pd.pivot_table(sd, values='s/100k', index=['country'], columns=['generation'], aggfunc='sum')
t5.sort_values(by = 'Generation Z',ascending = False).head(20)

In [None]:
# Subset of the data containing only the rows for Lithuania
is_lithuania = sd['country'] == 'Lithuania'
lit = sd[is_lithuania]
lit

In [None]:
# Joint plot with a regression fit: GDP (x) against number of suicides (y) for Lithuania
sns.jointplot(data=lit, x='gdp', y='suicides', kind='reg')

In [None]:
# Total suicides in Lithuania by age range
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
tl = pd.pivot_table(lit, values='suicides', index=['country'], columns=['age'], aggfunc='sum')
plt.figure(figsize=(8, 7))
# Six age ranges need six colors; with only five the last bar reuses the first color
tl_o = tl[['5-14 years', '15-24 years','25-34 years', '35-54 years', '55-74 years', '75+ years']].sum().plot.bar(color=['green','yellow', 'red', 'blue', 'brown', 'purple'])
# Write each bar's value just above its top edge
for p in tl_o.patches:
    tl_o.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')

In [None]:
# Total suicides in Lithuania by generation
# aggfunc is the string 'sum': passing np.sum emits a FutureWarning in pandas >= 2.1
t2 = pd.pivot_table(lit, values='suicides', index=['country'], columns=['generation'], aggfunc='sum')
plt.figure(figsize=(8, 7))
# Six generations need six colors; with only five the last bar reuses the first color
splot = t2[['G.I. Generation', 'Silent','Boomers', 'Generation X', 'Millenials', 'Generation Z']].sum().plot.bar(color=['green','yellow', 'red', 'blue', 'brown', 'purple'])
# Write each bar's value just above its top edge
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')

In [None]:
# Per-country totals of the numeric columns
# numeric_only=True: since pandas 2.0 a plain .sum() no longer skips non-numeric
# columns, so it would try to add up the text columns (sex, age, generation) as well
sd1 = sd.groupby(['country'], as_index = False).sum(numeric_only=True)
sd1

In [None]:
!pip install chart_studio

In [None]:
# World map (choropleth) coloured by each country's total number of suicides
import chart_studio.plotly as py  # NOTE(review): `py` is not used in this cell
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Initialise plotly's offline notebook mode before calling iplot(); it was
# imported for this purpose but never called
init_notebook_mode(connected=True)

layout = dict(title='Number of suicides', geo=dict(showframe=False, projection={'type': 'natural earth'}))

# Countries are matched by name; z supplies the value that drives the colour scale
data = go.Choropleth(locations=sd1['country'], 
                     locationmode='country names', 
                     z=sd1['suicides'], 
                     colorscale='Viridis', 
                     colorbar={'title': 'Suicides'})

fig = go.Figure(data=[data], layout=layout)

iplot(fig)

In [None]:
# Show the column index of sd (used below to pick the target and the features)
sd.columns

In [None]:
# Regression target: the number of suicides in each row
y = sd.loc[:, 'suicides']

In [None]:
# Regression features: GDP and population
feature_columns = ['gdp', 'population']
X = sd[feature_columns]

In [None]:
# Create a test data
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=45)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression estimator (not fitted yet)
lm = LinearRegression()

In [None]:
# Fit the model on the training split
lm.fit(X=X_train, y=y_train)

In [None]:
# The intercept of the fitted model (the label previously said "Coefficients")
print('Intercept: \n', lm.intercept_)

In [None]:
# The coefficients of the fitted model
print('Coefficients: \n', lm.coef_)

In [None]:
# Table pairing each feature column of X with its fitted coefficient
cdf = pd.DataFrame(data=lm.coef_, index=X.columns, columns=['Coeff'])
cdf.head()

In [None]:
# Predict on the held-out test split (X_test), which was not used for fitting
# (the model was fitted on X_train)
predictions = lm.predict( X_test)

In [None]:
# Scatter of the true test values against the model's predictions
ax = plt.gca()
ax.scatter(y_test, predictions)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')

In [None]:
# Error metrics of the predictions on the test split:
# mean absolute error, mean squared error and its square root
from sklearn import metrics
import numpy as np

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Distribution of the residuals (actual minus predicted values)
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the equivalent density histogram with a KDE curve
sns.histplot(y_test - predictions, bins=40, kde=True, stat='density');