In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

Let's see what this data looks like.

In [None]:
# Load the master CSV from the Kaggle input directory and preview the first ten rows.
MASTER_CSV = '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'
suicide = pd.read_csv(MASTER_CSV)
suicide.head(10)

A quick first look: there seem to be a lot of null values in the HDI column. Let's see how many rows are available in total and how many values are missing in each column.

In [None]:
# Print each column's dtype and non-null count to see where values are missing.
suicide.info()

The ' gdp_for_year ($) ' column is stored as text (object dtype, with comma thousands separators) instead of as integers. We convert this column to a numeric dtype.

In [None]:
# Strip the thousands separators and convert GDP to a numeric dtype.
# `astype(str)` keeps the cell idempotent: re-running it after the column is
# already numeric no longer fails with "'int' object has no attribute 'replace'".
gdp_col = ' gdp_for_year ($) '
suicide[gdp_col] = pd.to_numeric(
    suicide[gdp_col].astype(str).str.replace(',', '', regex=False)
)
suicide[gdp_col]

The good news is that we have complete data for every column except 'HDI for year'. We may consider dropping it, depending on how important it is for predicting the result.

There are 6 categorical columns. We may consider transforming these columns into encoded classes. The same goes for some numerical columns such as year and suicides_no.

Let's explore the data further before deciding what to do with each feature.

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
suicide.describe()

Let's find out how many countries are in this data

In [None]:
# Number of distinct countries in the dataset.
suicide['country'].nunique()

This is a good dataset with a wide variety of countries. Those countries are:

In [None]:
# The distinct country names, in order of first appearance.
suicide['country'].unique()

There seem to be a lot of repeated values in the generation column. Shall we explore this feature a little more?

In [None]:
# Number of rows per generation label.
suicide['generation'].value_counts()

This becomes more interesting. I am intrigued to find out more about the differences between generations and how those differences affect the suicide rates. But before I do that, I should split my data into a train and a test set, so that my choice of models and the insights I draw are not biased by having seen the test set.

Use 20% of our data for testing. We choose a random state of 1 so that we have the same test set every time we rerun the code

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
train, test = train_test_split(suicide, test_size=0.2, random_state = 1)

Let's see if the function split the data the way we want it to be

In [None]:
# Share of rows that landed in the test set, in percent.
# Using len() avoids the deprecated positional `Series[0]` lookup on a
# label-indexed Series and does not depend on the first column's non-null count.
len(test) / len(suicide) * 100

The test set is 20% of the original data. Perfect.

Let's take a look at our train data

In [None]:
# Summary statistics for the numeric columns of the random training split.
train.describe()

In [None]:
# Preview the first ten rows of the random training split.
train.head(10)

Observe the distribution of each category in each column to make sure the test set is representative of the whole population. We don't want one category to have far more data than the others.

In [None]:
import seaborn as sns
# Count of test-set rows per sex.
# The column must be passed as `x=`: seaborn 0.12+ no longer accepts the data
# variable positionally.
sns.countplot(x='sex', data=test)

In [None]:
# Count of test-set rows per age group.
# The column must be passed as `x=`: seaborn 0.12+ no longer accepts the data
# variable positionally.
sns.countplot(x='age', data=test)

Looks like the distribution of sex and age are fairly equal 

Generation is quite an important factor in the suicide rate. We want to make sure that the generation mix in the test and train sets is representative of the whole population.

Observe the share of each generation relative to the other generations in the test set.

In [None]:
# Fraction of the random test set's rows that belong to each generation.
test['generation'].value_counts().div(len(test))

In [None]:
# Fraction of the full dataset's rows that belong to each generation.
suicide['generation'].value_counts().div(len(suicide))

The test set is quite a good representation of the whole dataset. But could we make it better?

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on 'generation' so that the train and test sets keep the same
# generation mix as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state = 1)
# split() yields *positional* row indices, so rows must be selected with .iloc.
# .loc only happened to work while the frame still had its default RangeIndex.
train_index, test_index = next(split.split(suicide, suicide['generation']))
strat_train = suicide.iloc[train_index]
strat_test = suicide.iloc[test_index]

Check our stratified test set

In [None]:
# Fraction of the stratified test set's rows that belong to each generation.
strat_test['generation'].value_counts().div(len(strat_test))

It seems that the new strat_test represents the population better than the random test set. We put everything into one table so we can see the difference more clearly.

In [None]:
def generation_proportions(test_set):
    """Return the share of rows in ``test_set`` that fall in each generation.

    The result is a Series indexed by generation label; each value is that
    label's ``value_counts()`` count divided by the total number of rows in
    ``test_set``.
    """
    n_rows = len(test_set)
    counts = test_set['generation'].value_counts()
    return counts.div(n_rows)
# Side-by-side generation shares for the full data and for each test split.
compare_props = pd.DataFrame({
    'Overall': generation_proportions(suicide),
    'Random': generation_proportions(test),
    'Stratified': generation_proportions(strat_test),
})
# Relative deviation (in percent) of each split's shares from the overall shares.
overall_share = compare_props['Overall']
for split_name in ('Random', 'Stratified'):
    err_col = '%err ' + split_name.lower()
    compare_props[err_col] = 100 * (compare_props[split_name] - overall_share) / overall_share

In [None]:
# Display the comparison table built in the previous cell.
compare_props

Our stratified test set represents the population significantly better.

Now it is time for us to explore the train data. Make a copy of the training set so that we do not harm the training set

In [None]:
# Explore a copy so that changes made during exploration do not modify strat_train.
suicide = strat_train.copy()

Observe the correlation between different features with the rates of suicide

In [None]:
# Correlation of every numeric feature with the suicide rate, strongest first.
# Restrict to numeric columns explicitly: on pandas >= 2.0 DataFrame.corr()
# raises on string columns instead of silently dropping them.
numeric_features = suicide.select_dtypes(include='number')
corr_mat = numeric_features.corr()['suicides/100k pop'].sort_values(ascending=False)
corr_mat

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Restrict to numeric columns explicitly: on pandas >= 2.0 DataFrame.corr()
# raises on string columns instead of silently dropping them.
sns.heatmap(suicide.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Scatter-plot matrix of the training data's variables.
sns.pairplot(suicide)

There seems to be no strong correlation between other features and suicide rates. 

In [None]:
# Preview the first ten rows of the exploration copy.
suicide.head(10)

In [None]:
import matplotlib.pyplot as plt

# Shorten the age labels; the zero-padded '05-14' makes the labels sort in age order.
age_labels = {
    '5-14 years': '05-14',
    '15-24 years': '15-24',
    '25-34 years': '25-34',
    '35-54 years': '35-54',
    '55-74 years': '55-74',
    '75+ years': '75+',
}
# Assign the result back instead of calling replace(inplace=True) on a column
# selection: that is chained assignment and is not guaranteed to update the frame.
suicide['age'] = suicide['age'].replace(age_labels)
sns.set_style('whitegrid')
# x / y must be keywords: seaborn 0.12+ rejects positional data variables.
sns.catplot(x='age', y='suicides/100k pop', kind='bar',
            data=suicide.sort_values(by='age'), hue='sex', palette='coolwarm')
# The six categories sit at integer positions 0..5; start the axis at -0.5 so
# the first age group's bars are not clipped.
plt.xlim(-0.5, 5.5)

It looks like the suicide rate is much higher for males than for females in every age range, and the gap gets bigger as the age range increases.

In [None]:
# Distribution of the suicide rate across the training rows.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot are
# the replacements) — confirm the installed seaborn version before upgrading.
sns.distplot(suicide['suicides/100k pop'])

Top 10 countries with highest suicide rate

In [None]:
# Mean suicide rate per country, highest first.
# Select the rate column before aggregating: on pandas >= 2.0 calling .mean()
# on the whole grouped frame raises because of the string columns, and the
# means of the other columns were discarded anyway.
countries = (
    suicide.groupby('country')['suicides/100k pop']
    .mean()
    .sort_values(ascending=False)
)
countries.head(10)

Use geopandas to visualize the distribution of suicide rates on the world map

In [None]:
import geopandas as gpd
import geoplot as gplt
#Create a variable holding the map of the world
# NOTE(review): gpd.datasets (and get_path) was removed in geopandas 1.0 — confirm the
# installed version, or read the Natural Earth file from a local path instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Draw the base map to check that it loaded.
world.plot()

In [None]:
#Merge the map data and our suicide data
# The inner join on the map's 'name' index keeps only countries whose names match
# exactly in both sources, so differently spelled country names are dropped.
merge = world.set_index('name').join(countries,how='inner')
merge.head()

In [None]:
# Summary statistics of the joined frame; the count row shows how many countries matched.
merge.describe()

It seems like we can just get access to 78 countries out of 101 countries from our map data. But good enough for us to visualize

Compare the suicide rate and population

In [None]:
# Two maps side by side: mean suicide rate (left) and population (right).
fig, (ax1,ax2) = plt.subplots(1,2, sharey=True,figsize=(20,20))
ax1.set_title('Suicide Rate')
ax2.set_title('Population')
merge.plot(column='suicides/100k pop',cmap='Reds',ax=ax1)
# 'pop_est' is presumably the population estimate shipped with the map data — TODO confirm.
merge.plot(column='pop_est',cmap='Reds',ax=ax2)

The countries with less population tend to be the countries with higher suicide rate. 

In [None]:
# Pairwise correlation between the mean suicide rate and the population estimate.
merge[['suicides/100k pop','pop_est']].corr()

Although the correlation is not really strong, its negative sign is consistent with our observation. However, this correlation should not be over-interpreted, since the suicide rates are collected from different years and the population of each country varies every year.

Look at the distribution of the year 

In [None]:
# Distribution of the 'year' column, i.e. how many rows the data contains per year.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot are
# the replacements) — confirm the installed seaborn version before upgrading.
sns.distplot(suicide['year'])

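There is an increase in the number of records from 1985 to 2000. Note that this plot shows how many rows we have per year, not the suicide rate itself, so the increase most likely reflects an improvement in the quality and coverage of data recording; whether suicide rates themselves rose (for example due to increasing stress as technology develops) would have to be checked by plotting the rate against the year.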

Now it is time to prepare the data for machine learning algorithms.

In [None]:
# Separate the target we want to predict from the training features.
# 'suicides/100k pop' is the target itself, and 'suicides_no' determines it
# together with 'population', so leaving either one in the features leaks the
# answer to the model. The original cell left the target in the feature frame.
# 'country-year' is dropped as before.
excluded_columns = ['suicides_no', 'suicides/100k pop', 'country-year']
suicide = strat_train.drop(excluded_columns, axis=1)
suicide_labels = strat_train['suicides/100k pop']

Next step is to clean our data.
Let's take a look at our data again

In [None]:
# Dtypes and non-null counts of the feature frame that will be preprocessed.
suicide.info()

From the information of the data, we identify two first things we could do to prepare the data for training:
1. Convert categorical data into numerical data
2. Fill in the missing data

Let's start with task 1. 

We will use LabelEncoder to transform text categories to integer categories.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every text (object-dtype) column is treated as a categorical attribute.
cat_attribs = suicide.select_dtypes(include='object')

le = LabelEncoder()

# The same encoder is refit on each column in turn, mapping its distinct
# strings to integer codes.
suicide_cat = cat_attribs.apply(le.fit_transform)

suicide_cat.head(10)


An ML algorithm will treat integers that are close together as more similar, which is not the case here. The second option is to use a one-hot encoder to turn our categorical columns into vectors of 0s and 1s.

In [None]:
# One-hot encode every categorical column; drop_first=True omits the first level of
# each column.
suicide_cat_dummies = pd.get_dummies(suicide, columns=cat_attribs.columns, drop_first=True )
suicide_cat_dummies

Since this method would significantly increase the dimensionality of our data, it could lead to over-fitting. We decide to preprocess the categorical data with LabelEncoder.

In [None]:
# Fraction of rows whose 'HDI for year' value is missing (count() skips nulls).
1 - suicide['HDI for year'].count()/len(suicide)

About 70% of the data in the 'HDI for year' column is missing. We cannot train a model on missing values, so we can either drop that feature entirely or fill in the missing data. Since this feature is important for training the model, we choose to fill in the missing data.

To decide which value to fill in, first observe the distribution of 'HDI for year'

In [None]:
# Distribution of the HDI values that are present (missing values removed first).
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot are
# the replacements) — confirm the installed seaborn version before upgrading.
sns.distplot(suicide['HDI for year'].dropna())

In [None]:
# Summary statistics of the HDI column only.
suicide.describe()['HDI for year']

Since the mean and the median are quite similar, we feel safe using the median as the fill value.

In [None]:
# Fill the missing HDI values with the column median (computed on the training
# features) and summarise the result; the original column is left untouched.
median = suicide['HDI for year'].median()
filled_HDI = suicide['HDI for year'].fillna(median)
filled_HDI.describe()

Our filled HDI has roughly the same mean, min, and max as our original column. We want to preserve as much as we can the information of the original data. This result is what we want.

Now look at the distribution of HDI again with the values filled in

In [None]:
# Distribution of HDI after median imputation.
# NOTE(review): distplot is deprecated since seaborn 0.11 (histplot/displot are
# the replacements) — confirm the installed seaborn version before upgrading.
sns.distplot(filled_HDI)

Since about 70% of our data is filled with the median, it makes sense that the majority of our data centers in the median value. 

Take care of our missing values using Imputer to see if we obtain the same result. Imputer becomes handy if we want to fill in missing values in different columns at once

In [None]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. The `Imputer` name stays bound
# so that later cells referring to it keep working on any version.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy='median')

# The imputer only applies to numerical columns, so the text columns are left out.
num_attribs = suicide[suicide.columns[suicide.dtypes != 'object']]

# fit_transform returns a plain array; wrap it back into a labelled DataFrame.
suicide_num = imputer.fit_transform(num_attribs)
suicide_num = pd.DataFrame(suicide_num, columns=num_attribs.columns)
suicide_num['HDI for year'].describe()



We got the same result for HDI column as we did earlier

Since many ML algorithms do not do well when features are on very different scales, we use StandardScaler to standardize our values.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the imputed numeric columns and wrap the resulting array back
# into a DataFrame labelled with the numeric column names.
scaler = StandardScaler()
scaled_suicide_num = pd.DataFrame(
    scaler.fit_transform(suicide_num),
    columns=suicide.columns[suicide.dtypes != 'object'],
)

scaled_suicide_num

As we can see from the dataframe, every one of our numerical columns is standardized. This helps the ML algorithms run more efficiently.

So far we have:
1. Preprocess numerical data with Imputer, StandardScaler. 
2. Preprocess categorical data with LabelEncoder

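We can combine the steps from 1 and 2 into two pipelines, then combine those pipelines using ColumnTransformer. Note that the categorical pipeline below uses OneHotEncoder rather than LabelEncoder: LabelEncoder is designed for target labels and cannot be used as a feature-transforming step inside a Pipeline.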

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Numeric columns: fill missing values with the median, then scale to unit
# variance (with_mean=False means the values are not centred).
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False))
])

# Categorical columns: one-hot encode (categories unseen during fit are ignored
# at transform time), then scale without centring.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False))
])

# Route each group of columns through its own pipeline and concatenate the results.
full_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, list(num_attribs.columns)),
    ('cat_pipeline', cat_pipeline, list(cat_attribs.columns)),
])



In [None]:
# Fit the preprocessing on the training features and transform them in one pass.
suicide_prepared = full_pipeline.fit_transform(suicide)
# Check which container type the transformer returned.
type(suicide_prepared)

Now it is time for selecting and training model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression and predict on the same training data it was fit on,
# so these predictions only measure training fit.
lr = LinearRegression()
lr.fit(suicide_prepared,suicide_labels)
lr_predictions = lr.predict(suicide_prepared)
lr_predictions




In [None]:
# Mean signed difference between the predictions and the true labels
# (positive and negative errors cancel each other out).
residuals = lr_predictions - suicide_labels.values
residuals.mean()

Use the Root Mean Squared Error (RMSE) for a more accurate evaluation

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the linear model on the training data.
lrmse = np.sqrt(mean_squared_error(suicide_labels,lr_predictions))
lrmse

Try DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree and predict on the same training data it was fit on.
dr = DecisionTreeRegressor(random_state=0)
dr.fit(suicide_prepared,suicide_labels)
dr_predictions = dr.predict(suicide_prepared)

# Root mean squared error of the tree on the training data.
drmse = np.sqrt(mean_squared_error(suicide_labels,dr_predictions))
drmse

We get really good results with both linear regressor and decision tree regressor using mean squared error. Decision tree regressor seems to be a better choice than linear regression. But this is not an accurate evaluation since we use one data to train and evaluate. We could instead use Cross-Validation.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer returns negative MSE,
# so negate it before taking the square root to get a per-fold RMSE.
scores_1 = cross_val_score(dr, suicide_prepared, suicide_labels, scoring = "neg_mean_squared_error", cv = 10)
tree_scores = np.sqrt(-scores_1)
tree_scores.mean()

In [None]:
# 10-fold cross-validation of the linear model; negative MSE is converted to RMSE per fold.
scores_2 = cross_val_score(lr, suicide_prepared, suicide_labels, scoring = "neg_mean_squared_error", cv = 10)
lr_scores = np.sqrt(-scores_2)
lr_scores.mean()

Now we have a better evaluation of our model. Now linear regression has about the same error score as the previous evaluation method while decision tree has much worse performance. This shows that the Decision Tree model is overfitting really badly.

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=30, random_state=42)

# 10-fold cross-validation of the random forest; negative MSE is converted to
# RMSE per fold. cross_val_score fits a fresh clone of the estimator on every
# fold, so the separate forest_reg.fit(...) on the full training set was
# redundant work and has been removed.
scores_3 = cross_val_score(forest_reg, suicide_prepared, suicide_labels, scoring = "neg_mean_squared_error", cv = 10)
rf_scores = np.sqrt(-scores_3)
rf_scores.mean()

RandomForestRegressor performs well, but not as well as LinearRegression. This suggests that the random forest regressor is also overfitting. We can choose better hyperparameters to reduce overfitting; this can be done using GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV

# 3 values of n_estimators x 5 values of max_features = 15 parameter combinations.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8, 10]}
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds: 15 combinations * 5 folds = 75 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True,
                          error_score=np.nan)
grid_search.fit(suicide_prepared, suicide_labels) 

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# The estimator configured with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Per-combination cross-validation results of the grid search.
cvres = grid_search.cv_results_

In [None]:
# Tabulate the grid-search results and convert each mean negative-MSE score
# into an RMSE, listing the best (lowest) first.
cvres_df = pd.DataFrame(cvres)
cvres_df["mean_score"] = np.sqrt(-cvres_df['mean_test_score'])
cvres_df[["mean_score","params"]].sort_values(by='mean_score')

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions to sample hyperparameters from; scipy's randint excludes
# the upper bound, so n_estimators is drawn from 1..199 and max_features from 1..9.
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=10),
    }

forest_reg = RandomForestRegressor(random_state=42)
# Try 10 sampled combinations with 5-fold cross-validation; random_state fixes the sampling.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(suicide_prepared, suicide_labels)

In [None]:
# A notebook cell only renders its last expression, so the bare
# `rnd_search.best_params_` in the middle of the cell was never shown.
# Print it explicitly.
print(rnd_search.best_params_)

# Tabulate the randomized-search results and convert each mean negative-MSE
# score into an RMSE, listing the best (lowest) first.
rnd_results = pd.DataFrame(rnd_search.cv_results_)
rnd_results['mean_score'] = np.sqrt(-rnd_results['mean_test_score'])
rnd_results[["mean_score","params"]].sort_values(by='mean_score')