In [None]:
#Relationship between Suicide Rate and Happiness Report Factors of the Countries
'''The purpose of this analysis is to find out whether there is a relationship between the suicide rate of a country and its happiness score, GDP,
family (social support), health (life expectancy), freedom, trust (government corruption) and generosity factors, all of which can be found in the
World Happiness Report.'''

In [None]:
#import the libraries  
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from scipy import stats

In [None]:
#import suicide rates dataset 
suicide_rates_GPD = pd.read_csv('../input/gdp-per-capita-and-suicide-rates/Dataset.csv')

In [None]:
#explore the dataset
#see the first 5 rows
suicide_rates_GPD.head()

In [None]:
#see the number of columns & rows 
suicide_rates_GPD.shape

In [None]:
#see details about the columns
suicide_rates_GPD.info()

In [None]:
#see if there is missing value
suicide_rates_GPD.isnull().sum()

In [None]:
#clean the data
#remove the unnecessary columns 
suicide_rates_GPD.drop(['Country Code', 'GDP_PerCapita_2000 US$', 'SuicideRate_2000', 'GDP_PerCapita_2005 US$', 'SuicideRate_2005', 'GDP_PerCapita_2010 US$', 'SuicideRate_2010', 'GDP_PerCapita_2015 US$', 'GDP_PerCapita_2016 US$'], axis=1, inplace=True)
#I removed the columns that contains GDP for 2015 & 2016, because they have missing values, and the happiness report have this data 

In [None]:
#make the country name columns the index so we can join the datasets after
suicide_rates_GPD.set_index('CountryName', inplace=True)

In [None]:
suicide_rates_GPD.head()

In [None]:
#we need to split this dataset to two datasets, one for 2015, and the other for 2016
suicide_rates_2015 = pd.DataFrame(suicide_rates_GPD.iloc[:, 0:1])
suicide_rates_2015.head()

In [None]:
suicide_rates_2016 = pd.DataFrame(suicide_rates_GPD.loc[:, ['SuicideRate_2016']])
suicide_rates_2016.head()

In [None]:
#make sure these two datasets are clean and there is no missing value
suicide_rates_2015.isnull().sum()

In [None]:
suicide_rates_2016.isnull().sum()

In [None]:
#let's apply some analysis on suicide rates for 2015 & 2016
#apply boxplot to the dataset to explore the min, max, median, & outliers
suicide_rates_2015.boxplot()

In [None]:
#the boxplot shows that the top 5 countries are outliers, which means that the top 5 countries have suicide rate that differ so much from the rest of the world
#let's see what are these countries
top5_counrties_2015 = suicide_rates_2015.sort_values(by='SuicideRate_2015', ascending=False).head(5)
top5_counrties_2015

In [None]:
#interesting that 4 of the top 5 countries are in Europe (Kazakhstan locates in Eurasia)

In [None]:
#Let's analyze the suicide rates for 2016 as we did with the suicide rates for 2015 data
#apply boxplot to the dataset to explore the min, max, median, & outliers
suicide_rates_2016.boxplot()

In [None]:
#we can see there was a slight decline in the number of outliers (from 5 to 4), in the max rates (in 2015 the max rate was almost 35 & in 2016 the max rate was almost 32)

In [None]:
#let's see the top 5 countries for 2016
top5_counrties_2016 = suicide_rates_2016.sort_values(by='SuicideRate_2016', ascending=False).head(5)
top5_counrties_2016

In [None]:
#besides the slight decline in suicide rates for the top 5 countries in 2016, Kazakhstan was not in the top 5 countries
#instead Suriname took the 5th place, & as a result, there were 3 counrties from Eurpoe, and 2 from South America

In [None]:
#we finished cleaning and suicide rate datasets, let's move to world happiness report datasets
#import world happiness report 2015 dataset
World_Happiness_Report_2015= pd.read_csv('../input/world-happiness/2015.csv')

In [None]:
#explore the dataset
#see the first 5 rows
World_Happiness_Report_2015.head()

In [None]:
#see the number of columns & rows 
World_Happiness_Report_2015.shape

In [None]:
#see details about the columns
World_Happiness_Report_2015.info()

In [None]:
#see if there is missing value
World_Happiness_Report_2015.isnull().sum()

In [None]:
#clean the data
#remove the unnecessary columns 
World_Happiness_Report_2015.drop(['Region','Happiness Rank', 'Standard Error', 'Dystopia Residual'], axis=1, inplace=True)

In [None]:
#make the country name columns the index so we can join the datasets after
World_Happiness_Report_2015.set_index('Country', inplace=True)

In [None]:
#see how the dataset look like after cleaning
World_Happiness_Report_2015.head()

In [None]:
World_Happiness_Report_2015.shape

In [None]:
#import world happiness report 2016 dataset
World_Happiness_Report_2016= pd.read_csv('../input/world-happiness/2016.csv')

In [None]:
#explore the dataset
#see the first 5 rows
World_Happiness_Report_2016.head()

In [None]:
#see the number of columns & rows 
World_Happiness_Report_2016.shape

In [None]:
#see details about the columns
World_Happiness_Report_2016.info()

In [None]:
#see if there is missing value
World_Happiness_Report_2016.isnull().sum()

In [None]:
#clean the data
#remove the unnecessary columns 
World_Happiness_Report_2016.drop(['Region', 'Happiness Rank', 'Lower Confidence Interval','Upper Confidence Interval', 'Dystopia Residual'], axis=1, inplace=True)

In [None]:
#make the country name columns the index so we can join the datasets after
World_Happiness_Report_2016.set_index('Country', inplace=True)

In [None]:
#see how the dataset look like after cleaning
World_Happiness_Report_2016.head()

In [None]:
World_Happiness_Report_2016.shape

In [None]:
#now we need to merge the datasets for 2015 in one dataset & the datasets for 2016 in another dataset, so we will have 2 datasets,
#one for suicide rates & happiness report for 2015 dataset & the other one for suicide rates & happiness report for 2016 dataset
suiciderate_happiness_report_2015 = World_Happiness_Report_2015.join(suicide_rates_2015, how='inner')
suiciderate_happiness_report_2015.head()

In [None]:
suiciderate_happiness_report_2016 = World_Happiness_Report_2016.join(suicide_rates_2016, how='inner')
suiciderate_happiness_report_2016.head()

In [None]:
#now we have 2 datasets, suicide rate & happiness report for 2015 datset & suicide rate & happiness report for 2016 datset
#now everything is ready for apply some inferential analysis

In [None]:
#see the correlation between the variables in the suicide rate & happiness report for 2015 datset
suiciderate_happiness_report_2015.corr()

In [None]:
#create a correlation matrix using heat map
corr = suiciderate_happiness_report_2015.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

In [None]:
#let's go deeper
#first, create a function that draws a scatter plot with a regression line, prints the Pearson Correlation Coefficient & the p-value,
#& reports whether there is a correlation, whether it is negative or positive, whether it is weak, moderate, strong, or very
#strong, & whether it is significant or not
def correlation(x, y, t, alpha=0.05):
    """Plot and describe the linear relationship between ``x`` and the suicide rate ``y``.

    Draws a scatter plot with a fitted regression line, prints the Pearson
    correlation coefficient together with its p-value, then prints a verbal
    rating of the correlation and, for anything rated at least "weak",
    whether it is statistically significant.

    The rating is based on the magnitude of the coefficient, using contiguous
    bands: below 0.20 no correlation, below 0.40 weak, below 0.60 moderate,
    below 0.80 strong, otherwise very strong.

    Parameters
    ----------
    x : array-like
        Values of the happiness-report factor (drawn on the x axis).
    y : array-like
        Suicide rates paired with ``x`` (drawn on the y axis).
    t : str
        Name of the factor; appended to the plot title.
    alpha : float, optional
        Significance level the p-value is compared with (default 0.05).

    Returns
    -------
    None
        Everything is reported through the plot and the printed text.
    """
    # keyword arguments: seaborn 0.12+ no longer accepts x and y positionally
    sns.regplot(x=x, y=y)
    plt.title('Correlation between suicide rate &' + ' ' + t)
    pearson_coef, p_value = stats.pearsonr(x, y)
    print('The Pearson Correlation Coefficient is', pearson_coef, 'with a P-value of P =', p_value)

    # an undefined (NaN) coefficient cannot be rated, so nothing more is printed
    if np.isnan(pearson_coef):
        return

    strength = abs(pearson_coef)
    if strength < 0.20:
        print('There is NO correlation')
        # a negligible correlation gets no significance statement
        return

    direction = 'positive' if pearson_coef > 0 else 'negative'

    # half-open bands leave no gap between two ratings (e.g. 0.195 or 0.595)
    # and treat positive and negative coefficients symmetrically
    bands = ((0.40, 'Weak'), (0.60, 'Moderate'), (0.80, 'Strong'))
    rating = next((name for upper, name in bands if strength < upper), 'Very stong')
    print(rating + ' ' + direction + ' correlation')

    if p_value <= alpha:
        print('This ' + direction + ' correlation is significant')
    else:
        print('This ' + direction + ' correlation is NOT significant')


In [None]:
#second, make y2015 variable that presents suicide rate for 2015
y2015 = suiciderate_happiness_report_2015['SuicideRate_2015']

In [None]:
#now find the correlation between suicide rate for 2015 & each indicator of happiness report for 2015 (7 factors/indicators)
#correlation between suicide rate & Happiness Score 2015
correlation(suiciderate_happiness_report_2015['Happiness Score'], y2015,'Happiness Score 2015')

In [None]:
#correlation between suicide rate & Economy (GDP per Capita) 2015
correlation(suiciderate_happiness_report_2015['Economy (GDP per Capita)'], y2015,'Economy (GDP per Capita) 2015')

In [None]:
#correlation between suicide rate & Family (social support) 2015
correlation(suiciderate_happiness_report_2015['Family'], y2015,'Family (social support) 2015')

In [None]:
#correlation between suicide rate & Health (Life Expectancy) 2015
correlation(suiciderate_happiness_report_2015['Health (Life Expectancy)'], y2015,'Health (Life Expectancy) 2015')

In [None]:
#correlation between suicide rate & Freedom 2015
correlation(suiciderate_happiness_report_2015['Freedom'], y2015,'Freedom 2015')

In [None]:
#correlation between suicide rate & Trust (Government Corruption) 2015
correlation(suiciderate_happiness_report_2015['Trust (Government Corruption)'], y2015,'Trust (Government Corruption) 2015')

In [None]:
#correlation between suicide rate & Generosity 2015
correlation(suiciderate_happiness_report_2015['Generosity'], y2015,'Generosity 2015')

In [None]:
#based on this analysis, there are weak positive correlations between suicide rate (dependent variable/ target) 
#& Happiness Score, Economy (GDP per Capita), Family and Health (Life Expectancy) for 2015 (independent variables/ predictors)
#and these correlations are significant

In [None]:
#to be more sure, let's apply the analysis on the suicide rate & happiness report for 2016 datset

In [None]:
#see the correlation between the variables in the suicide rate & happiness report for 2016 datset
suiciderate_happiness_report_2016.corr()

In [None]:
#create a correlation matrix using heat map
corr = suiciderate_happiness_report_2016.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

In [None]:
#make y2016 variable that presents suicide rate for 2016
y2016 = suiciderate_happiness_report_2016['SuicideRate_2016']

In [None]:
#we will use the correlation function we've created to find the correlation between suicide rate for 2015 & each indicator of happiness report for 2016 (7 factors/indicators)
#correlation between suicide rate & Happiness Score 2015
correlation(suiciderate_happiness_report_2016['Happiness Score'], y2016,'Happiness Score 2016')

In [None]:
#correlation between suicide rate & Economy (GDP per Capita) 2016
correlation(suiciderate_happiness_report_2016['Economy (GDP per Capita)'], y2016,'Economy (GDP per Capita) 2016')

In [None]:
#correlation between suicide rate & Family (social support) 2016
correlation(suiciderate_happiness_report_2016['Family'], y2016,'Family (social support) 2016')

In [None]:
#correlation between suicide rate & Health (Life Expectancy) 2016
correlation(suiciderate_happiness_report_2016['Health (Life Expectancy)'], y2016,'Health (Life Expectancy) 2016')

In [None]:
#correlation between suicide rate & Freedom 2016
correlation(suiciderate_happiness_report_2016['Freedom'], y2016,'Freedom 2016')

In [None]:
#correlation between suicide rate & Trust (Government Corruption) 2016
correlation(suiciderate_happiness_report_2016['Trust (Government Corruption)'], y2016,'Trust (Government Corruption) 2016')

In [None]:
#correlation between suicide rate & Generosity 2016
correlation(suiciderate_happiness_report_2016['Generosity'], y2016,'Generosity 2016')

In [None]:
#as in the previous analysis,there are weak positive correlations between suicide rate (dependent variable/ target) 
#& Happiness Score, Economy (GDP per Capita), Family and Health (Life Expectancy) for 2016 (independent variables/ predictors)
#and these correlations are significant

In [None]:
#Results
#according to these analyses, a country that has a higher happiness score, higher GDP,
#higher family (social support), or higher health (life expectancy) tends to have a higher suicide rate.
#Also, we can state that developed countries tend to have higher suicide rates.
#Note that the correlations found are weak, and that a correlation on its own does not establish causation.