# **Data analysis project: Suicide rates & mental health.**

In [None]:
pip install chart_studio

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from chart_studio import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
%matplotlib inline

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the age-standardized suicide-rate table from the Kaggle input directory.
sr= pd.read_csv("/kaggle/input/mental-health-and-suicide-rates/Age-standardized suicide rates.csv")

In [None]:
# Count the rows per 'Sex' label (the labels used later, ' Male' and ' Female', start with a space).
sr['Sex'].value_counts()

In [None]:
# Force the four year columns to floating point.
sr = sr.astype(dict.fromkeys(['2000', '2010', '2015', '2016'], 'float'))

In [None]:
# Preview the first rows after the dtype conversion.
sr.head()

In [None]:
# Average only the combined-sex rows. The table also holds separate ' Male'
# and ' Female' rows, and averaging all strata together would not give an
# overall rate. The result is an unweighted mean across countries.
overall = sr[~sr['Sex'].isin([' Male', ' Female'])]

fig, ax = plt.subplots(figsize=(12, 4))
overall[['2000', '2010', '2015', '2016']].mean().plot(
    ax=ax,
    linestyle='dashed',
    color='orange',
    marker='o',
    markersize=10,
    markerfacecolor='yellow',
)
ax.set_ylim(8, 13)
ax.set_title('Suicide Rate per 100.000 people (Global)')
ax.set_xlabel('Year')
ax.set_ylabel('Mean suicide rate')
plt.show()

In [None]:
# Row-wise total of the four yearly rates, used below to rank rows.
sr['sum'] = sr.loc[:, ['2000', '2010', '2015', '2016']].sum(axis='columns')

In [None]:
# Rank rows from the largest to the smallest 'sum' and renumber the index.
sr = sr.sort_values(by='sum', ascending=False).reset_index(drop=True)

In [None]:
# One-hot encode 'Sex'; drop_first=True omits the column of the first category.
sex = pd.get_dummies(sr['Sex'], drop_first=True)
# Both frames share the same index, so the indicator columns line up row by row.
sr = sr.join(sex)

In [None]:
# The text 'Sex' column is no longer needed once the indicator columns exist.
sr=sr.drop(columns='Sex')

In [None]:
# Check the frame after adding the indicator columns.
sr.head()

In [None]:
# Give the ' Male' indicator a name without the leading space.
sr = sr.rename(columns={' Male': 'male'})

In [None]:
def _plot_top10_by_sum(rows, sex_label, ylim):
    """Draw the ten largest 'sum' values in `rows` as a dashed line chart.

    rows: subset of `sr`; `sr` is already sorted by 'sum' in descending order,
        so the first ten rows of the subset are its ten largest.
    sex_label: text shown in parentheses at the end of the title.
    ylim: (low, high) limits applied to the y axis.
    Returns the matplotlib Axes that was drawn on.
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    sns.lineplot(
        x='Country',
        y='sum',
        data=rows.head(10),
        color='orange',
        linestyle='dashed',
        marker='o',
        markersize=10,
        markerfacecolor='yellow',
        ax=ax,
    )
    # The y axis is the sum over the four years, so the title says "sums".
    ax.set_title(
        '10 countries with the highest suicide rate sums for '
        f'2000 , 2010 , 2015 & 2016 data ({sex_label})'
    )
    ax.set_xlabel('Country')
    ax.set_ylabel('Sum of suicide rates')
    ax.set_ylim(*ylim)
    return ax


#Males#
# Keep only the male rows; an unfiltered head(10) could include other strata.
_plot_top10_by_sum(sr[sr['male'] == 1], 'Males', (100, 270))

#Females#
# male == 0 also matches the combined-sex rows, so filter on ' Female' itself.
# NOTE(review): the y limits were tuned on the previous, wider selection;
# confirm they still frame the female-only values.
_plot_top10_by_sum(sr[sr[' Female'] == 1], 'Females', (20, 150))

In [None]:
# Drop the ' Female' indicator, leaving 'male' as the only sex column.
# NOTE(review): from here on male == 0 covers both the female rows and the rows
# of the category omitted by drop_first=True, so anything built on 'male'
# compares males against those two groups together; confirm this is intended.
sr.drop(' Female',axis=1,inplace=True)

In [None]:
# Check the frame after removing the ' Female' indicator.
sr.head()

In [None]:
# Reshape to long format: one row per (Country, male, year) with its rate.
# 'sum' is removed first so it is neither an identifier nor a melted value.
srm = sr.drop(columns='sum').melt(
    id_vars=['Country', 'male'],
    var_name='year',
    value_name='suicide rate',
)

In [None]:
# Order the long table from the highest to the lowest rate.
srm = srm.sort_values(by='suicide rate', ascending=False)

In [None]:
# Renumber the rows 0..n-1 after sorting, discarding the old index.
srm = srm.reset_index(drop=True)

In [None]:
# Preview the long-format table.
srm.head()

In [None]:
# Count the rows per melted year label.
srm['year'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 'year' holds the former column headers, i.e. strings such as '2016'. Cast the
# features to integers so the model receives an explicitly numeric matrix
# instead of depending on implicit string-to-number coercion.
X = srm[['male', 'year']].astype({'male': int, 'year': int})
y = srm['suicide rate']

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares model with default settings.
lr= LinearRegression()

In [None]:
# Fit on the training portion only.
lr.fit(X_train,y_train)

In [None]:
# Predict the held-out rows for evaluation.
pred=lr.predict(X_test)

In [None]:
# Fitted coefficients, in the column order of X.
lr.coef_

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
# Error metrics on the held-out rows; RMSE is the square root of the MSE.
MSE = mean_squared_error(y_test, pred)
MAE = mean_absolute_error(y_test, pred)
RMSE = np.sqrt(MSE)
r2 = r2_score(y_test, pred)


In [None]:
# Show the four metrics as one tuple.
MAE,MSE,RMSE,r2

In [None]:
# Label each fitted coefficient with the feature it belongs to.
coeff = pd.DataFrame({'Coefficient': lr.coef_}, index=X.columns)

In [None]:
# Display the coefficient table.
coeff

***The r2 score isn't high enough to draw any solid conclusions, but the results indicate the following:***

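***Male rows have a suicide rate about 7.73 points higher than the baseline (the rows with `male` = 0, which contain both the female and the both-sexes records).***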

***For every year further back in this dataset (2016, 2015, 2010, 2000), the suicide rate is about 0.087 points higher.***

## Reload the dataset and keep only the both-sexes rows for the geographic plots.

In [None]:
# Reload the raw table and discard the per-sex rows, keeping every other row.
srb = pd.read_csv("/kaggle/input/mental-health-and-suicide-rates/Age-standardized suicide rates.csv")
srb = (
    srb.set_index(srb['Sex'])            # index rows by their 'Sex' label
       .drop(index=[' Male', ' Female'])  # remove the male and female rows
       .drop(columns='Sex')              # the label now lives in the index
       .reset_index()                    # bring 'Sex' back as the first column
)

In [None]:
# Put the identifier columns first and the years in ascending order.
srb = srb.loc[:, ['Country', 'Sex', '2000', '2010', '2015', '2016']]

In [None]:
# Preview the filtered, reordered table.
srb.head()

In [None]:
# Long-format tables for a single year each, one per map.
srb2016p = srb[['Country', 'Sex', '2016']]
srb2016 = srb2016p.melt(id_vars=['Country', 'Sex'], var_name='year', value_name='suicide rate')

srb2000p = srb[['Country', 'Sex', '2000']]
srb2000 = srb2000p.melt(id_vars=['Country', 'Sex'], var_name='year', value_name='suicide rate')

In [None]:
# Choropleth trace for the 2016 rates, with locations given as country names.
cloro = {
    'type': 'choropleth',
    'locations': srb2016['Country'],
    'locationmode': 'country names',
    'z': srb2016['suicide rate'],
    'text': srb2016['Country'],
    'colorscale': 'Oranges_r',
    'reversescale': True,
    'colorbar': {'title': 'suicide rate per 100.000 inhabitants'},
}

In [None]:
# Figure layout for the 2016 map: framed globe with coastlines, Miller projection.
layout = {
    'title': 'Suicide rate in 2016',
    'geo': {
        'showframe': True,
        'showcoastlines': True,
        'projection': {'type': 'miller'},
    },
}

In [None]:
# Assemble the 2016 map from its trace and layout.
choromap3= go.Figure(data=[cloro],layout=layout)

In [None]:
# Choropleth trace for the 2000 rates, with locations given as country names.
cloro2 = {
    'type': 'choropleth',
    'locations': srb2000['Country'],
    'locationmode': 'country names',
    'z': srb2000['suicide rate'],
    'text': srb2000['Country'],
    'colorscale': 'Oranges_r',
    'reversescale': True,
    'autocolorscale': False,
    'colorbar': {'title': 'suicide rate per 100.000 inhabitants'},
}

In [None]:
# Figure layout for the 2000 map: framed globe with coastlines, Miller projection.
layout2 = {
    'title': 'Suicide rate in 2000',
    'geo': {
        'showframe': True,
        'showcoastlines': True,
        'projection': {'type': 'miller'},
    },
}

In [None]:
# Assemble the 2000 map from its trace and layout.
choromap4= go.Figure(data=[cloro2],layout=layout2)

In [None]:
# Render the 2000 map first so the two years read chronologically.
iplot(choromap4) 

In [None]:
# Render the 2016 map.
iplot(choromap3)

## Import datasets including human resources & facilities. 

In [None]:
# Load the facilities table and the human-resources table from the same Kaggle dataset.
fac = pd.read_csv('/kaggle/input/mental-health-and-suicide-rates/Facilities.csv')
hhrr= pd.read_csv('/kaggle/input/mental-health-and-suicide-rates/Human Resources.csv')

In [None]:
# Preview the facilities table.
fac.head()

In [None]:
# Number of missing values in each facilities column.
fac.isnull().sum()

In [None]:
# Visual map of where the missing facilities values are.
sns.heatmap(fac.isnull(),cmap='Oranges')

In [None]:
# Remove the two facilities columns that are not used in the rest of the analysis.
fac = fac.drop(columns=['day _treatment', 'residential_facilities'])

In [None]:
# Preview the human-resources table.
hhrr.head()

In [None]:
# Visual map of where the missing human-resources values are.
sns.heatmap(hhrr.isnull(),cmap='Oranges')

In [None]:
# Join facilities and human resources on 'Country' (default inner join:
# only countries present in both tables are kept).
res = fac.merge(hhrr, on='Country')

In [None]:
# Inspect the merged column names, including the suffixed year columns.
res.columns

In [None]:
# Drop the year column that came from the right-hand table of the merge.
res = res.drop(columns='Year_y')

In [None]:
# Give the remaining year column a plain name.
res = res.rename(columns={'Year_x': 'year'})

In [None]:
# Visual map of the missing values in the merged resources table.
sns.heatmap(res.isnull(),cmap='Oranges')

In [None]:
# Exclude 'Social_workers' from the merged table.
res = res.drop(columns='Social_workers')

In [None]:
# Preview the cleaned resources table.
res.head()

In [None]:
# Keep only the 2016 rates from the filtered table.
tomerge = srb.drop(columns=['2000', '2010', '2015'])

In [None]:
# Check which 'Sex' labels remain before the column is removed.
tomerge['Sex'].value_counts()

In [None]:
# 'Sex' is not needed for the merge with the resources table.
tomerge = tomerge.drop(columns='Sex')

In [None]:
# Name the 2016 column after what it measures.
tomerge = tomerge.rename(columns={'2016': 'suicide_rate'})

In [None]:
# Display the country / suicide_rate table that will be merged.
tomerge

In [None]:
# Attach the 2016 suicide rate to each country's resources
# (default inner join on 'Country').
full = res.merge(tomerge, on='Country')

In [None]:
# The resources 'year' column is not used as a feature.
full = full.drop(columns='year')

In [None]:
# Display the combined resources + suicide-rate table.
full

In [None]:
# Correlate the numeric columns only: `full` still contains the string
# 'Country' column, which DataFrame.corr refuses to handle on recent pandas.
sns.heatmap(full.select_dtypes(include='number').corr(), annot=True, cmap='Oranges')

In [None]:
# Scatter plot with a fitted regression line: psychiatrists versus the 2016 suicide rate.
sns.regplot(x='Psychiatrists',y='suicide_rate',color='orange', data=full)

In [None]:
# Preview the combined table before modelling.
full.head()

## Regression ##

In [None]:
# Visual map of the missing values that must be handled before fitting.
sns.heatmap(full.isnull(),cmap='Oranges')

In [None]:
# Modelling table: everything except the country name.
ml = full.drop(columns='Country')

In [None]:
# Replace each missing value with the mean of its own column.
ml = ml.fillna(ml.mean())

In [None]:
# Confirm that no missing values remain after imputation.
sns.heatmap(ml.isnull(),cmap='Oranges')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
# Target is the 2016 suicide rate; every other column is a feature.
y = ml['suicide_rate']
X = ml.drop(columns='suicide_rate')

# Hold out 20% of the countries; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Fit an ordinary least-squares model on the training rows (fit returns the
# estimator itself), then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

In [None]:
def eva(y_test, pred):
    """Print MAE, MSE and the coefficient of determination of `pred` against `y_test`."""
    report = (
        ('MAE=', mean_absolute_error),
        ('MSE=', mean_squared_error),
        ('Determination coeff. =', r2_score),
    )
    for label, metric in report:
        print(label, metric(y_test, pred))

In [None]:
# Report the held-out error metrics for the resources model.
eva(y_test,pred)

In [None]:
# Fitted coefficients, in the column order of X.
lr.coef_

In [None]:
# Coefficients labelled by feature, largest first.
finalcoeff = (
    pd.DataFrame({'Coeff': lr.coef_}, index=X.columns)
    .sort_values(by='Coeff', ascending=False)
)

In [None]:
# Display the sorted coefficient table.
finalcoeff

In [None]:
# Colour-code the coefficients to compare their sizes at a glance.
sns.heatmap(finalcoeff,cmap='Oranges')

In [None]:
# Mental hospitals against suicide rate, with the y axis limited to 0-0.6.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_ylim(0, 0.6)
sns.regplot(x='suicide_rate', y='Mental _hospitals', color='orange', data=ml, ax=ax)


In [None]:
# Psychiatrists against suicide rate, with the y axis limited to 0-20.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_ylim(0, 20)
sns.regplot(x='suicide_rate', y='Psychiatrists', color='orange', data=ml, ax=ax)

##### With an r2 score of only 0.12 we can't draw any solid conclusions from our data, but our regression analysis suggests that higher rates of psychiatrists and mental hospitals MIGHT be associated with higher suicide rates (an association, not evidence of causation) ####

# Top 10 highest and lowest scores

In [None]:
# Two rankings of the same table, each renumbered from 0:
# highest suicide rate first, and lowest suicide rate first.
highest = full.sort_values('suicide_rate', ascending=False).reset_index(drop=True)

lowest = full.sort_values('suicide_rate', ascending=True).reset_index(drop=True)

In [None]:
# Split each ranking into a staff view and a facilities view.
# Note: `hhrr` is rebound here; it previously held the raw human-resources table.
hhrr = highest.loc[:, ['Country', 'Psychiatrists', 'Nurses', 'Psychologists', 'suicide_rate']]
facilities = highest.drop(columns=['Psychiatrists', 'Nurses', 'Psychologists'])

hhrr2 = lowest.loc[:, ['Country', 'Psychiatrists', 'Nurses', 'Psychologists', 'suicide_rate']]
facilities2 = lowest.drop(columns=['Psychiatrists', 'Nurses', 'Psychologists'])


In [None]:
# Mean facility figures of the ten highest-rate countries versus the ten lowest.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Facility comparison: Highest suicide rate countries vs lowest')
facilities.head(10)[['Mental _hospitals', 'health_units', 'outpatient _facilities']].mean().plot(
    ax=ax, linestyle='dashed', color='blue', marker='o', markersize=10, label='Highest Suicide Rate'
)
facilities2.head(10)[['Mental _hospitals', 'health_units', 'outpatient _facilities']].mean().plot(
    ax=ax, color='orange', marker='o', markersize=10, label='Lowest Suicide Rate'
)
ax.legend(loc="upper left")

In [None]:
# Mean staff figures of the ten highest-rate countries versus the ten lowest.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Human resource comparison: Highest suicide rate countries vs lowest')
hhrr.head(10)[['Psychiatrists', 'Nurses', 'Psychologists']].mean().plot(
    ax=ax, linestyle='dashed', color='blue', marker='o', markersize=10, label='Highest Suicide Rate'
)
hhrr2.head(10)[['Psychiatrists', 'Nurses', 'Psychologists']].mean().plot(
    ax=ax, color='orange', marker='o', markersize=10, label='Lowest Suicide Rate'
)
ax.legend(loc="upper left")

*Geographical location for the top 10 countries with the **highest** suicide rate: **South America**: 1, **Africa**: 4, **Eastern Europe**: 3, **Asia**: 2 (1 Northern Asia, 1 South Asia).*

*Geographical location for the top 10 countries with the **lowest** suicide rate: **South America**: 3 (2 Caribbean, 1 mainland), **Middle East**: 4 , **Africa**: 2 , **Central Asia**: 1.*

***The countries with the lowest suicide rates showed a higher rate of psychologists & outpatient facilities, as well as fewer psychiatrists, but the final analysis is inconclusive, as our full analysis wasn't capable of explaining the suicide rate variable. Some possible reasons: the absence of variables that might have been important (socioeconomic, culture-related, etc.), a sample size that is too small, and missing data.***