<h1 align="center"> Suicide Rates Overview 1985 to 2016 </h1>
<img src="https://pulitzercenter.org/sites/default/files/styles/project_hero_768_x_480/public/epstein_1-101019.jpg?itok=YLWiDcc5" width=1000 height=600><br>
<p>This compiled dataset was pulled from four other datasets linked by time and place, and was built to find signals correlated to increased suicide rates among different cohorts globally, across the socio-economic spectrum.</p><br>
<b>Contents:</b>
<li>In this notebook we are going to do some analysis, data visualization, data cleaning and build some clustering models for <a href='https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016'><b>Suicide Rates</b></a></li><br>
<b>You can also see:</b><br>
<li><b><a href='https://www.kaggle.com/alaasedeeq/superstore-analysis-with-cufflinks-and-pandas'>Superstore Analysis With Cufflinks and pandas</a></b></li>
<li><b><a href='https://www.kaggle.com/alaasedeeq/superstore-data-analysis-with-plotly'>Superstore Analysis With Plotly</a></b></li>
<li><b><a href='https://www.kaggle.com/alaasedeeq/european-soccer-database-with-sqlite3'>European soccer database with sqlite3</a></b></li>

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly 
import plotly.graph_objs as go
import plotly.express as px
import cufflinks as cf
from plotly.offline import iplot,init_notebook_mode
# You can go offline on demand by using
cf.go_offline() 
# To connect java script to your notebook
init_notebook_mode(connected=True)

In [None]:
# Read the master table of the Kaggle suicide-rates dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/suicide-rates-overview-1985-to-2016/master.csv')
df.head(n=5)

In [None]:
# Keep an untouched copy of the table as it was loaded, before any cleaning.
original_df = df.copy()

# Give the two GDP columns plain names (no padding spaces, no currency marker).
# A new frame is assigned instead of renaming in place.
df = df.rename(columns={" gdp_for_year ($) ": "gdp_for_year",
                        "gdp_per_capita ($)": "gdp_per_capita"})

# If the yearly GDP was parsed as text (presumably because the values carry
# thousands separators -- verify against the CSV), turn it into numbers so that
# describe(), corr() and the plots treat it as a numeric measure.
# Values that still cannot be parsed become NaN.
if "gdp_for_year" in df.columns and df["gdp_for_year"].dtype == object:
    df["gdp_for_year"] = pd.to_numeric(
        df["gdp_for_year"].astype(str).str.replace(",", "", regex=False),
        errors="coerce")

df.head()

In [None]:
# Column names, dtypes and non-null counts of the working frame.
df.info()

In [None]:
# Summary statistics of the main measures, with each column shaded in blue.
summary_columns = ['suicides_no', 'suicides/100k pop', 'HDI for year', 'gdp_per_capita', 'gdp_for_year']
blue_shades = sns.light_palette('blue', as_cmap=True)
df[summary_columns].describe().style.background_gradient(cmap=blue_shades)

In [None]:
# Number of missing values in every column.
df.isna().sum()

In [None]:
# Remove the 'HDI for year' and 'country-year' columns.
# drop(..., errors='ignore') keeps the cell re-runnable: executing it a second
# time is a no-op, whereas `del df[...]` raises KeyError once the column is gone.
df = df.drop(columns=['HDI for year', 'country-year'], errors='ignore')

In [None]:
# Re-check the column list, dtypes and non-null counts.
df.info()

In [None]:
# df["year"] = pd.to_datetime(df["year"], format = "%Y")

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds text
# columns (country, sex, age, generation); recent pandas versions raise when
# DataFrame.corr() meets those, so they are filtered out explicitly.
corr = df.select_dtypes(include='number').corr()
corr.iplot(kind='heatmap', colorscale='Reds', hoverinfo='all',
           layout=go.Layout(title='Correlation Heatmap', titlefont=dict(size=20)))

# Data Distribution

In [None]:
# Box plots (with the mean marked) for the count, population, rate and GDP columns.
numeric_columns = ['suicides_no', 'population', 'suicides/100k pop', 'gdp_for_year', 'gdp_per_capita']
data = df.loc[:, numeric_columns]
data.iplot(kind='box', mean=True, legend=True)

In [None]:
# One histogram per column, laid out as subplots of a single figure.
data = df.loc[:, ['suicides_no', 'population', 'suicides/100k pop', 'gdp_for_year', 'gdp_per_capita']]
histogram_options = dict(kind='hist',
                         subplots=True,
                         subplot_titles=True,
                         horizontal_spacing=.1,
                         fill=True,
                         title='Data Distribution')
data.iplot(**histogram_options)

# Let's deal with the outliers

In [None]:
# Removing outliers
def outliers_detection(df, columns):
    """Drop the rows that are IQR outliers in more than one of the given columns.

    For every column the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed from that column alone. A row is removed only when its values fall
    outside the fences of at least two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    columns : list
        Column names to inspect. Entries that are themselves lists or tuples
        (e.g. ``[['a', 'b']]``) are flattened one level, so the fences are
        always computed per column and never over several columns pooled.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The remaining rows of ``df`` restricted to ``columns[0]``: a DataFrame
        when ``columns[0]`` is a list of names, a Series when it is one name.

    Raises
    ------
    IndexError
        If ``columns`` is empty.
    """
    # Flatten one level so ['a', 'b'] and [['a', 'b']] are inspected the same way.
    inspected = []
    for entry in columns:
        if isinstance(entry, (list, tuple)):
            inspected.extend(entry)
        else:
            inspected.append(entry)

    # Count, per row position, in how many columns the value lies outside the fences.
    outlier_hits = np.zeros(len(df), dtype=int)
    for column in inspected:
        values = df[column]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        margin = 1.5 * (q3 - q1)
        outside = (values < q1 - margin) | (values > q3 + margin)
        outlier_hits += outside.to_numpy()

    # Keep rows flagged in at most one column.
    keep = outlier_hits <= 1
    return df[keep][columns[0]]

In [None]:
outlier_columns = ['population', 'gdp_per_capita', 'suicides/100k pop']

# Unfiltered columns, suffixed so the two figures can be told apart.
data_1 = df[outlier_columns].rename(columns=lambda name: name + ' with_outliers')

# The same columns after the outlier filter; the column list is handed over
# wrapped in an outer list.
data_2 = outliers_detection(df, [outlier_columns])
data_2 = data_2[outlier_columns].rename(columns=lambda name: name + ' without_outliers')

# One figure of per-column histograms for each of the two frames.
for frame, figure_title in ((data_1, 'Data Distribution With Outliers'),
                            (data_2, 'Data Distribution Without Outliers')):
    frame.iplot(kind='hist',
                subplots=True,
                horizontal_spacing=.1,
                fill=True,
                subplot_titles=True,
                title=figure_title)

In [None]:
# Total number of suicides per year, largest total first, with the totals shaded.
year_suicides = df.groupby('year')[['suicides_no']].sum().reset_index()
ranked_years = year_suicides.sort_values(by='suicides_no', ascending=False)
ranked_years.style.background_gradient(cmap='Blues', subset=['suicides_no'])

# Suicides by Country

In [None]:
# Per-country sums of the numeric measures; show the ten countries with the
# largest summed 'suicides/100k pop'.
country_columns = ['country', 'suicides_no', 'population', 'suicides/100k pop', 'gdp_per_capita']
country_totals = df[country_columns].groupby('country').sum()
ranked_countries = country_totals.sort_values('suicides/100k pop', ascending=False)
ranked_countries.iloc[0:10].style.background_gradient(cmap='Blues')

In [None]:
# Suicides number over time, as an animated world map.
# The table holds several rows per country and year (it is split by sex and
# age), so it is first reduced to one total per country-year and ordered by
# year so that the animation frames play chronologically.
country_year = (df.groupby(['year', 'country'], as_index=False)['suicides_no']
                  .sum()
                  .sort_values('year'))

# log1p instead of log: a count of zero maps to 0 instead of -inf.
country_year['log_suicides_no'] = np.log1p(country_year['suicides_no'])

fig = px.choropleth(country_year, locations='country',
                    locationmode='country names', color='log_suicides_no',
                    animation_frame='year',
                    title='Suicides Number over the Time', color_continuous_scale='matter')

fig.show()

In [None]:
# Suicides per 100k population over time, as an animated world map.
# Counts and population are summed per country-year and the rate is recomputed
# from those totals, giving one value per country and frame; rows are ordered
# by year so that the animation frames play chronologically.
country_year = (df.groupby(['year', 'country'], as_index=False)[['suicides_no', 'population']]
                  .sum()
                  .sort_values('year'))
country_year['suicides/100k pop'] = country_year['suicides_no'] / country_year['population'] * 100000

# log1p instead of log: a rate of zero maps to 0 instead of -inf.
country_year['log_suicides_per_100k'] = np.log1p(country_year['suicides/100k pop'])

fig = px.choropleth(country_year, locations='country',
                    locationmode='country names', color='log_suicides_per_100k',
                    animation_frame='year',
                    title='suicides/100k population over the Time', color_continuous_scale='matter')

fig.show()

In [None]:
# Ten countries with the largest total number of suicides.
# nlargest picks the top ten; they are then put in ascending order because a
# horizontal bar chart draws the first row at the bottom.
temp = (df.groupby('country')['suicides_no'].sum()
          .nlargest(10)
          .sort_values(ascending=True)
          .reset_index())

fig = px.bar(temp, y='country', x='suicides_no', orientation='h')

fig.update_layout(title='Top Ten Countries Suicides Number',
                  titlefont=dict(size=20),
                  yaxis=dict(title='Country Name', titlefont=dict(size=18)),
                  xaxis=dict(title='Suicide Number', titlefont=dict(size=18)))
iplot(fig)

In [None]:
# Ten countries with the highest mean suicides/100k population.
# Only the rate column is averaged (averaging the text columns is an error in
# recent pandas versions). The top ten are put in ascending order because a
# horizontal bar chart draws the first row at the bottom.
temp = (df.groupby('country')['suicides/100k pop'].mean()
          .nlargest(10)
          .sort_values(ascending=True)
          .reset_index())

fig = px.bar(temp, y='country', x='suicides/100k pop', orientation='h')

fig.update_layout(title='Top Ten Countries by suicides/100k population',
                  titlefont=dict(size=20),
                  yaxis=dict(title='Country Name', titlefont=dict(size=18)),
                  xaxis=dict(title='Suicides/100k population', titlefont=dict(size=18)))
iplot(fig)

In [None]:
# Total suicides of every country, from the largest total to the smallest, as a line.
country_sums = df.groupby('country').sum()
temp = country_sums.sort_values(by='suicides_no', ascending=False)['suicides_no'].reset_index()
temp.iplot(kind='line', x='country', y='suicides_no', title='Countries Total Suicides Number')

# Suicides number by years

In [None]:
# Yearly total of suicides as a line chart.
temp = df.groupby('year')[['suicides_no']].sum().reset_index()
line_options = dict(x='year', y='suicides_no', width=2.0,
                    xTitle='Year', yTitle='Suicides Number',
                    title='Suicides Number Over Years')
temp.iplot(**line_options)

In [None]:
# Yearly totals as a bar chart (one bar per year).
temp = df.groupby('year').sum().sort_values('suicides_no', ascending=False)

tick_font = dict(family='Old Standard TT, serif', size=14, color='black')
x_axis = dict(tickfont=tick_font,
              tickangle=315,
              title='Year',
              titlefont=dict(size=18),
              tickmode='linear')
y_axis = dict(tickfont=tick_font,
              title='Suicide Number',
              titlefont=dict(size=18))

data = go.Bar(x=temp.index, y=temp['suicides_no'])
layout = go.Layout(title='Suicides Number Per Year',
                   titlefont=dict(size=25),
                   xaxis=x_axis,
                   yaxis=y_axis)
iplot(dict(data=data, layout=layout))

In [None]:
# Mean of 'suicides_no' over all rows of each year.
# Only the needed column is averaged: taking the mean of the whole frame would
# also try to average the text columns, which is an error in recent pandas versions.
yearly_mean = df.groupby('year')['suicides_no'].mean()

data = go.Bar(x=yearly_mean.index, y=yearly_mean.values)

tick_font = dict(family='Old Standard TT, serif', size=14, color='black')
layout = go.Layout(title='Mean Suicides Number per Year',
                   titlefont=dict(size=25),
                   xaxis=dict(tickfont=tick_font,
                              tickangle=315,
                              title='Year', titlefont=dict(size=18), tickmode='linear'),
                   yaxis=dict(tickfont=tick_font,
                              title='Suicides', titlefont=dict(size=18)))

iplot(dict(data=data, layout=layout))

In [None]:
# Mean of 'suicides/100k pop' over all rows of each year.
# Only the needed column is averaged: taking the mean of the whole frame would
# also try to average the text columns, which is an error in recent pandas versions.
yearly_rate = df.groupby('year')['suicides/100k pop'].mean()

data = go.Bar(x=yearly_rate.index, y=yearly_rate.values)

tick_font = dict(family='Old Standard TT, serif', size=14, color='black')
layout = go.Layout(title='suicides/100k population per Year',
                   titlefont=dict(size=25),
                   xaxis=dict(tickfont=tick_font,
                              tickangle=315,
                              title='Year', titlefont=dict(size=18), tickmode='linear'),
                   yaxis=dict(tickfont=tick_font,
                              title='Suicides', titlefont=dict(size=18)))

iplot(dict(data=data, layout=layout))

><b>Suicides were at their highest in 1995</b>

# Suicides by Gender

In [None]:
# Per-gender sums of the numeric measures, ordered by the summed
# 'suicides/100k pop' (largest first).
gender_columns = ['sex', 'suicides_no', 'population', 'suicides/100k pop', 'gdp_per_capita']
gender_totals = df[gender_columns].groupby('sex').sum()
ranked_genders = gender_totals.sort_values('suicides/100k pop', ascending=False)
ranked_genders.iloc[0:10].style.background_gradient(cmap='Blues')

In [None]:
# Yearly suicides number, one column per gender.
sex = df['sex'].unique()
yearly_by_sex = [
    df.loc[df['sex'] == sex[position], ['year', 'suicides_no']]
      .rename(columns={'suicides_no': sex[position]})
      .groupby('year').sum().reset_index()
    for position in range(2)
]
d1, d2 = yearly_by_sex
d1.merge(d2, on='year').set_index('year').style.background_gradient('Blues')

In [None]:
# Yearly sum of 'suicides/100k pop', one column per gender.
sex = df['sex'].unique()
yearly_by_sex = [
    df.loc[df['sex'] == sex[position], ['year', 'suicides/100k pop']]
      .rename(columns={'suicides/100k pop': sex[position]})
      .groupby('year').sum().reset_index()
    for position in range(2)
]
d1, d2 = yearly_by_sex
d1.merge(d2, on='year').set_index('year').style.background_gradient('Blues')

In [None]:
# Total suicides per gender and age group as grouped bars.
# The totals are taken over all years, so there is exactly one bar per
# (sex, age) pair; grouping by year as well would put one bar per year at the
# same x position instead of a single total.
data = df.groupby(['sex', 'age'], as_index=False)['suicides_no'].sum()
fig = px.bar(data_frame=data, x='sex', y='suicides_no', color='age', barmode='group', opacity=1)
fig.update_layout(title='Bar Plots for Suicides Number for each gender grouped by age',
                  titlefont=dict(size=20),
                  yaxis=dict(title='Suicides Number', titlefont=dict(size=18)),
                  xaxis=dict(title='Gender', titlefont=dict(size=18)))
# The figure carries its own layout, so it is shown directly rather than being
# combined with a layout object left over from another cell.
iplot(fig)

In [None]:
# Total suicides per gender, shown side by side as a bar chart and a pie chart.
data = df.groupby('sex', as_index=False)['suicides_no'].sum()

fig = plotly.subplots.make_subplots(rows=1, cols=2,
                                    specs=[[{"type": "bar"}, {"type": "pie"}]],
                                    subplot_titles=['Bar Plot for the suicides number of each gender',
                                                    'Pie Plot for the suicides number of each gender'])

# Left cell: one bar per gender. Right cell: share of each gender.
fig.add_trace(go.Bar(x=data['sex'], y=data['suicides_no'], showlegend=False), row=1, col=1)
fig.add_trace(go.Pie(labels=data['sex'], values=data['suicides_no']), row=1, col=2)

fig.update_layout(title='Suicides Number by Gender',
                  titlefont=dict(size=20))
fig.update_xaxes(title_text='Gender', title_font=dict(size=18), row=1, col=1)
fig.update_yaxes(title_text='Suicides Number', title_font=dict(size=18), row=1, col=1)

fig.show()

In [None]:
# Per-year sums for each gender. The summed 'suicides/100k pop' column is renamed
# to 'Male' / 'Female' so the two series keep distinct names after the merge.
male = df[df['sex']=='male'].groupby('year').sum().reset_index().rename(columns={'suicides/100k pop':'Male'})
female = df[df['sex']=='female'].groupby('year').sum().reset_index().rename(columns={'suicides/100k pop':'Female'})
# Join the two yearly tables on 'year' (pd.merge default join).
temp = pd.merge(male,female,on='year')

# NOTE(review): a dict of axis ranges is passed as the first positional argument
# of cufflinks' getLayout -- confirm that this is what getLayout expects there.
layout = cf.tools.getLayout({'Axis Ranges':{'xrange' : df['year'].unique()}},
                           title='Gender Suicides over Years')

# cufflinks 'ratio' chart of the 'Male' and 'Female' columns against 'year'.
temp.iplot(kind='ratio',y=['Male','Female'],x='year',title='suicides/100k population by gender',layout=layout)

In [None]:
# Yearly sums for each gender, drawn as two bar series.
yearly_by_sex = {
    label: df[df['sex'] == label].groupby('year').sum().reset_index()
    for label in ('male', 'female')
}
male, female = yearly_by_sex['male'], yearly_by_sex['female']

data = [
    go.Bar(x=frame['year'], y=frame['suicides_no'], name=legend_name)
    for frame, legend_name in ((male, 'Male Suicides'), (female, 'Female Suicides'))
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides For Each Year By Gender',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

# Suicides by Generation

In [None]:
# Per-generation sums of the numeric measures, ordered by the summed
# 'suicides/100k pop' (largest first).
generation_columns = ['generation', 'suicides_no', 'population', 'suicides/100k pop', 'gdp_per_capita']
generation_totals = df[generation_columns].groupby('generation').sum()
ranked_generations = generation_totals.sort_values('suicides/100k pop', ascending=False)
ranked_generations.iloc[0:10].style.background_gradient(cmap='Blues')

In [None]:
# Suicides number for each generation per year: years as rows, one column per
# distinct generation label.
# Pivoting keeps every year. A year in which a generation has no rows shows NaN
# for that generation, whereas joining per-generation tables with inner merges
# would drop the whole year.
(df.groupby(['year', 'generation'])['suicides_no'].sum()
   .unstack('generation')
   .style.background_gradient('Blues'))

In [None]:
# Summed suicides/100k population for each generation per year: years as rows,
# one column per distinct generation label.
# Pivoting keeps every year. A year in which a generation has no rows shows NaN
# for that generation, whereas joining per-generation tables with inner merges
# would drop the whole year.
(df.groupby(['year', 'generation'])['suicides/100k pop'].sum()
   .unstack('generation')
   .style.background_gradient('Blues'))

In [None]:
# Share of the total suicides number that falls to each generation, as a donut chart.
data = df.groupby('generation', as_index=False)['suicides_no'].sum()
layout = cf.tools.getLayout(height=600, width=1000,
                            title='Pie Plot for Suicides Number by Generation')
data.iplot(kind='pie', labels='generation',
           textinfo='label+percent',
           world_readable=True, hole=.4,
           values='suicides_no',
           layout=layout)

In [None]:
# Summed suicides number and summed suicides/100k pop per generation, ordered by
# the suicides number (largest first), as two bar series.
temp = df[['suicides_no', 'suicides/100k pop']].groupby(df.generation).sum().reset_index().sort_values(by='suicides_no', ascending=False)

data = [go.Bar(x=temp['generation'], y=temp['suicides_no'], name='Suicides Number'),
        go.Bar(x=temp['generation'], y=temp['suicides/100k pop'], name='Suicides/100k population')]

# The x axis holds generation labels, so it is titled accordingly.
layout = go.Layout(title='Suicides and suicides/100k population By Generation',
                   titlefont=dict(size=20),
                   xaxis=dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
                              tickangle=315, title='Generation', titlefont=dict(size=18), tickmode='linear'))

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly suicides number of each generation, one bar series per generation.
# Every series is built the same way and addressed by column name, so each
# trace is guaranteed to plot the yearly totals of its own generation.
generations = ["Boomers", "Generation X", "Silent", "G.I. Generation", "Generation Z", "Millenials"]

data = []

for generation in generations:
    yearly_total = df.loc[df['generation'] == generation].groupby('year')['suicides_no'].sum()
    data.append(go.Bar(x=yearly_total.index, y=yearly_total.values, name=generation))

layout = go.Layout(title='Suicides for each year by generation',
                   titlefont=dict(size=20),
                   xaxis=dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
                              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear'))

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly summed suicides/100k pop of each generation, one bar series per generation.
# Each pair is (label in the 'generation' column, legend name of the trace).
# The rate column is addressed by name, so every trace plots the rate of its
# own generation.
rate_series = [("Boomers", 'boomers'),
               ("Generation X", 'Generation_X'),
               ("Silent", 'silent'),
               ("G.I. Generation", 'G.I.Generation'),
               ("Generation Z", 'Generation_Z'),
               ("Millenials", 'millenials')]

data = []

for generation, legend_name in rate_series:
    yearly_rate = df.loc[df['generation'] == generation].groupby('year')['suicides/100k pop'].sum()
    data.append(go.Bar(x=yearly_rate.index, y=yearly_rate.values, name=legend_name))

layout = go.Layout(title='suicides/100k population for each year by generation',
                   titlefont=dict(size=20),
                   xaxis=dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
                              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear'))

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly suicides number of each generation, one line per generation.
generation_labels = ["Boomers", "Generation X", "Silent", "G.I. Generation", "Generation Z", "Millenials"]

# One two-column frame (year, <generation label>) per generation.
yearly_frames = [
    df[df['generation'] == label][['year', 'suicides_no']]
      .rename(columns={'suicides_no': label})
      .groupby('year').sum().reset_index()
    for label in generation_labels
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# The second column of every frame holds the values and carries the legend name.
data = [
    go.Scatter(x=frame['year'], y=frame[frame.columns[1]], name=frame.columns[1])
    for frame in yearly_frames
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides for each year by generation',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly summed suicides/100k pop of each generation, one line per generation.
# Each pair is (label in the 'generation' column, legend name of the trace).
rate_series = [("Boomers", 'boomers'),
               ("Generation X", 'Generation_X'),
               ("Silent", 'silent'),
               ("G.I. Generation", 'G.I.Generation'),
               ("Generation Z", 'Generation_Z'),
               ("Millenials", 'millenials')]

data = []

for generation, legend_name in rate_series:
    yearly_rate = df.loc[df['generation'] == generation].groupby('year')['suicides/100k pop'].sum()
    data.append(go.Scatter(x=yearly_rate.index, y=yearly_rate.values, name=legend_name))

# The chart shows the rate, so the title names the rate rather than the count.
layout = go.Layout(title='suicides/100k population for each year by generation',
                   titlefont=dict(size=20),
                   xaxis=dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
                              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear'))

iplot(dict(data=data, layout=layout))

# Suicides by Age

In [None]:
# Yearly suicides number, one column per age group (the first six distinct labels).
age = df['age'].unique()
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides_no']]
      .rename(columns={'suicides_no': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# Join the frames on 'year' from the last one inwards, keeping the first frame leftmost.
merged = yearly_frames[-1]
for frame in reversed(yearly_frames[:-1]):
    merged = frame.merge(merged, on='year')
merged.set_index('year').style.background_gradient('Blues')

In [None]:
# Yearly summed suicides/100k pop, one column per age group.
# `age` (the distinct age labels) is expected to exist already in the notebook.
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides/100k pop']]
      .rename(columns={'suicides/100k pop': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# Join the frames on 'year' from the last one inwards, keeping the first frame leftmost.
merged = yearly_frames[-1]
for frame in reversed(yearly_frames[:-1]):
    merged = frame.merge(merged, on='year')
merged.set_index('year').style.background_gradient('Blues')

In [None]:
# Share of the total suicides number that falls to each age group, as a donut chart.
layout = cf.tools.getLayout(height=600, width=1000,
                            title='Pie Plot For World Sucide By Age',
                            titlefont=dict(size=20))
data = df.groupby('age').sum().reset_index()
pie_options = dict(kind='pie',
                   labels='age',
                   values='suicides_no',
                   textinfo='label+percent',
                   world_readable=True,
                   hole=.4,
                   layout=layout)
data.iplot(**pie_options)

In [None]:
# Summed suicides number and summed suicides/100k pop per age group, ordered by
# the suicides number (largest first), as two bar series.
temp = df[['suicides_no', 'suicides/100k pop']].groupby(df.age).sum().reset_index().sort_values(by='suicides_no', ascending=False)

data = [go.Bar(x=temp['age'], y=temp['suicides_no'], name='Suicides Number'),
        go.Bar(x=temp['age'], y=temp['suicides/100k pop'], name='Suicides/100k population')]

# The x axis holds age-group labels, so it is titled accordingly.
layout = go.Layout(title='Suicides Number and suicides/100k population By Age',
                   titlefont=dict(size=20),
                   xaxis=dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
                              tickangle=315, title='Age', titlefont=dict(size=18), tickmode='linear'))

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly suicides number of each age group, one bar series per group.
age = df['age'].unique()
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides_no']]
      .rename(columns={'suicides_no': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# The second column of every frame holds the values and carries the legend name.
data = [
    go.Bar(x=frame['year'], y=frame[frame.columns[1]], name=frame.columns[1])
    for frame in yearly_frames
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides for each year by Age',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly summed suicides/100k pop of each age group, one bar series per group.
age = df['age'].unique()
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides/100k pop']]
      .rename(columns={'suicides/100k pop': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# The second column of every frame holds the values and carries the legend name.
data = [
    go.Bar(x=frame['year'], y=frame[frame.columns[1]], name=frame.columns[1])
    for frame in yearly_frames
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides/100k population for each year by Age',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly suicides number of each age group, one line per group.
age = df['age'].unique()
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides_no']]
      .rename(columns={'suicides_no': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# The second column of every frame holds the values and carries the legend name.
data = [
    go.Scatter(x=frame['year'], y=frame[frame.columns[1]], name=frame.columns[1])
    for frame in yearly_frames
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides for each year by Age',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

In [None]:
# Yearly summed suicides/100k pop of each age group, one line per group.
age = df['age'].unique()
yearly_frames = [
    df.loc[df['age'] == age[position], ['year', 'suicides/100k pop']]
      .rename(columns={'suicides/100k pop': age[position]})
      .groupby('year').sum().reset_index()
    for position in range(6)
]
d1, d2, d3, d4, d5, d6 = yearly_frames

# The second column of every frame holds the values and carries the legend name.
data = [
    go.Scatter(x=frame['year'], y=frame[frame.columns[1]], name=frame.columns[1])
    for frame in yearly_frames
]

x_axis = dict(tickfont=dict(family='Old Standard TT, serif', size=14, color='black'),
              tickangle=315, title='Year', titlefont=dict(size=18), tickmode='linear')
layout = go.Layout(title='Suicides/100k population for each year by Age',
                   titlefont=dict(size=20),
                   xaxis=x_axis)

iplot(dict(data=data, layout=layout))

# Preprocessing the Data

#### <li>Let's see how to do the encoding</li>
> <b>First (generation)</b>

In [None]:
# Independent (deep) copy of the working frame that will receive the encoded columns.
df_k = df.copy(deep=True)

In [None]:
# Number the generation categories: {code: label}.
gen_cat = df['generation'].astype('category')
gen_dict = {code: label for code, label in enumerate(gen_cat.cat.categories)}
print(gen_dict)

> <b>Second (age)</b>

In [None]:
# Number the age-group categories: {code: label}.
age_cat = df['age'].astype('category')
age_dict = {code: label for code, label in enumerate(age_cat.cat.categories)}
print(age_dict)

> <b>Third (sex)</b>

In [None]:
# Number the sex categories: {code: label}.
sex_cat = df['sex'].astype('category')
sex_dict = {code: label for code, label in enumerate(sex_cat.cat.categories)}
print(sex_dict)

### Let's do the encoding 

In [None]:
# Invert the {code: label} dictionaries into {label: code} lookups and use them
# to replace the labels in the three categorical columns of df_k.
sex_map = {label: code for code, label in sex_dict.items()}
age_map = {label: code for code, label in age_dict.items()}
gen_map = {label: code for code, label in gen_dict.items()}

column_encodings = {'sex': sex_map, 'age': age_map, 'generation': gen_map}
df_k = df_k.replace(column_encodings)
df_k

> <b>Let's see the results</b>

In [None]:
# Put the original 'age' labels next to their codes (df_k's 'age', renamed to
# 'cat_age') to inspect the encoding. The join key is df's index passed as an array.
d = pd.merge(df['age'],df_k.rename(columns={'age':'cat_age'}),on=df.index)[['age','cat_age']]
d.head()

In [None]:
# Put the original 'sex' labels next to their codes (df_k's 'sex', renamed to
# 'cat_sex') to inspect the encoding. The join key is df's index passed as an array.
d = pd.merge(df['sex'],df_k.rename(columns={'sex':'cat_sex'}),on=df.index)[['sex','cat_sex']]
d.head()

In [None]:
# Put the original 'generation' labels next to their codes (df_k's 'generation',
# renamed to 'cat_generation') to inspect the encoding. The join key is df's
# index passed as an array.
d = pd.merge(df['generation'],df_k.rename(columns={'generation':'cat_generation'}),on=df.index)[['generation','cat_generation']]
d.head()

#### Before applying k-Means Let's apply the elbow method to know the best number of clusters (K)

In [None]:
# Keep the columns from 'year' through 'generation', discard rows with missing
# values, and reduce the frame to the three encoded categorical columns.
df_k = df_k.loc[:, 'year':'generation'].dropna()
df_k = df_k[['sex', 'generation', 'age']]

# Bring the numeric measures over from df (aligned on the index), in this column order.
for measure in ('suicides_no', 'gdp_for_year', 'population', 'suicides/100k pop', 'gdp_per_capita'):
    df_k[measure] = df[measure]

In [None]:
from sklearn.cluster import KMeans

# Feature matrix used for every fit; selected once outside the loop.
elbow_features = df_k[['sex', 'generation', 'age', 'population', 'suicides/100k pop', 'gdp_per_capita']]

# Inertia (within-cluster sum of squares) for 1 to 29 clusters.
# random_state fixes the centroid initialisation so the curve is the same on
# every run; n_init is spelled out so the number of restarts does not depend on
# the installed scikit-learn version's default.
k = []
for i in range(1, 30):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(elbow_features)
    k.append(kmeans.inertia_)

In [None]:
# Pair every cluster count (1..29) with the inertia measured for it.
elbow_df = pd.DataFrame({'Number_of_clusters': range(1, 30),
                         'Sum_of_intra_cluster_squares': k})

In [None]:
# Plot the inertia against the actual number of clusters. Plotting the column
# on its own would use the 0-based row index as x, so every point would be read
# one cluster too low.
elbow_df.iplot(kind='line',
               x='Number_of_clusters',
               y='Sum_of_intra_cluster_squares',
               xTitle='Number of clusters',
               yTitle='Sum of intra-cluster squares',
               title='Elbow-Method results')

><b>We find that the best number of clusters is 3</b>

### Now let's apply K-Means with (k=3)

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Three clusters, fitted in mini-batches of 10 rows with a fixed seed.
cluster_features = ['sex', 'generation', 'age', 'population', 'suicides/100k pop', 'gdp_per_capita']
kmeans = MiniBatchKMeans(n_clusters=3, batch_size=10, random_state=0)

# Cluster label of every row of df_k.
y_pred = kmeans.fit_predict(df_k[cluster_features])

In [None]:
# Attach the labels by position. y_pred has one entry per row of df_k in row
# order; wrapping it in a Series first would align on index labels 0..n-1
# instead, which misplaces labels whenever df_k's index is not exactly 0..n-1
# (for instance after rows were dropped).
df_k['k_means_clusters'] = y_pred
df_k.head()

### Let's Visualize the clusters

In [None]:
# Suicides number against GDP per capita, coloured by cluster label.
px.scatter(df_k, x='gdp_per_capita', y='suicides_no', color='k_means_clusters')

In [None]:
# Suicides number against yearly GDP, coloured by cluster label.
px.scatter(df_k, x='gdp_for_year', y='suicides_no', color='k_means_clusters')

In [None]:
# Suicides number against population, coloured by cluster label.
px.scatter(df_k, x='population', y='suicides_no', color='k_means_clusters')

In [None]:
# 3-D view: yearly GDP, encoded generation and suicides number, coloured by cluster label.
px.scatter_3d(df_k, x='gdp_for_year', y='generation', z='suicides_no', color='k_means_clusters')

In [None]:
# 3-D view: GDP per capita, yearly GDP and population, coloured by cluster label.
px.scatter_3d(df_k, x='gdp_per_capita', y='gdp_for_year', z='population', color='k_means_clusters')

In [None]:
# 3-D view: GDP per capita, yearly GDP and suicides/100k pop, coloured by cluster label.
px.scatter_3d(df_k, x='gdp_per_capita', y='gdp_for_year', z='suicides/100k pop', color='k_means_clusters')

# Thanks
## Please Upvote my notebook if you found it useful