# Analysis of Suicide Rates from 1985 to 2016

In [1]:
# import libraries

import pandas as pd

In [2]:
# Read the suicide dataset from CSV into a DataFrame.
DATA_FILE = 'suicide_analysis.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [4]:
# Show each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


In [5]:
# Shape of the DataFrame as a tuple: (number of rows, number of columns).
df.shape

(27820, 12)

In [6]:
# List the column labels. Note in the output that the GDP-for-year label carries
# leading and trailing spaces.
df.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [7]:
# Store `sex` as a categorical column instead of generic object strings.
df = df.astype({'sex': 'category'})

In [8]:
# Replace the awkward raw labels (slashes, spaces, "($)" suffixes) with snake_case names.
# The GDP-for-year key keeps its surrounding spaces so it matches the raw label exactly.
column_renames = {
    'suicides_no': 'no_of_suicides',
    'suicides/100k pop': 'suicides_by_100k_pop',
    'HDI for year': 'HDI_for_year',
    ' gdp_for_year ($) ': 'gdp_for_year',
    'gdp_per_capita ($)': 'gdp_per_capita',
}
df = df.rename(columns=column_renames)

In [9]:
# Convert gdp_for_year from text to a numeric dtype by removing the ',' separators
# and parsing what is left.
# regex=False makes the comma a literal match, so the result does not depend on the
# default of `Series.str.replace`'s `regex` argument in the installed pandas version.
df['gdp_for_year'] = pd.to_numeric(
    df['gdp_for_year'].str.replace(',', '', regex=False)
)

In [10]:
# Re-inspect the schema to confirm the renames and dtype conversions took effect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country                 27820 non-null object
year                    27820 non-null int64
sex                     27820 non-null category
age                     27820 non-null object
no_of_suicides          27820 non-null int64
population              27820 non-null int64
suicides_by_100k_pop    27820 non-null float64
country-year            27820 non-null object
HDI_for_year            8364 non-null float64
gdp_for_year            27820 non-null int64
gdp_per_capita          27820 non-null int64
generation              27820 non-null object
dtypes: category(1), float64(2), int64(5), object(4)
memory usage: 2.4+ MB


In [11]:
# Replace missing HDI_for_year values with 0.
# NOTE(review): only 8364 of 27820 rows have an HDI value, so most of this column
# becomes 0 after the fill and any later statistic on it will be dominated by the
# placeholder. Confirm that 0 is the intended sentinel.
df = df.fillna({'HDI_for_year': 0})

In [12]:
# Distinct values of the country column.
df['country'].unique()

array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Barbados', 'Belarus', 'Belgium', 'Belize',
       'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cabo Verde',
       'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominica', 'Ecuador',
       'El Salvador', 'Estonia', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hungary',
       'Iceland', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Kazakhstan', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Macau', 'Maldives', 'Malta',
       'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands',
       'New Zealand', 'Nicaragua', 'Norway', 'Oman', 'Panama', 'Paraguay',
       'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
       'Republic of Korea', 'Romania', '

In [13]:
# Distinct age bands present in the data.
df['age'].unique()

array(['15-24 years', '35-54 years', '75+ years', '25-34 years',
       '55-74 years', '5-14 years'], dtype=object)

In [14]:
# Drop country-year: the frame already has separate country and year columns.
df = df.drop(columns=['country-year'])

In [15]:
# Distinct generation labels present in the data.
df['generation'].unique()

array(['Generation X', 'Silent', 'G.I. Generation', 'Boomers',
       'Millenials', 'Generation Z'], dtype=object)

In [16]:
# Mean of no_of_suicides per row, split by sex.
# The male average comes out higher than the female average.
df.groupby('sex')['no_of_suicides'].agg('mean')

sex
female    112.114306
male      373.034508
Name: no_of_suicides, dtype: float64

In [17]:
# Mean of no_of_suicides per row for each age band.
# The 35-54 years band has the largest average.
df.groupby('age').no_of_suicides.mean()

age
15-24 years    174.179664
25-34 years    242.118053
35-54 years    528.250969
5-14 years      11.337093
55-74 years    357.269065
75+ years      140.697544
Name: no_of_suicides, dtype: float64

In [18]:
# Total no_of_suicides for every (age band, generation) pair that occurs in the data,
# showing how the two groupings overlap and how many suicides fall in each pairing.
grouping_keys = ['age', 'generation']
df.groupby(grouping_keys)['no_of_suicides'].sum()

age          generation     
15-24 years  Generation X        394312
             Millenials          414230
25-34 years  Boomers             304163
             Generation X        640551
             Millenials          179198
35-54 years  Boomers            1689019
             Generation X        491614
             Silent              271508
5-14 years   Generation X          6327
             Generation Z         15906
             Millenials           30031
55-74 years  Boomers             291316
             G.I. Generation     208608
             Silent             1158519
75+ years    G.I. Generation     301401
             Silent              351717
Name: no_of_suicides, dtype: int64

In [19]:
# Total no_of_suicides for each (sex, generation) pair.
# Among females the Silent generation has the largest total; among males it is Boomers.
df.groupby(['sex', 'generation']).no_of_suicides.agg('sum')

sex     generation     
female  Boomers             460968
        G.I. Generation     176653
        Generation X        309839
        Generation Z          6141
        Millenials          133620
        Silent              472289
male    Boomers            1823530
        G.I. Generation     333356
        Generation X       1222965
        Generation Z          9765
        Millenials          489839
        Silent             1309455
Name: no_of_suicides, dtype: int64

In [20]:
# Average no_of_suicides per row for each country, largest first.
# This ranks raw counts (not the per-100k rate); Russian Federation has the
# highest average.
highest_no_of_suicide = (
    df.groupby('country')['no_of_suicides']
    .mean()
    .sort_values(ascending=False)
)
highest_no_of_suicide.head()


country
Russian Federation    3733.771605
United States         2779.604839
Japan                 2169.091398
Ukraine                952.232143
Germany                933.532051
Name: no_of_suicides, dtype: float64

In [21]:
# Average no_of_suicides per row by country, restricted to 1985, largest first.
# United States has the highest average for that year.
country_highest_suicide_1985 = (
    df.loc[df['year'] == 1985]
    .groupby('country')['no_of_suicides']
    .mean()
    .sort_values(ascending=False)
)
# Top five and bottom five, separated by one blank line.
print(country_highest_suicide_1985.head(), country_highest_suicide_1985.tail(), sep='\n\n')

country
United States     2453.833333
Japan             1938.083333
France            1041.750000
Sri Lanka          472.333333
United Kingdom     425.416667
Name: no_of_suicides, dtype: float64

country
Saint Vincent and Grenadines    0.166667
Grenada                         0.083333
Bahamas                         0.083333
Dominica                        0.000000
Antigua and Barbuda             0.000000
Name: no_of_suicides, dtype: float64


In [22]:
# Average no_of_suicides per row by country, restricted to 2016, largest first.
# Thailand has the highest average among the countries that have 2016 rows.
# NOTE(review): the yearly totals put 2016 far below every other year shown, so
# 2016 coverage looks partial. Confirm before comparing it with other years.
rows_2016 = df[df['year'].eq(2016)]
country_highest_suicide_2016 = (
    rows_2016.groupby('country')['no_of_suicides']
    .mean()
    .sort_values(ascending=False)
)
print(country_highest_suicide_2016.head())
print()
print(country_highest_suicide_2016.tail())

country
Thailand          411.7
Romania           195.3
Netherlands       188.6
Hungary           176.1
Czech Republic    131.8
Name: no_of_suicides, dtype: float64

country
Qatar      6.8
Armenia    6.7
Iceland    4.0
Cyprus     3.6
Grenada    0.0
Name: no_of_suicides, dtype: float64


In [23]:
# Total no_of_suicides per year, largest first.
# 1999 has the highest total and 2016 the lowest.
# NOTE(review): the 2016 total is several times smaller than the next-lowest year,
# which suggests incomplete data for 2016 rather than a real drop. Confirm.
year_wise_no_of_suicide = (
    df.groupby('year')['no_of_suicides']
    .sum()
    .sort_values(ascending=False)
)
print(year_wise_no_of_suicide.head())
print()
print(year_wise_no_of_suicide.tail())

year
1999    256119
2002    256095
2003    256079
2000    255832
2001    250652
Name: no_of_suicides, dtype: int64

year
1987    126842
1988    121026
1986    120670
1985    116063
2016     15603
Name: no_of_suicides, dtype: int64


## To be continued...