In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the suicide-records CSV with the pure-Python parser, decoding it as ISO-8859-1.
DATASET_PATH = '/kaggle/input/dados-de-suicdio-no-brasil-2014-a-2018/datasus_suicidio_2014_2018.csv'
df = pd.read_csv(DATASET_PATH, engine='python', encoding='ISO-8859-1')
# Preview the first rows.
df.head()

I think it is a good idea to rename the columns.

In [None]:
# Short names assigned positionally: one entry per column, in the file's column order.
short_column_names = [
    'uf', 'ano', 'circ_obito', 'data_obito', 'data_nasc', 'genero',
    'raca_cor', 'est_civ', 'esc', 'ocupacao', 'municipio', 'local_obito',
    'assist_med', 'cid', 'cid_o', 'idade', 'mes',
]
df.columns = short_column_names
df.head()

Check for missing values

In [None]:
# Percentage of missing entries in each column, rounded to two decimals.
missing_pct = df.isna().sum() / len(df) * 100
missing_pct.round(2)

Drop the columns that are not important to the analysis.

In [None]:
# Columns left out of the analysis.
unused_columns = ['circ_obito', 'assist_med', 'ocupacao', 'cid', 'cid_o']
df = df.drop(columns=unused_columns)
# Remaining count of missing values per column.
df.isna().sum()

Filling some missing values

In [None]:

# Back-fill gaps in the demographic columns: each missing entry takes the value
# of the next non-missing row below it in the same column.
#
# This is done with one assignment instead of `df[col].fillna(method='bfill',
# inplace=True)` per column, because:
#   * the `method=` argument of `fillna` is deprecated in recent pandas, and
#   * an in-place call on the `df[col]` selection is chained assignment, which
#     does not update `df` once Copy-on-Write is enabled.
bfill_columns = ['data_nasc', 'genero', 'raca_cor', 'est_civ', 'esc', 'local_obito', 'idade']
df[bfill_columns] = df[bfill_columns].bfill()
# NOTE: a value missing in the final row(s) has nothing below it to copy from
# and therefore stays NaN; the summary below shows whether any remain.
df.isna().sum()

It seems to follow the total population distribution.

In [None]:
# Bar chart of the number of records per state (uf), most frequent first.
fig, ax = plt.subplots(figsize=(12, 6))
df['uf'].value_counts().plot.bar(ax=ax)

OK, the number of cases per year grows roughly linearly.

In [None]:
# Bar chart of the number of records per year (ano), in chronological order.
fig, ax = plt.subplots()
cases_per_year = df['ano'].value_counts().sort_index()
cases_per_year.plot.bar(ax=ax)

If we look at state (uf) × year (ano), the pattern is similar.

In [None]:
# Non-null cell counts per (state, year). The 'idade' column of this frame is
# the number of records in that state and year that have an age value.
group_counts = df.groupby(['uf', 'ano']).count().reset_index()

# States as rows, years as columns. `aggfunc` is passed as the string 'sum':
# handing pandas the builtin `sum` is deprecated and emits a FutureWarning.
# margins=True adds an 'All' row and column holding the totals.
table = pd.pivot_table(
    group_counts,
    index='uf',
    columns='ano',
    values='idade',
    aggfunc='sum',
    margins=True,
)

# The 'All' totals are only needed to rank the states from most to fewest
# records; remove the margin column and row so they are not plotted.
table = table.sort_values('All', ascending=False)
table = table.drop(columns='All').drop(index='All')

# One group of bars per state, one bar per year.
table.plot.bar(figsize=(24, 6))

Checking by age

Most suicides occur among people between 30 and 40 years old.

In [None]:
# Histogram of age (idade): raw counts in 100 bins spanning 0 to 100.
fig, ax = plt.subplots(figsize=(12, 6))
df['idade'].hist(ax=ax, density=False, bins=100, range=[0, 100])

If we look at race/ethnicity:

In [None]:
# Share of each race/colour category as a percentage of all rows, two decimals.
race_share = df['raca_cor'].value_counts() / len(df) * 100
race_share.round(2)

In Brazil, I think we can combine 'Parda' and 'Preta' into a single group of non-white people (labelled 'Preta' below).

It follows the distribution of the population, according to the last census.

In [None]:
# Relabel every 'Parda' row as 'Preta' so both are counted as one category.
is_parda = df['raca_cor'] == 'Parda'
df.loc[is_parda, 'raca_cor'] = 'Preta'
# Recompute the percentage of all rows per category after the merge.
merged_race_share = df['raca_cor'].value_counts() / len(df) * 100
merged_race_share.round(2)

We have something here!
According to the last census, the female population is slightly higher than the male.

Can I say that men commit suicide more often than women?

In [None]:
# Pie chart of records per gender; each slice is labelled with its percentage.
gender_counts = df['genero'].value_counts()
gender_counts.plot.pie(autopct='%0.2f%%', figsize=(8, 8))

In every age group from 20 to 100 years, the share of female suicides is below 25%.

In [None]:
# Ten-year age bands from 20 to 100. pd.cut builds right-closed intervals by
# default, so ages of 20 or less, or above 100, fall outside every band and
# are excluded from this breakdown.
age_groups = pd.cut(df['idade'], bins=[20, 30, 40, 50, 60, 70, 80, 90, 100])

# `observed=False` is spelled out because grouping by a categorical key without
# it relies on a default that pandas has deprecated.
gender_by_age = df.groupby(age_groups, observed=False)['genero']

# Percentage of each gender within its age band (non-null genders only),
# indexed by (age band, gender).
df_age_gender = round(gender_by_age.value_counts() / gender_by_age.count() * 100, 2)

# Plot with one column per gender so that every gender keeps a single colour,
# the bars are really stacked per age band, and a legend identifies them.
# Plotting the (age band, gender) Series directly colours bars by position only.
# Columns follow overall frequency: the most frequent gender is drawn in blue.
gender_order = df['genero'].value_counts().index
age_gender_table = df_age_gender.unstack(level=-1).reindex(columns=gender_order)
age_gender_table.plot.bar(color=['blue', 'pink'], figsize=(12, 6), stacked=True)

If we look at each state, the share of female suicides never exceeds 30%.
So we can say that men commit suicide more often than women.


In [None]:
# Percentage of each gender within every state (non-null genders only),
# indexed by (state, gender).
gender_by_state = df.groupby(['uf'])['genero']
df_uf_gender = gender_by_state.value_counts() / gender_by_state.count() * 100

# Plot with one column per gender so that every gender keeps a single colour
# across all states and a legend identifies them. Plotting the (state, gender)
# Series directly colours bars by position only.
# Columns follow overall frequency: the most frequent gender is drawn in blue.
gender_order = df['genero'].value_counts().index
uf_gender_table = df_uf_gender.unstack(level=-1).reindex(columns=gender_order)
uf_gender_table.plot.bar(color=['blue', 'pink'], figsize=(24, 6))

Let's group by quarter.

We can see that there are more suicides in the first and last quarters of every year.

In [None]:
# Turn the month column into a quarterly Period. Only the month is parsed
# (format '%m'), so the year inside each Period is a parsing placeholder;
# the real year of each record is in 'ano'.
df['quarter'] = pd.to_datetime(df['mes'].values, format='%m').map(lambda x: pd.Period(x, 'Q'))

# Non-null cell counts per (year, quarter). The 'idade' column of this frame is
# the number of records in that year and quarter that have an age value.
group_counts = df.groupby(['ano', 'quarter']).count().reset_index()

# Years as rows, quarters as columns. `aggfunc` is passed as the string 'sum':
# handing pandas the builtin `sum` is deprecated and emits a FutureWarning.
# No margins are requested because the totals were never used for this chart.
table = pd.pivot_table(group_counts, index='ano', columns='quarter', values='idade', aggfunc='sum')

# One group of bars per year, one bar per quarter.
table.plot.bar(figsize=(12, 6))


In [None]:
# Bar chart of the number of records per month (mes), ordered by month value.
s = df['mes'].value_counts()
month_counts = s.sort_index()
month_counts.plot.bar(figsize=(12, 6))
