In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata

In [2]:
df = pd.read_csv('../data/master.csv')
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [3]:
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset has 27820 rows and 12 columns.


In [4]:
print('The column names are: ', df.columns.tolist())

The column names are:  ['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']


In [5]:
print('The data types of the columns are:\n', df.dtypes)

The data types of the columns are:
 country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object


In [6]:
print(f'There are {df.duplicated().sum()} duplicate values in the dataset.')

There are 0 duplicate values in the dataset.


In [7]:
# Checking for missing values
df.isnull().sum()

country                   0
year                      0
sex                       0
age                       0
suicides_no               0
population                0
suicides/100k pop         0
country-year              0
HDI for year          19456
 gdp_for_year ($)         0
gdp_per_capita ($)        0
generation                0
dtype: int64

In [8]:
print('The possible values in the age column are: ', df['age'].unique())
print('The possible values in the sex column are: ', df['sex'].unique())
print('The possible values in the generation column are: ', df['generation'].unique())

The possible values in the age column are:  ['15-24 years' '35-54 years' '75+ years' '25-34 years' '55-74 years'
 '5-14 years']
The possible values in the sex column are:  ['male' 'female']
The possible values in the generation column are:  ['Generation X' 'Silent' 'G.I. Generation' 'Boomers' 'Millenials'
 'Generation Z']


In [9]:
df = df.drop(columns='HDI for year')
# validation
print('Updated columns: ', df.columns.tolist())

Updated columns:  ['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']


In [10]:
df['gdp_for_year ($)'] = df[' gdp_for_year ($) '].str.replace(',', '')
df['gdp_for_year ($)'] = df['gdp_for_year ($)'].astype(float)
# validation
print('The data types of the columns now:\n', df.dtypes)

The data types of the columns now:
 country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
gdp_for_year ($)      float64
dtype: object


In [None]:
# Ordering Age
age_order = ['5-14 years', '15-24 years', '25-34 years', '35-54 years', '55-74 years', '75+ years']
df['age'] = pd.Categorical(df['age'], categories=age_order, ordered=True)