# Initial Exploration

## Read In Data

In [23]:
import pandas as pd
# Load the raw dataset; the relative path is resolved against the notebook's
# working directory.
df = pd.read_csv("suiciderates.csv")
# Bare expression so the frame is rendered as this cell's output.
df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.00,Albania1987,,2156624900,796,Generation X


In [3]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


### Observations:
- Age is grouped into year buckets already.
- Year goes to 2016.
- Sex should be changed to a boolean.
- HDI (Human Development Index) has a lot of missing values (only 8,364 of 27,820 rows are non-null). Consider removing. No other columns seem to have missing values.
- gdp_for_year is stored as text (object dtype) and should probably be converted to a numeric type.
- Generation could be made boolean as well, using one 0/1 column per generation.

## Cleaning:

### Making Generations Boolean:

In [4]:
# Work on a copy so the raw frame `df` is left unchanged by the steps below.
df_split_gen = df.copy()
# Number of rows per generation label.
df_split_gen['generation'].value_counts()

Generation X       6408
Silent             6364
Millenials         5844
Boomers            4990
G.I. Generation    2744
Generation Z       1470
Name: generation, dtype: int64

In [9]:
def gen_x_bool(gen):
    """Return 1 if ``gen`` is the label "Generation X", otherwise 0."""
    return 1 if gen == "Generation X" else 0
def silent_bool(gen):
    """Return 1 if ``gen`` is the label "Silent", otherwise 0."""
    return 1 if gen == "Silent" else 0
def mil_bool(gen):
    """Return 1 if ``gen`` is the label "Millenials", otherwise 0.

    The spelling matches the label as it appears in the dataset.
    """
    return 1 if gen == "Millenials" else 0
def boom_bool(gen):
    """Return 1 if ``gen`` is the label "Boomers", otherwise 0."""
    return 1 if gen == "Boomers" else 0
def g_i_bool(gen):
    """Return 1 if ``gen`` is the label "G.I. Generation", otherwise 0."""
    return 1 if gen == "G.I. Generation" else 0
def z_bool(gen):
    """Return 1 if ``gen`` is the label "Generation Z", otherwise 0."""
    return 1 if gen == "Generation Z" else 0
    
# Add one 0/1 column per generation label, each named after the label it
# flags, and print the 0/1 counts of every new column right after creating it.
generation_flaggers = (
    ("Generation X", gen_x_bool),
    ("Silent", silent_bool),
    ("Millenials", mil_bool),  # spelling as it appears in the data
    ("Boomers", boom_bool),
    ("G.I. Generation", g_i_bool),
    ("Generation Z", z_bool),
)
for flag_column, flagger in generation_flaggers:
    df_split_gen[flag_column] = df_split_gen["generation"].map(flagger)
    print(df_split_gen[flag_column].value_counts())

0    21412
1     6408
Name: Generation X, dtype: int64
0    21456
1     6364
Name: Silent, dtype: int64
0    21976
1     5844
Name: Millenials, dtype: int64
0    22830
1     4990
Name: Boomers, dtype: int64
0    25076
1     2744
Name: G.I. Generation, dtype: int64
0    26350
1     1470
Name: Generation Z, dtype: int64


In [10]:
# Check that the six generation indicator columns were added.
df_split_gen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 18 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
Generation X          27820 non-null int64
Silent                27820 non-null int64
Millenials            27820 non-null int64
Boomers               27820 non-null int64
G.I. Generation       27820 non-null int64
Generation Z          27820 non-null int64
dtypes: float64(2), int64(10), object(6)
memory usage: 3.8+ MB


### Drop the generation column

In [12]:
# Build a new frame without the text "generation" column (now represented by
# the indicator columns); df_split_gen itself is not modified.
drop_gen = df_split_gen.drop(columns="generation")
drop_gen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 17 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
Generation X          27820 non-null int64
Silent                27820 non-null int64
Millenials            27820 non-null int64
Boomers               27820 non-null int64
G.I. Generation       27820 non-null int64
Generation Z          27820 non-null int64
dtypes: float64(2), int64(10), object(5)
memory usage: 3.6+ MB


### Changing sex to Boolean:

In [13]:
# Encode sex as a 0/1 integer column and rename it to "male" so that the
# meaning of 1 is obvious (1 = male, 0 = female).
#
# The mapping is computed from the column and assigned back through assign(),
# instead of calling replace(..., inplace=True) on df_sex["sex"]: that chained
# form works on a temporary Series, is deprecated in pandas 2.x and leaves the
# frame unchanged once Copy-on-Write is active.
sex_to_male_flag = {"female": 0, "male": 1}
df_sex = (
    drop_gen
    # astype raises if a value other than "female"/"male" is present, rather
    # than silently leaving a mixed column behind.
    .assign(sex=drop_gen["sex"].map(sex_to_male_flag).astype("int64"))
    .rename(columns={"sex": "male"})
)
df_sex

Unnamed: 0,country,year,male,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),Generation X,Silent,Millenials,Boomers,G.I. Generation,Generation Z
0,Albania,1987,1,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,1,0,0,0,0,0
1,Albania,1987,1,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,0,1,0,0,0,0
2,Albania,1987,0,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,1,0,0,0,0,0
3,Albania,1987,1,75+ years,1,21800,4.59,Albania1987,,2156624900,796,0,0,0,0,1,0
4,Albania,1987,1,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,0,0,0,1,0,0
5,Albania,1987,0,75+ years,1,35600,2.81,Albania1987,,2156624900,796,0,0,0,0,1,0
6,Albania,1987,0,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,0,1,0,0,0,0
7,Albania,1987,0,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,0,0,0,1,0,0
8,Albania,1987,1,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,0,0,0,0,1,0
9,Albania,1987,0,5-14 years,0,311000,0.00,Albania1987,,2156624900,796,1,0,0,0,0,0


### Making Age Groups Boolean:

In [14]:
# Work on a copy so df_sex is left unchanged by the steps below.
age_bool = df_sex.copy()
# Number of rows per age bucket.
age_bool['age'].value_counts()

35-54 years    4642
55-74 years    4642
15-24 years    4642
25-34 years    4642
75+ years      4642
5-14 years     4610
Name: age, dtype: int64

In [17]:
def g1_bool(age):
    """Return 1 if ``age`` is the bucket label "5-14 years", otherwise 0."""
    return 1 if age == "5-14 years" else 0
def g2_bool(age):
    """Return 1 if ``age`` is the bucket label "15-24 years", otherwise 0."""
    return 1 if age == "15-24 years" else 0
def g3_bool(age):
    """Return 1 if ``age`` is the bucket label "25-34 years", otherwise 0."""
    return 1 if age == "25-34 years" else 0
def g4_bool(age):
    """Return 1 if ``age`` is the bucket label "35-54 years", otherwise 0."""
    return 1 if age == "35-54 years" else 0
def g5_bool(age):
    """Return 1 if ``age`` is the bucket label "55-74 years", otherwise 0."""
    return 1 if age == "55-74 years" else 0
def g6_bool(age):
    """Return 1 if ``age`` is the bucket label "75+ years", otherwise 0."""
    return 1 if age == "75+ years" else 0
    
# Add one 0/1 column per age bucket, each named after the bucket it flags, and
# print the 0/1 counts of every new column right after creating it.
age_flaggers = (
    ("5-14 years", g1_bool),
    ("15-24 years", g2_bool),
    ("25-34 years", g3_bool),
    ("35-54 years", g4_bool),
    ("55-74 years", g5_bool),
    ("75+ years", g6_bool),
)
for bucket_column, flagger in age_flaggers:
    age_bool[bucket_column] = age_bool["age"].map(flagger)
    print(age_bool[bucket_column].value_counts())

0    23210
1     4610
Name: 5-14 years, dtype: int64
0    23178
1     4642
Name: 15-24 years, dtype: int64
0    23178
1     4642
Name: 25-34 years, dtype: int64
0    23178
1     4642
Name: 35-54 years, dtype: int64
0    23178
1     4642
Name: 55-74 years, dtype: int64
0    23178
1     4642
Name: 75+ years, dtype: int64


In [18]:
# Check that the six age-bucket indicator columns were added.
age_bool.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 23 columns):
country               27820 non-null object
year                  27820 non-null int64
male                  27820 non-null int64
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
Generation X          27820 non-null int64
Silent                27820 non-null int64
Millenials            27820 non-null int64
Boomers               27820 non-null int64
G.I. Generation       27820 non-null int64
Generation Z          27820 non-null int64
5-14 years            27820 non-null int64
15-24 years           27820 non-null int64
25-34 years           27820 non-null int64
35-54 years           27

### Drop the age column

In [19]:
# Build a new frame without the text "age" column (now represented by the
# indicator columns); age_bool itself is not modified.
age_bool2 = age_bool.drop(columns="age")
age_bool2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 22 columns):
country               27820 non-null object
year                  27820 non-null int64
male                  27820 non-null int64
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
Generation X          27820 non-null int64
Silent                27820 non-null int64
Millenials            27820 non-null int64
Boomers               27820 non-null int64
G.I. Generation       27820 non-null int64
Generation Z          27820 non-null int64
5-14 years            27820 non-null int64
15-24 years           27820 non-null int64
25-34 years           27820 non-null int64
35-54 years           27820 non-null int64
55-74 years           278

### Drop HDI for year:

In [20]:
# Build a new frame without the sparsely populated "HDI for year" column.
# NOTE(review): this starts from age_bool rather than age_bool2, so the text
# "age" column removed in the previous step is still present in drop_hdi and
# in everything derived from it. Confirm whether that is intended.
drop_hdi = age_bool.drop(columns="HDI for year")
drop_hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 22 columns):
country               27820 non-null object
year                  27820 non-null int64
male                  27820 non-null int64
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
Generation X          27820 non-null int64
Silent                27820 non-null int64
Millenials            27820 non-null int64
Boomers               27820 non-null int64
G.I. Generation       27820 non-null int64
Generation Z          27820 non-null int64
5-14 years            27820 non-null int64
15-24 years           27820 non-null int64
25-34 years           27820 non-null int64
35-54 years           27820 non-null int64
55-74 years           278

### Convert GDP to numerical values:

In [21]:
# Rename the two GDP columns and convert GDP-for-year from text to float.
#
# The raw header of the GDP-for-year column carries extra whitespace (it is
# listed as " gdp_for_year ($)" by .info()), which is why it could not be
# referenced by its apparent name. Columns are matched on their stripped name
# instead of by position, so this cell does not depend on how many columns
# precede them. rename() builds a new column Index rather than writing into
# .columns.values, whose buffer may be shared with the frame it was copied from.
# Renaming help from: https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas/39770407
gdp_renames = {
    "gdp_for_year ($)": "gdp_for_year $",
    "gdp_per_capita ($)": "gdp_per_capita $",
}
gdp_to_num = drop_hdi.rename(
    columns=lambda name: gdp_renames.get(name.strip(), name)
)

# Change the datatype of gdp_for_year: the column is stored as text, so the
# commas are removed (literal match, not a regex) before casting to float.
gdp_to_num["gdp_for_year $"] = (
    gdp_to_num["gdp_for_year $"].str.replace(",", "", regex=False).astype(float)
)
gdp_to_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 22 columns):
country              27820 non-null object
year                 27820 non-null int64
male                 27820 non-null int64
age                  27820 non-null object
suicides_no          27820 non-null int64
population           27820 non-null int64
suicides/100k pop    27820 non-null float64
country-year         27820 non-null object
gdp_for_year $       27820 non-null float64
gdp_per_capita $     27820 non-null int64
Generation X         27820 non-null int64
Silent               27820 non-null int64
Millenials           27820 non-null int64
Boomers              27820 non-null int64
G.I. Generation      27820 non-null int64
Generation Z         27820 non-null int64
5-14 years           27820 non-null int64
15-24 years          27820 non-null int64
25-34 years          27820 non-null int64
35-54 years          27820 non-null int64
55-74 years          27820 non-null int64
75

## Split training and test set

In [22]:
from sklearn.model_selection import train_test_split

# Split from a copy so gdp_to_num is not touched by anything done to the
# modelling data.
data2 = gdp_to_num.copy()

# Hold out 20% of the rows as the test set; random_state is fixed so the same
# split is produced on every run.
train_set, test_set = train_test_split(
    data2,
    random_state=123,
    test_size=0.2,
)
print(
    "traning set length:", len(train_set),
    "\ntest set length:", len(test_set),
)

traning set length: 22256 
test set length: 5564
