# Data Cleaning and Exploration

### Project Proposal / Outline

Our project is to uncover patterns in worldwide suicide rates. We will examine the relationships between number of suicides across multiple countries, years, genders, ages and several demographic and socioeconomic factors in order to understand if there are underlying factors influencing suicide rates across the world.

### Team Members
* Ayala, Enrique
* Burnes, Javier
* Guarnieri, Richard
* Macias, Erick
* Rello, Carlos

### Import Dependencies

In [336]:
# import dependencies
import pandas as pd

### Import CSV files and load into a Pandas DataFrame

In [257]:
# import suicides rates and demographic / socioeconomic UNESCO datasets
suicides_csv = pd.read_csv('Suicide_Rates_Overview_1985_to_2016/suicide_rates.csv')
unesco_csv = pd.read_csv('Demographic_and_Socioeconomic_UNESCO/DEMO_DS_29112019163028002.csv')

# create DataFrames
suicides_df = pd.DataFrame(suicides_csv)
unesco_df = pd.DataFrame(unesco_csv)

In [258]:
# display suicides_df
suicides_df.head(3)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X


In [259]:
# display unesco_df
unesco_df.head(3)

Unnamed: 0,DEMO_IND,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
0,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1970,1970,2.859,,
1,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1971,1971,2.961,,
2,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1972,1972,2.744,,


### Clean up of country names i.e. non-existing countries, counties named differently in both tables, etc.

In [260]:
# create frame for countries in suicides_df that are not in unesco_df and choose a sample of 5 rows
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])].sample(5)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
26675,United Kingdom,2001,female,75+ years,106,2801024,3.78,United Kingdom2001,,1621510004318,29179,Silent
26997,United States,1997,female,35-54 years,2758,39291000,7.02,United States1997,,8608515000000,34644,Boomers
26928,United States,1991,female,25-34 years,1153,21450000,5.38,United States1991,,6174043000000,26503,Boomers
27080,United States,2004,male,15-24 years,3596,21438383,16.77,United States2004,,12274928000000,44867,Millenials
26804,United Kingdom,2012,male,15-24 years,364,4204765,8.66,United Kingdom2012,0.901,2662085168499,44585,Millenials


In [261]:
# see countries in suicides_df that are not in unesco_df
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])]['country'].unique()

array(['Czech Republic', 'Macau', 'Saint Vincent and Grenadines',
       'United Kingdom', 'United States'], dtype=object)

#### Clean up of Czech Republic

In [262]:
# search for countries in unesco_df in case country exists but is typed differently
unesco_df[(unesco_df['Country'].str.contains('Cze'))]['Country'].unique()

array(['Czechia'], dtype=object)

In [263]:
# replace country name in unesco_df
unesco_df['Country'].replace({
    'Czechia': 'Czech Republic',
}, inplace=True)

# re-run to see countries in suicides_df that are not in unesco_df
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])]['country'].unique()

array(['Macau', 'Saint Vincent and Grenadines', 'United Kingdom',
       'United States'], dtype=object)

#### Clean up of Macau

In [264]:
# search for countries in unesco_df in case country exists but is typed differently
unesco_df[(unesco_df['Country'].str.contains('Mac'))]['Country'].unique()

array(['North Macedonia', 'China, Macao Special Administrative Region'],
      dtype=object)

In [265]:
# replace country name in unesco_df
unesco_df['Country'].replace({
    'China, Macao Special Administrative Region': 'Macau',
}, inplace=True)

# re-run to see countries in suicides_df that are not in unesco_df
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])]['country'].unique()

array(['Saint Vincent and Grenadines', 'United Kingdom', 'United States'],
      dtype=object)

#### Clean up of Saint Vincent and Grenadines

In [266]:
# search for countries in unesco_df in case country exists but is typed differently
unesco_df[(unesco_df['Country'].str.contains('Saint'))]['Country'].unique()

array(['Saint Vincent and the Grenadines', 'Saint-Martin (French part)',
       'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Helena',
       'Saint Pierre and Miquelon', 'Saint-Barthélemy'], dtype=object)

In [267]:
# replace country name in unesco_df
unesco_df['Country'].replace({
    'Saint Vincent and the Grenadines': 'Saint Vincent and Grenadines',
}, inplace=True)

# re-run to see countries in suicides_df that are not in unesco_df
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])]['country'].unique()

array(['United Kingdom', 'United States'], dtype=object)

#### Clean up of United Kingdom and United States

In [268]:
# search for countries in unesco_df in case country exists but is typed differently
unesco_df[(unesco_df['Country'].str.contains('United'))]['Country'].unique()

array(['United States Virgin Islands', 'United Republic of Tanzania',
       'United Arab Emirates', 'United States of America',
       'United Kingdom of Great Britain and Northern Ireland'],
      dtype=object)

In [269]:
# replace country name in unesco_df
unesco_df['Country'].replace({
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom'
}, inplace=True)

# re-run to see countries in suicides_df that are not in unesco_df
suicides_df.loc[~suicides_df['country'].isin(unesco_df['Country'])]['country'].unique()

array([], dtype=object)

### Clean up of UNESCO DataFrame

In [270]:
# display unesco_df
unesco_df.head()

Unnamed: 0,DEMO_IND,Indicator,LOCATION,Country,TIME,Time,Value,Flag Codes,Flags
0,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1970,1970,2.859,,
1,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1971,1971,2.961,,
2,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1972,1972,2.744,,
3,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1973,1973,2.491,,
4,SP_DYN_TFRT_IN,"Fertility rate, total (births per woman)",AUS,Australia,1974,1974,2.397,,


#### create country-year column to allow it to merge with suicides_df

In [271]:
# explore dtypes
unesco_df.dtypes

DEMO_IND       object
Indicator      object
LOCATION       object
Country        object
TIME            int64
Time            int64
Value         float64
Flag Codes     object
Flags          object
dtype: object

In [272]:
# change dtype from int64 to str to allow concatenation of Country and Time
unesco_df['Time'] = unesco_df['Time'].astype(str)
unesco_df.dtypes

DEMO_IND       object
Indicator      object
LOCATION       object
Country        object
TIME            int64
Time           object
Value         float64
Flag Codes     object
Flags          object
dtype: object

In [273]:
# create country-year column
unesco_df['country-year'] = unesco_df['Country'] + unesco_df['Time']

In [274]:
# pivot columns
unesco_df = unesco_df.pivot(index='country-year', columns='Indicator', values='Value').reset_index()
unesco_df.head(3)

Indicator,country-year,DEC alternative conversion factor (LCU per US$),"Fertility rate, total (births per woman)",GDP (constant LCU),GDP (current LCU),GDP (current US$),GDP at market prices (constant 2010 US$),GDP deflator (base year varies by country),GDP growth (annual %),GDP per capita (current LCU),...,Population aged 15-24 years,Population aged 25-64 years,Population aged 65 years or older,Population growth (annual %),Poverty headcount ratio at $1.90 a day (PPP) (% of population),"Prevalence of HIV, total (% of population ages 15-49)",Price level ratio of PPP conversion factor (GDP) to market exchange rate,Rural population (% of total population),Total debt service (% of GNI),Total population
0,Afghanistan1970,45.0,7.45,,78699900000.0,1748887000.0,,,,7073.43401,...,2087.213,3839.638,294.047,2.47227,,,,88.357,,11173.642
1,Afghanistan1971,45.0,7.45,,82399900000.0,1831109000.0,,,,7216.7776,...,2128.966,3917.821,302.405,2.588,,,,87.979,,11475.445
2,Afghanistan1972,45.0,7.45,,71800000000.0,1595555000.0,,,,6125.26565,...,2177.263,4005.496,309.809,2.62866,,,,87.59,,11791.215


### Merge Suicides and UNESCO DataFrames

In [328]:
# merge DataFrames
merged_df = pd.merge(suicides_df, unesco_df, on='country-year', how='left')
merged_df.head(3)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),...,Population aged 15-24 years,Population aged 25-64 years,Population aged 65 years or older,Population growth (annual %),Poverty headcount ratio at $1.90 a day (PPP) (% of population),"Prevalence of HIV, total (% of population ages 15-49)",Price level ratio of PPP conversion factor (GDP) to market exchange rate,Rural population (% of total population),Total debt service (% of GNI),Total population
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,...,620.496,1287.052,169.096,1.99704,,,,64.381,,3124.894
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,...,620.496,1287.052,169.096,1.99704,,,,64.381,,3124.894
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,...,620.496,1287.052,169.096,1.99704,,,,64.381,,3124.894


In [329]:
# show columns
merged_df.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation',
       'DEC alternative conversion factor (LCU per US$)',
       'Fertility rate, total (births per woman)', 'GDP (constant LCU)',
       'GDP (current LCU)', 'GDP (current US$)',
       'GDP at market prices (constant 2010 US$)',
       'GDP deflator (base year varies by country)', 'GDP growth (annual %)',
       'GDP per capita (current LCU)', 'GDP per capita (current US$)',
       'GDP per capita, PPP (constant 2011 international $)',
       'GDP per capita, PPP (current international $)',
       'GDP, PPP (constant 2011 international $)',
       'GDP, PPP (current international $)', 'GNI (current LCU)',
       'GNI per capita (current LCU)',
       'GNI per capita, Atlas method (current US$)',
       'GNI per capita, PPP (current international $)',
       'General government total expenditur

In [330]:
# rename columns
summary_df = merged_df.rename(columns={
    'suicides_no': 'suicides',
    'suicides/100k pop': 'suicides/100k',
    'HDI for year': 'HDI',
    'Fertility rate, total (births per woman)': 'fertility_rate',
    'GDP (constant LCU)': 'GDP_cons_LCU',
    'GDP (current LCU)': 'GDP_curr_LCU',
    'GDP (current US$)': 'GDP_curr_US$',
    'suicides/100k pop': 'suicides/100k_pop(%)',
    'GDP growth (annual %)  ': 'GDP_annual_growth(%)',
    'GDP per capita (current LCU)': 'GDP_pc_curr_LCU',
    'GDP per capita (current US$)': 'GDP_pc_curr_US$',
    'GDP per capita, PPP (constant 2011 international $)': 'GDP_pc_PPP_constant_2011_intl_$',
    'GDP per capita, PPP (current international $)': 'GDP_pc_PPP_current_intl_$',
    'GDP, PPP (constant 2011 international $)': 'GDP_PPP_cons_2011_intl_$',
    'GDP, PPP (current international $)': 'GDP_PPP_curr_intl_$',
    'GNI (current LCU)': 'GNI_curr_LCU',
    'GNI per capita (current LCU)': 'GNI_pc_curr_LCU',
    'GNI per capita, Atlas method (current US$)': 'GNI_pc_Atlas_curr_US$',
    'GNI per capita, PPP (current international $)': 'GNI_pc_PPP_curr_intl_$',
    'General government total expenditure (current LCU)': 'govt_total_exp_curr_LCU',
    'Life expectancy at birth, total (years)': 'life_expectancy',
    'Mortality rate, infant (per 1,000 live births)': 'mortality_infant_rate/1k_births',
    'Poverty headcount ratio at $1.90 a day (PPP) (% of population)': 'poverty_$1.90/day(%)',
    'Prevalence of HIV, total (% of population ages 15-49)': 'HIV_rate(%)',
    'Rural population (% of total population)': 'rural_pop(%)',
    'Total debt service (% of GNI)': 'debt_service(%_GNI)',  
})

In [331]:
# drop non-useful columns
summary_df.drop(columns=['country-year',
                        ' gdp_for_year ($) ',
                        'gdp_per_capita ($)',
                        'DEC alternative conversion factor (LCU per US$)',
                        'GDP at market prices (constant 2010 US$)',
                        'GDP deflator (base year varies by country)',
                        'Official exchange rate (LCU per US$, period average)',
                        'PPP conversion factor, GDP (LCU per international $)',
                        'PPP conversion factor, private consumption (LCU per international $)',
                        'Population aged 14 years or younger ',
                        'Population aged 15-24 years ',
                        'Population aged 25-64 years ',
                        'Population aged 65 years or older ',
                        'Population growth (annual %)',
                        'Price level ratio of PPP conversion factor (GDP) to market exchange rate',
                        'Total population ',
                        ], inplace=True)
summary_df.head()

Unnamed: 0,country,year,sex,age,suicides,population,suicides/100k_pop(%),HDI,generation,fertility_rate,...,GNI_pc_curr_LCU,GNI_pc_Atlas_curr_US$,GNI_pc_PPP_curr_intl_$,govt_total_exp_curr_LCU,life_expectancy,mortality_infant_rate/1k_births,poverty_$1.90/day(%),HIV_rate(%),rural_pop(%),debt_service(%_GNI)
0,Albania,1987,male,15-24 years,21,312900,6.71,,Generation X,3.164,...,5399.38481,730.0,,,71.76,40.5,,,64.381,
1,Albania,1987,male,35-54 years,16,308000,5.19,,Silent,3.164,...,5399.38481,730.0,,,71.76,40.5,,,64.381,
2,Albania,1987,female,15-24 years,14,289700,4.83,,Generation X,3.164,...,5399.38481,730.0,,,71.76,40.5,,,64.381,
3,Albania,1987,male,75+ years,1,21800,4.59,,G.I. Generation,3.164,...,5399.38481,730.0,,,71.76,40.5,,,64.381,
4,Albania,1987,male,25-34 years,9,274300,3.28,,Boomers,3.164,...,5399.38481,730.0,,,71.76,40.5,,,64.381,


In [332]:
# re-order columns
cols = summary_df.columns.tolist()
cols = cols[:6] + list([cols[8]]) + list([cols[7]]) + list([cols[9]]) + cols[-6:] + cols[10:-7]

In [333]:
#display final summary_df
summary_df = summary_df[cols]
summary_df

Unnamed: 0,country,year,sex,age,suicides,population,generation,HDI,fertility_rate,life_expectancy,...,GDP_pc_curr_LCU,GDP_pc_curr_US$,GDP_pc_PPP_constant_2011_intl_$,GDP_pc_PPP_current_intl_$,GDP_PPP_cons_2011_intl_$,GDP_PPP_curr_intl_$,GNI_curr_LCU,GNI_pc_curr_LCU,GNI_pc_Atlas_curr_US$,GNI_pc_PPP_curr_intl_$
0,Albania,1987,male,15-24 years,21,312900,Generation X,,3.164,71.760,...,5.398347e+03,674.79338,,,,,1.664957e+10,5.399385e+03,730.0,
1,Albania,1987,male,35-54 years,16,308000,Silent,,3.164,71.760,...,5.398347e+03,674.79338,,,,,1.664957e+10,5.399385e+03,730.0,
2,Albania,1987,female,15-24 years,14,289700,Generation X,,3.164,71.760,...,5.398347e+03,674.79338,,,,,1.664957e+10,5.399385e+03,730.0,
3,Albania,1987,male,75+ years,1,21800,G.I. Generation,,3.164,71.760,...,5.398347e+03,674.79338,,,,,1.664957e+10,5.399385e+03,730.0,
4,Albania,1987,male,25-34 years,9,274300,Boomers,,3.164,71.760,...,5.398347e+03,674.79338,,,,,1.664957e+10,5.399385e+03,730.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,Generation X,0.675,2.457,71.039,...,4.741785e+06,2050.44841,5370.8658,5675.30962,1.651955e+11,1.745595e+11,1.526090e+14,4.961653e+06,2110.0,5940.0
27816,Uzbekistan,2014,female,75+ years,9,348465,Silent,0.675,2.457,71.039,...,4.741785e+06,2050.44841,5370.8658,5675.30962,1.651955e+11,1.745595e+11,1.526090e+14,4.961653e+06,2110.0,5940.0
27817,Uzbekistan,2014,male,5-14 years,60,2762158,Generation Z,0.675,2.457,71.039,...,4.741785e+06,2050.44841,5370.8658,5675.30962,1.651955e+11,1.745595e+11,1.526090e+14,4.961653e+06,2110.0,5940.0
27818,Uzbekistan,2014,female,5-14 years,44,2631600,Generation Z,0.675,2.457,71.039,...,4.741785e+06,2050.44841,5370.8658,5675.30962,1.651955e+11,1.745595e+11,1.526090e+14,4.961653e+06,2110.0,5940.0


### Save summary_df to use in Data Analysis NB

In [337]:
# save summary_df
summary_df.to_csv('cleaned_data.csv', index=False)