In [0]:
# Mount Google Drive into the Colab runtime at /content/gdrive/.
from google.colab import drive
drive.mount('/content/gdrive/')

In [0]:
# List the top-level entries under the mount point.
import os
os.listdir('/content/gdrive/')

['Team Drives', 'My Drive', '.Trash']

In [0]:
# Folder on the mounted drive that holds the files used by this notebook.
_drive_subdirs = ('My Drive', 'workspace', 'colab_notebooks', 'intro_colab')
data_dir = os.path.join('/content/gdrive', *_drive_subdirs)

Now, let's look at the data. It comes from the World Health Organization, and you can download it [here](https://drive.google.com/open?id=1HXNRVK9nnxBFcXdRxcprOzJoDtAqdRK0).


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import PIL
import numpy as np 
import seaborn as sns

In [0]:
# Install and import keras.
# NOTE(review): keras is not used by any other cell visible in this part of the
# notebook, and the install is unpinned -- confirm whether this cell is still needed.
!pip install -q keras
import keras

Using TensorFlow backend.


In [0]:
# Display the imported keras module object.
keras

In [0]:
# List the files in the data folder.
os.listdir(data_dir)

['master.csv', 'Rad_Panda.ipynb']

In [0]:
# Read master.csv from the data folder into the working DataFrame.
master_csv_path = os.path.join(data_dir, 'master.csv')
m_data = pd.read_csv(master_csv_path)

In [0]:
# Preview the first rows of the raw data.
m_data.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [0]:
# Show the last rows of the data.
m_data.tail()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z
27819,Uzbekistan,2014,female,55-74 years,21,1438935,1.46,Uzbekistan2014,0.675,63067077179,2309,Boomers


In [0]:
# Random sample of rows: 'HDI for year' is populated in only some of them.
# random_state pins the draw so re-running the notebook shows the same rows.
m_data.sample(15, random_state=42)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
9969,Germany,2011,female,35-54 years,757,12196847,6.21,Germany2011,0.911,3757698281118,47947,Generation X
23061,Slovenia,1997,male,55-74 years,138,170200,81.08,Slovenia1997,,20749140606,11014,Silent
4581,Bulgaria,1988,female,75+ years,102,241000,42.32,Bulgaria1988,,22555941176,2688,G.I. Generation
7607,Denmark,2009,female,15-24 years,4,324876,1.23,Denmark2009,,321241396034,61863,Millenials
7919,Ecuador,2003,female,5-14 years,20,1442546,1.39,Ecuador2003,,32432858000,2755,Millenials
17595,New Zealand,1997,male,35-54 years,120,508800,23.58,New Zealand1997,,66074513018,19041,Boomers
6439,Croatia,2000,female,5-14 years,0,295200,0.0,Croatia2000,0.749,21774273832,5288,Millenials
812,Argentina,2003,female,35-54 years,196,4402438,4.45,Argentina2003,,127586973492,3668,Boomers
27778,Uzbekistan,2011,male,75+ years,10,204956,4.88,Uzbekistan2011,0.661,45915191189,1767,Silent
7969,Ecuador,2007,female,75+ years,2,180159,1.11,Ecuador2007,,51007777000,4024,Silent


In [0]:
# describe() summarises the numeric columns (count, mean, std, quartiles) and is a
# good first look at the data.
m_data.describe()

Unnamed: 0,year,suicides_no,population,suicides/100k pop,HDI for year,gdp_per_capita ($)
count,27820.0,27820.0,27820.0,27820.0,8364.0,27820.0
mean,2001.258375,242.574407,1844794.0,12.816097,0.776601,16866.464414
std,8.469055,902.047917,3911779.0,18.961511,0.093367,18887.576472
min,1985.0,0.0,278.0,0.0,0.483,251.0
25%,1995.0,3.0,97498.5,0.92,0.713,3447.0
50%,2002.0,25.0,430150.0,5.99,0.779,9372.0
75%,2008.0,131.0,1486143.0,16.62,0.855,24874.0
max,2016.0,22338.0,43805210.0,224.97,0.944,126352.0


In [0]:
# Summary statistics restricted to the columns at positions 1-4; describe() reports
# only the numeric ones among them.
m_data.iloc[:,1:5].describe()

Unnamed: 0,year,suicides_no
count,27820.0,27820.0
mean,2001.258375,242.574407
std,8.469055,902.047917
min,1985.0,0.0
25%,1995.0,3.0
50%,2002.0,25.0
75%,2008.0,131.0
max,2016.0,22338.0


In [0]:
# Column dtypes and non-null counts.
m_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
country               27820 non-null object
year                  27820 non-null int64
sex                   27820 non-null object
age                   27820 non-null object
suicides_no           27820 non-null int64
population            27820 non-null int64
suicides/100k pop     27820 non-null float64
country-year          27820 non-null object
HDI for year          8364 non-null float64
 gdp_for_year ($)     27820 non-null object
gdp_per_capita ($)    27820 non-null int64
generation            27820 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


In [0]:
# List the raw column labels (note that ' gdp_for_year ($) ' is padded with spaces).
m_data.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [0]:
# Map the raw CSV column labels to CamelCase names without spaces or symbols.
column_renames = {
    'country': 'Country',
    'year': 'Year',
    'sex': 'Gender',
    'age': 'Age',
    'suicides_no': 'SuicidesNo',
    'population': 'Population',
    'suicides/100k pop': 'Suicides100kPop',
    'country-year': 'CountryYear',
    'HDI for year': 'HDIForYear',
    ' gdp_for_year ($) ': 'GdpForYearMoney',  # raw label is padded with spaces
    'gdp_per_capita ($)': 'GdpPerCapitalMoney',
    'generation': 'Generation',
}
m_data = m_data.rename(columns=column_renames)

In [0]:
# Confirm the renamed column labels.
m_data.head()

Unnamed: 0,Country,Year,Gender,Age,SuicidesNo,Population,Suicides100kPop,CountryYear,HDIForYear,GdpForYearMoney,GdpPerCapitalMoney,Generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [0]:
# info() showed that HDIForYear has many missing values; count the nulls per column.
m_data.isnull().sum()

Country                   0
Year                      0
Gender                    0
Age                       0
SuicidesNo                0
Population                0
Suicides100kPop           0
CountryYear               0
HDIForYear            19456
GdpForYearMoney           0
GdpPerCapitalMoney        0
Generation                0
dtype: int64

In [0]:
# CountryYear is just Country and Year concatenated, and HDIForYear is missing in
# 19456 of 27820 rows, so drop both.
# DataFrame.drop returns a new frame instead of modifying m_data, so the result has
# to be assigned back; errors='ignore' keeps the cell safe to re-run after the
# columns are already gone.
m_data = m_data.drop(columns=['CountryYear', 'HDIForYear'], errors='ignore')

In [0]:
# TODO: find the min and max value of Year.
# TODO: select and plot data by country and gender for the most recent year.

# Number of distinct countries that have rows for 1985.
len(m_data.loc[m_data['Year'] == 1985, 'Country'].unique())

48

In [0]:
# Distinct years present in the data (not sorted).
years = m_data['Year'].unique()
years

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016])

In [0]:
# For each entry of `years` (same order), count the distinct countries with rows
# in that year.
unique_year_count = []
for year in years:
  countries_in_year = m_data.loc[m_data['Year'] == year, 'Country'].unique()
  unique_year_count.append(len(countries_in_year))
print(unique_year_count)

[54, 49, 52, 65, 65, 68, 78, 77, 77, 79, 83, 86, 88, 86, 86, 84, 84, 85, 86, 85, 89, 88, 48, 48, 64, 64, 81, 80, 78, 62, 86, 16]


In [0]:
# Distinct-country count for whatever value `year` currently holds.
# NOTE(review): `year` is the loop variable left over from an earlier for-loop, so
# this cell depends on which loop ran last.
len(m_data[m_data['Year']==year]['Country'].unique())

In [0]:
# Display the current value of the leftover loop variable `year`.
year

In [0]:
# Bar chart of how many distinct countries have data in each year.
# Uses an explicit Axes so the labels and title attach to this figure only.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=years, y=unique_year_count, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='Number of countries with data',
    title='Countries reporting per year',
)
plt.show()

In [0]:
# Goal: ratio of men vs. women who committed suicide, per country, in 1985.

# First collect the distinct countries and years present in the data;
# the years array is the cell's displayed output.
countries = m_data['Country'].unique()
years = m_data['Year'].unique()
years

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016])

In [0]:
# Display the array of distinct country names.
countries

In [0]:
# Keep only the rows recorded for 1985.
is_1985 = m_data['Year'] == 1985
m_data_for_this_year = m_data.loc[is_1985]
  

In [0]:
# Distinct countries that have rows in the 1985 subset.
m_countries = m_data_for_this_year['Country'].unique()

In [3]:
# Altair demo on the vega_datasets example "cars" table (separate from the
# suicide data): binned bar chart of Miles_per_Gallon against row count.
import altair as alt
from vega_datasets import data

cars = data.cars()

# the encodings refer to dataframe columns by name
alt.Chart(cars).mark_bar().encode(
    x=alt.X('Miles_per_Gallon', bin=True),
    y='count()',
)

In [0]:
# Total male and female SuicidesNo for each country in the 1985 subset, in the
# same order as m_countries.
# NOTE: despite the "rate" in the names, these lists hold summed counts (the same
# quantity the Mexico cells in this notebook compute by hand with np.sum).
male_suicide_rate_by_country = []
female_suicide_rate_by_country = []
for country in m_countries:
  country_rows = m_data_for_this_year[m_data_for_this_year['Country'] == country]
  male_suicide_rate_by_country.append(
      country_rows.loc[country_rows['Gender'] == 'male', 'SuicidesNo'].sum())
  female_suicide_rate_by_country.append(
      country_rows.loc[country_rows['Gender'] == 'female', 'SuicidesNo'].sum())

In [0]:
# Altair demo on the vega_datasets example "stocks" table: one price line per
# symbol over time, with interaction enabled on the x axis only.
import altair as alt
from vega_datasets import data

stocks = data.stocks()

alt.Chart(stocks).mark_line().encode(
    x='date:T',
    y='price',
    color='symbol',
).interactive(bind_y=False)

In [8]:
# Same stocks line chart as the preceding cell, run again.
import altair as alt
from vega_datasets import data

stocks = data.stocks()

alt.Chart(stocks).mark_line().encode(
    x='date:T',
    y='price',
    color='symbol',
).interactive(bind_y=False)

In [0]:
# Worked example for one country: the Mexico rows of the 1985 subset.
is_mexico = m_data_for_this_year['Country'] == 'Mexico'
m_data_year_country = m_data_for_this_year.loc[is_mexico]

In [0]:
# Narrow the single-country rows down to the male rows.
is_male = m_data_year_country['Gender'] == 'male'
m_data_year_country_gender = m_data_year_country.loc[is_male]

In [0]:
# Total SuicidesNo over the currently selected (male) rows.
np.sum(m_data_year_country_gender['SuicidesNo'])

1279

In [0]:
# Re-point the same variable at the female rows of the single-country subset.
is_female = m_data_year_country['Gender'] == 'female'
m_data_year_country_gender = m_data_year_country.loc[is_female]

In [0]:
# Total SuicidesNo over the currently selected (female) rows.
np.sum(m_data_year_country_gender['SuicidesNo'])

265

In [0]:
# Putting it all together: start again from the distinct years in the data.
years = m_data['Year'].unique()
years

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016])

In [0]:
# Sort the years array in place.
years.sort()

In [0]:
# Display the sorted years.
years

array([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
       1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])

In [0]:
# Keep only the last eight entries of the sorted array, i.e. the eight most recent years.
years = years[-8:]

In [0]:
# Display the retained years.
years

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])

In [0]:
# Build results_dict[year][country] = [female_total, male_total], where each total
# is the sum of SuicidesNo over that gender's rows for the country and year.
results_dict = {}
for year in years:
  # rows for this year, and the countries present in it (sorted)
  df_year = m_data[m_data['Year']==year]
  countries = np.sort(df_year['Country'].unique())
  results_dict[year] = {}

  for country in countries:
    df_year_country = df_year[df_year['Country'] == country]
    df_year_country_fm = df_year_country[df_year_country['Gender']=='female']
    df_year_country_m = df_year_country[df_year_country['Gender']=='male']
    # female total is stored at index 0, male total at index 1
    results_dict[year][country] = [
        np.sum(df_year_country_fm['SuicidesNo']),
        np.sum(df_year_country_m['SuicidesNo']),
    ]
    
  
  

In [0]:
# Inspect the nested {year: {country: [female_total, male_total]}} dictionary.
results_dict

In [0]:
#plot for 2009 
plot_data = results_dict[2016]
plot_data

In [0]:
ys = list(plot_data.values())