In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# 1. Let's take a look at this data
We can see what kind of cleaning we'll need to do, by looking at some of the entries, looking at descriptive statistics, and missing entries.

In [None]:
# Load the raw table from the Kaggle input directory and print a first overview.
location = "../input/suicide-rates-overview-1985-to-2016/master.csv"
suicides = pd.read_csv(location)

print("FIRST FIVE ENTRIES:")
print(suicides.head())

print("\n\n\n\n\n\nBASIC DESCRIPTIVE STATISTICS:")
print(suicides.describe())

print("\n\n\n\n\n\nNUMBER OF ENTRIES:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
suicides.info()
print(suicides.columns)

## Anomalies
### It looks like gdp_for_year is of type object, when it should be an int. 
Also, the same column name has some extra spaces. Let's fix those and reprint the information first.

### Just looking at Albania for 1987, it appears that there are several different population values, ranging from about 22000 to 313000, just in the first five entries
I find it hard to believe that in one year, Albania's population increased (or decreased) by a factor of 15. Additionally, the male/female and age categories seem to suggest that each entry is a single suicide. Or, perhaps more likely, each entry is a unique combination of country, year, sex, and age, with a recording of the number of suicides in that demographic. This interpretation could also account for the population column, as being the population of that demographic group. I'm confident in this, but want to verify

### The mean and median for the gdp measures, suicide number, suicide rate, and population differ quite a lot.
Let's look at some histograms for these. It's possible that the difference results from a poisson-type distribution, where higher values are exponentially less likely.

### I'll want to look at the number of unique values in each column, to get a sense for the amount of precision each column has.

### It looks like the only entry that is missing any values is HDI for year
HDI is the Human Development Index, a composite score published by the United Nations Development Programme that combines life expectancy, education, and per-capita income.
A quick calculation shows that 19456 entries are missing, meaning that about 30% of all entries have an HDI value. I want to look at one more descriptive statistic before coming back to this.

In [None]:
######## FIXING gdp_for_year ##########

# Strip the stray spaces around the column name. rename() ignores keys that
# are not present, so this line is safe to run more than once.
suicides = suicides.rename(columns ={" gdp_for_year ($) ": "gdp_for_year ($)"})

# Remove the thousands separators and convert to integers. Going through
# astype(str) keeps the cell re-runnable: the previous per-value lambda called
# str.replace on each entry and raised AttributeError on a second run, once the
# column already held ints.
suicides["gdp_for_year ($)"] = (
    suicides["gdp_for_year ($)"]
    .astype(str)
    .str.strip()
    .str.replace(",", "", regex=False)
    .astype("int64")
)

print("FIRST FIVE ENTRIES:")
print(suicides.head())

print("\n\n\n\n\n\nBASIC DESCRIPTIVE STATISTICS:")
print(suicides.describe())

print("\n\n\n\n\n\nNUMBER OF ENTRIES:")
# info() prints its own report and returns None; print(info()) added a "None" line.
suicides.info()

### How are the entries to be interpreted?
Are they individual suicide entries (set seems too small for that), or are they demographic groups which are consistent across all countries and years? The latter seems much more likely and practical.

In [None]:
# Pull out every row recorded for Albania in 1987 using a combined boolean mask.
is_albania = suicides["country"] == "Albania"
is_1987 = suicides["year"] == 1987
alb1987 = suicides[is_albania & is_1987]

# Total of the per-row population values, followed by the rows themselves.
print(alb1987["population"].sum())
print(alb1987)

Because the suicides_no column contains duplicates, it means that it is not an identifying number, and probably therefore each entry is not a single suicide, but rather a demographic group. Summing all entries in the population column similarly adds to the approximate population of Albania at the time [[Wikipedia](https://en.wikipedia.org/wiki/Demographics_of_Albania)]. It looks like GDP and HDI are both measured only once a year, and so these values are duplicated 12 (number of age groups times number of sexes) times.

### How are the gdp measures, suicide number, suicide rate, and population distributed among the different demographics, giving the mean-median differences seen above? May want to do a similar plot, averaging over country and/or year.
Let's get some histograms to view the distribution for each of these columns.

In [None]:
# Apply the seaborn theme BEFORE creating the figure: the theme works through
# matplotlib rcParams, which are read when axes are created, so setting it
# after plt.subplots() left this figure unstyled.
sns.set(style="darkgrid")
fig, (ax_gdp_pc, ax_gdp_yr, ax_sn, ax_sr, ax_pop) = plt.subplots(figsize=(20,10), nrows=5)
ln=True  # draw the histogram counts on a log scale

# One (column, axes) pair per panel, top to bottom.
panels = [
    ("gdp_per_capita ($)", ax_gdp_pc),
    ("gdp_for_year ($)", ax_gdp_yr),
    ("suicides_no", ax_sn),
    ("suicides/100k pop", ax_sr),
    ("population", ax_pop),
]
for column, axis in panels:
    sns.distplot(a=suicides[column], ax=axis, hist_kws = {"log":ln})
    # Rotate the tick labels on every panel; plt.xticks() only reached the
    # last axes created.
    axis.tick_params(axis="x", labelrotation=45)

# Keep the five stacked panels from overlapping each other's x labels.
fig.tight_layout()
plt.show()

### Fortunately, there's nothing too abnormal in these distributions, just the observation that fewer and fewer have higher populations and gdp
These are probably the drivers of the suicide numbers and rates. To be investigated.

---
### How many unique values am I dealing with? 
This will give some indicators about the distribution, which columns have the most information about and how complete each is.

In [None]:
# Number of distinct values per column; a missing value counts as one distinct value.
unique_counts = suicides.nunique(dropna=False)
for col, count in unique_counts.items():
    print(f"{col}: {count}")

### Notes on unique values
* countries: if there are an estimated 180-200 countries in the world, about half of them have recorded data on suicides. Are any regions underrepresented?
* year: no anomalies, but it might be worth checking to see if the years are continuous or if there are any gaps
* country-year: The number of unique countries (101) times the number of unique years (32) suggests that this should be 3232 instead of 2321. It looks like there are duplicates,  since this is also lower than the number of entries in total but none are missing. Perhaps more likely is that the entire entry for these is missing.
* HDI for year: this number is very low compared to most other numerical measures. The number of unique values may also depend on how it's being measured. Might be worth investigating.

---

### It looks like the only column that is missing any values is HDI for year. Are there any patterns in the missing HDI values? By country? By year?

HDI = Human Development Index

A quick calculation shows that 19456 entries are missing, meaning that about 30% of all entries have an HDI value. I want to look at one more descriptive statistic before coming back to this.

In [None]:
# Rows that have no HDI value recorded.
missing_hdi = suicides.loc[suicides["HDI for year"].isna()]
print("MISSING HDI HEAD:")
print(missing_hdi.head())

# How many distinct countries, years and country-years appear among those rows.
summaries = (
    ("\n\n\n\nBY COUNTRY:", "country"),
    ("\n\n\n\nBY YEAR", "year"),
    ("\n\n\n\nBY COUNTRY-YEAR", "country-year"),
)
for header, column in summaries:
    print(header)
    print(missing_hdi[column].nunique(dropna=False))

### 100 out of 101 countries and all years are represented, but country-year is notably smaller. 
This may suggest that some countries started tracking their HDIs later than others. I'd like to see a line plot of that: HDI for year by year, per country. But first, a simpler visualization is to count the number of non-absent entries per year. We'll need to divide by twelve, to account for the duplication by age and sex.

In [None]:
# Number of countries with a recorded HDI in each year.
# Counting distinct countries directly avoids the earlier "rows / 12"
# shortcut, which is only exact when every country-year has all twelve
# age/sex rows present.
years = np.sort(pd.unique(suicides["year"]))
countries_with_hdi = (
    suicides[suicides["HDI for year"].notna()]
    .groupby("year")["country"]
    .nunique()
    .reindex(years, fill_value=0)  # keep years in which nobody reported an HDI
)
b = countries_with_hdi.tolist()

p = sns.barplot(x=countries_with_hdi.index, y=countries_with_hdi.values)
p.set(xlabel="year", ylabel="countries with a recorded HDI")
plt.xticks(rotation=60)
plt.show()

This looks like countries typically record their HDI in five year increments, and gradually more have been participating up until 2010, where there were records being taken every year, and those numbers were declining. We can do the same barplot by country to see which countries have been best about recording their HDIs

In [None]:
# Number of years with a recorded HDI for each country.
# Counting distinct years directly avoids the earlier "rows / 12" shortcut,
# which is only exact when every country-year has all twelve age/sex rows.
country_names = pd.unique(suicides["country"])
years_with_hdi = (
    suicides[suicides["HDI for year"].notna()]
    .groupby("country")["year"]
    .nunique()
    .reindex(country_names, fill_value=0)  # keep countries with no HDI at all
)
b = years_with_hdi.tolist()

fig, ax = plt.subplots(figsize=(23,6))
# Draw on the axes created above explicitly instead of relying on "current axes".
sns.barplot(x=years_with_hdi.index, y=years_with_hdi.values, ax=ax)
ax.set(xlabel="country", ylabel="years with a recorded HDI")
ax.tick_params(axis="x", labelrotation=60)
# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

So, here we can see that no country has more than 10 entries, and many have substantially fewer. Also note the countries that are represented, and those that aren't - Africa in particular is not well represented here.

---
Below, we plot all recorded countries' HDI for all years. It does show a general steady upward trend, and gives some indicator about when given countries started and stopped (sans names, though)

In [None]:
# One HDI-over-time line per country; with this many countries the legend is
# unreadable, so it is not drawn at all.
hdi_axes = sns.lineplot(
    data=suicides,
    x="year",
    y="HDI for year",
    hue="country",
    legend=False,
)
plt.show()

### Many countries have only started tracking HDIs since 2010
Several countries started tracking HDI in 1990 and following years, and a few stopped tracking them at later dates

# 2. Questions of Interest

1. `[x]` Which country has the highest rate of suicides over this period? In the last 5 years?
2. `[x]` How has suicide rate changed over time for some countries? Globally?
3. `[x]` How does it vary between sexes? 
4. `[x]` What is the relation between suicide rate and gdp?
5. `[x]` Does suicide rate change noticeably with population?
6. `[ ]` How does it vary between generations?
7. `[ ]` How is suicide rate affected by HDI? the tracking of HDI?
8. `[ ]` How does the tracking of suicide rates relate to population/gdp?


---
## 2.1. Which countries have the highest average suicide rates?

In [None]:
# Mean of the per-demographic suicide rate for every country, highest first.
# numeric_only=True is required on pandas >= 2.0, where mean() over the
# string columns (sex, age, generation, ...) raises a TypeError instead of
# silently dropping them.
rate = (
    suicides.groupby(by=["country"])
    .mean(numeric_only=True)
    .sort_values(by=["suicides/100k pop"], ascending=False)
)

fig, ax = plt.subplots(figsize=(20,17))
sns.barplot(y=rate.index, x=rate["suicides/100k pop"], ax=ax)
ax.set_title("suicide rate by country")
# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

It looks like, averaging from 1985 to 2016, Lithuania has decidedly had the highest rate of suicides, at just over 40 per 100,000 people. Sri Lanka and the Republic of Korea are the next highest, at about 35 / 100k. 

## 2.2. How have suicide rates changed over time?

In [None]:
# Mean of the per-demographic suicide rate for every year.
# numeric_only=True is required on pandas >= 2.0, where mean() over the
# string columns raises a TypeError instead of silently dropping them.
time = suicides.groupby(by="year").mean(numeric_only=True)

fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x=time.index, y=time["suicides/100k pop"], ax=ax)
# The plotted quantity is a rate per 100k, not a number of cases; the title
# now says so.
ax.set_title("mean suicide rate (per 100k) by year")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

Globally, there was a sharp peak in 1995, with an average of over 15 suicides/100k people. It has steadily been declining since then, except for a final peak in 2016, the biggest spike seen in this entire dataset. I'm partly surprised to see that there is no major influence as a result of the 2008 stock crash. What happened in 2016 that could have caused this giant spike? Suicides increased by more than 2/100k, or 22% globally between 2015 and 2016. US specific - if the spike is particularly pronounced in the US and US affiliated countries, the global spike may be connected to the election of Trump. If instead it is similar worldwide, it may be a combination of effects that are difficult to discern

In [None]:
# choose the years to compare
after_year = 2016
before_year = 2015


def _summed_rate_by_country(frame, year):
    """Return one row per country with the summed "suicides/100k pop" for ``year``.

    Only rows whose "year" equals ``year`` are used. The result is a DataFrame
    indexed by country with the single column "suicides/100k pop", sorted from
    highest to lowest. Selecting the column before summing avoids running
    sum() over the string columns of the table.
    """
    in_year = frame[frame["year"] == year]
    return (
        in_year.groupby(by="country")[["suicides/100k pop"]]
        .sum()
        .sort_values(by=["suicides/100k pop"], ascending=False)
    )


# set up data sets to compare
recent = _summed_rate_by_country(suicides, after_year)
old = _summed_rate_by_country(suicides, before_year)

# Both panels share one categorical y axis, so they must list the countries in
# the SAME order. Each frame is ranked on its own, and without an explicit
# common order the row labels on the shared axis can end up attached to the
# wrong bars. Countries are ranked by the earlier year; any that only appear
# in the later year are appended at the end.
country_order = list(old.index) + [c for c in recent.index if c not in old.index]

# create plot of older and recent data sets
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize = (30,20))
sns.barplot(y=old.index, x=old["suicides/100k pop"], order=country_order, ax = ax1)
sns.barplot(y=recent.index, x=recent["suicides/100k pop"], order=country_order, ax = ax2)

# make sure they use the same scale for easy comparison
mx = pd.concat([recent["suicides/100k pop"], old["suicides/100k pop"]]).max()
if pd.notna(mx):  # nothing to scale if neither year has any rows
    ax1.set_xlim(0,1.1*mx)
    ax2.set_xlim(0,1.1*mx)

# Each panel shows exactly one year (the filter is ==), so label it with that
# year rather than "Before"/"After".
ax1.set_title(f"{before_year}")
ax2.set_title(f"{after_year}")

# Mirror the left panel so the two bar charts grow outwards from the middle.
ax1.invert_xaxis()
ax1.yaxis.tick_right()
ax2.yaxis.tick_left()
# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

Playing with this plot can compare any two time periods. When looking at 2015 - 2016, there appears to be a sharp decrease in Hungary and everything below Croatia - precisely the opposite of what we see in the previous plot. Perhaps it's to do with the weighting of the values by the demographic population.

An interesting observation here (comparing averages before 1990 to after 2011) is that after 2011, the countries at the low end are higher than they were before 1990 (maybe to do with the recording of data?), but the countries at the high end are not as high as they were before 1990.

It appears that the US is not included in those years, making this much more difficult.

In [None]:
# choose the years to compare
after_year = 2016
before_year = 2015


def _weighted_rate_by_country(frame, year):
    """Return the population-weighted suicide rate per 100k for each country in ``year``.

    For every country the rate is total "suicides_no" * 100_000 divided by the
    total "population" over that country's rows for ``year``. This is the same
    quantity the earlier row-by-row loop produced after its final per-country
    sum, computed with a single groupby instead of one mask per country.

    The result is indexed by country, holds the columns "suicides_no",
    "population" and "weighted_suicide_rate", and is sorted by the rate from
    highest to lowest.
    """
    in_year = frame[frame["year"] == year]
    totals = in_year.groupby(by="country")[["suicides_no", "population"]].sum()
    totals["weighted_suicide_rate"] = (
        totals["suicides_no"] * 100_000 / totals["population"]
    )
    return totals.sort_values(by=["weighted_suicide_rate"], ascending=False)


# set up data sets to compare
recent = _weighted_rate_by_country(suicides, after_year)
old = _weighted_rate_by_country(suicides, before_year)

# Both panels share one categorical y axis, so they must list the countries in
# the SAME order. Each frame is ranked on its own, and without an explicit
# common order the row labels on the shared axis can end up attached to the
# wrong bars. Countries are ranked by the earlier year; any that only appear
# in the later year are appended at the end.
country_order = list(old.index) + [c for c in recent.index if c not in old.index]

# create plot of older and recent data sets
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize = (30,20))
sns.barplot(y=old.index, x=old["weighted_suicide_rate"], order=country_order, ax = ax1)
sns.barplot(y=recent.index, x=recent["weighted_suicide_rate"], order=country_order, ax = ax2)

# make sure they use the same scale for easy comparison
mx = pd.concat([recent["weighted_suicide_rate"], old["weighted_suicide_rate"]]).max()
if pd.notna(mx):  # nothing to scale if neither year has any rows
    ax1.set_xlim(0,1.1*mx)
    ax2.set_xlim(0,1.1*mx)

# Each panel shows exactly one year (the filter is ==), so label it with that
# year rather than "Before"/"After".
ax1.set_title(f"{before_year}")
ax2.set_title(f"{after_year}")

# Mirror the left panel so the two bar charts grow outwards from the middle.
ax1.invert_xaxis()
ax1.yaxis.tick_right()
ax2.yaxis.tick_left()
# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

It appears that reweighting the suicide rates does not substantially change the graph. So the source of the 2016 spike is still a mystery.

In [None]:
# Countries to draw; uncomment entries to add them to the plot.
countries = [
    "United States",
    #"Germany",
    #"France",
    #"United Kingdom",
    #"Canada",
    #"Japan",
    #"Singapore",
    #"Denmark",
    #"Sweden",
    #"Norway",
    #"Poland",
    #"Russian Federation",
    #"Finland",
    #"Uruguay",
    #"Guatemala",
    "Lithuania",
    "Republic of Korea"
]

# Mean of the per-demographic rate for every (country, year) pair.
# isin() replaces the per-row Python lambda, and numeric_only=True is required
# on pandas >= 2.0, where mean() over string columns raises a TypeError.
selected = suicides[suicides["country"].isin(countries)]
country_time = (
    selected.groupby(by=["country","year"])
    .mean(numeric_only=True)
    .reset_index(level = "country")
)

fig, ax = plt.subplots(figsize=(10,5))

# Hand seaborn a flat table with named columns; the year index repeats once
# per country, and passing column names avoids any index alignment on those
# duplicated labels.
sns.lineplot(
    data=country_time.reset_index(),
    x="year",
    y="suicides/100k pop",
    hue="country",
    ax=ax,
)

# The plotted quantity is a rate per 100k, not a number of cases.
ax.set_title("mean suicide rate (per 100k) by year for the selected countries")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

Of these countries, the one with the most drastic overall increase in suicide rate is the Republic of Korea, though it's been going down in recent years. The US rate has been increasing, albeit very gradually, since 2006. Many of these countries don't appear to have data for 2016, making it difficult to analyze the 2016 spike.
Notably, the variation between years in the US has been much smaller than in the nordic countries. The variance is quite comparable to that of Canada and the UK.

However, it is definitely worth thinking about what might be driving the increase in suicide rates in the US. Income disparity, increasing debt, and feelings of career immobility might be related, but are difficult to analyze given just this data set. We can perhaps gain some insight into this by plotting by ages/generations.

## 2.3 Which age groups are committing suicide?

In [None]:
def _age_lower_bound(label):
    """Return the leading number of an age label, for sorting youngest first.

    Assumes labels start with their lower bound, e.g. "5-14 ..." or "75+ ..."
    (TODO confirm against the data); anything else sorts last.
    """
    head = str(label).split("-")[0].split("+")[0].strip()
    return int(head) if head.isdigit() else float("inf")


# Mean of the per-demographic rate for each age group, over three periods.
# numeric_only=True is required on pandas >= 2.0, where mean() over string
# columns raises a TypeError.
age1 = suicides.groupby(by="age").mean(numeric_only=True)

age2 = suicides[suicides["year"] >= 2011]
age2 = age2.groupby(by="age").mean(numeric_only=True)

age3 = suicides[suicides["year"] <= 1990]
age3 = age3.groupby(by="age").mean(numeric_only=True)

# groupby sorts the labels as text, which puts a label starting with "5-"
# after one starting with "35-"; order the bars by age instead.
age_order = sorted(age1.index, key=_age_lower_bound)

fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize =(20,5))
sns.barplot(x=age1.index, y=age1["suicides/100k pop"], order=age_order, ax=ax1)
sns.barplot(x=age2.index, y=age2["suicides/100k pop"], order=age_order, ax=ax2)
sns.barplot(x=age3.index, y=age3["suicides/100k pop"], order=age_order, ax=ax3)

ax1.tick_params(axis='x', labelrotation=45)
ax2.tick_params(axis='x', labelrotation=45)
ax3.tick_params(axis='x', labelrotation=45)

ax1.set_title("Global, all years")
# ax2 shows the year >= 2011 subset and ax3 the year <= 1990 subset; the two
# titles were attached to the wrong panels.
ax2.set_title("Global, after 2011")
ax3.set_title("Global, before 1990")
plt.show()

This shows that the rate of suicides increases significantly among older people. This could be because they are getting older, and/or it could be a difference in culture and upbringing. The rate definitely slows for the 55-74 year age group, but the 75+ have a very high rate. 
If this is caused by simply being older, it could be assisted suicide, it could be having to deal with medical and health issues, or even dementia and neural degradation. The discomfort and sometimes humiliation involved with advanced age might be a motivating factor. 
As a society, we have been making great strides in psychology and mental health treatment and openness, meaning that younger people are more willing to be open about needing and asking for help. 

Since we see this same distribution across different groups of years, it suggests that it's more a function simply of age than of cultural upbringing.

In the previous section we saw that the US has had a steadily increasing rate since 2006, which I'd like to investigate further.

In [None]:
def _age_lower_bound(label):
    """Return the leading number of an age label, for sorting youngest first.

    Assumes labels start with their lower bound, e.g. "5-14 ..." or "75+ ..."
    (TODO confirm against the data); anything else sorts last.
    """
    head = str(label).split("-")[0].split("+")[0].strip()
    return int(head) if head.isdigit() else float("inf")


age_us = suicides[suicides["country"] == "United States"]
# Split at 2006 without overlap: 2006 itself previously fell into both the
# "before" and the "after" group. It now belongs to "after" only.
age_increasing_us1 = age_us[age_us["year"] < 2006]
age_increasing_us2 = age_us[age_us["year"] >= 2006]

# numeric_only=True is required on pandas >= 2.0, where mean() over string
# columns raises a TypeError.
age_increasing_us1 = age_increasing_us1.groupby(by="age").mean(numeric_only=True)
age_increasing_us2 = age_increasing_us2.groupby(by="age").mean(numeric_only=True)

# groupby sorts the labels as text, which puts a label starting with "5-"
# after one starting with "35-"; order the bars by age instead.
age_order = sorted(
    age_increasing_us1.index.union(age_increasing_us2.index),
    key=_age_lower_bound,
)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,5))
sns.barplot(x=age_increasing_us1.index, y=age_increasing_us1["suicides/100k pop"], order=age_order, ax=ax1)
sns.barplot(x=age_increasing_us2.index, y=age_increasing_us2["suicides/100k pop"], order=age_order, ax=ax2)

ax1.set_title("age of suicides in US before 2006")
ax2.set_title("age of suicides in US after 2006")

# Same upper limit on both panels so bar heights are directly comparable.
mx = max(max(age_increasing_us1["suicides/100k pop"]), max(age_increasing_us2["suicides/100k pop"]))
ax1.set_ylim(top=1.1*mx)
ax2.set_ylim(top=1.1*mx)

ax1.tick_params(axis='x', labelrotation=45)
ax2.tick_params(axis='x', labelrotation=45)
plt.show()

There is a notable dip in this 55-74 year age group for the US. But compared to the earlier years, it seems more like the 35-54 and 75+ age groups increased in that time, while the 55-74 stayed about the same.


## 2.4. How does suicide rate differ between sexes?
### a) by country?

In [None]:
# Mean of the per-demographic rate for each country, separately per sex.
# Equality is the intended test; the earlier >= "male" / <= "female" string
# comparisons only worked because of how those two words happen to sort.
# numeric_only=True is required on pandas >= 2.0, where mean() over string
# columns raises a TypeError.
male = suicides[suicides["sex"] == "male"]
male = male.groupby(by="country").mean(numeric_only=True)
male = male.sort_values(by=["suicides/100k pop"], ascending=False)

female = suicides[suicides["sex"] == "female"]
female = female.groupby(by="country").mean(numeric_only=True)
female = female.sort_values(by=["suicides/100k pop"], ascending=False)

# Both panels share one categorical y axis, so they must list the countries in
# the SAME order. Each frame is ranked on its own, and without an explicit
# common order the row labels on the shared axis can end up attached to the
# wrong bars - which would make the two rankings look identical whether or
# not they are. Countries are ranked by the male rate here.
country_order = list(male.index) + [c for c in female.index if c not in male.index]

fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize = (30,20))
sns.barplot(y=female.index, x=female["suicides/100k pop"], order=country_order, ax = ax1)
sns.barplot(y=male.index, x=male["suicides/100k pop"], order=country_order, ax = ax2)

# Same x scale on both panels for easy comparison.
mx = max(max(male["suicides/100k pop"]), max(female["suicides/100k pop"]))
ax1.set_xlim(0,1.1*mx)
ax2.set_xlim(0,1.1*mx)

ax2.set_title("MALE")
ax1.set_title("FEMALE")

# Mirror the left panel so the two bar charts grow outwards from the middle.
ax1.invert_xaxis()
ax1.yaxis.tick_right()
ax2.yaxis.tick_left()
# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

It's pretty well documented [[NPR](https://www.npr.org/sections/health-shots/2018/06/14/619338703/u-s-suicides-rates-are-rising-faster-among-women-than-men), [Wikipedia](https://en.wikipedia.org/wiki/Gender_differences_in_suicide), [BBC](https://www.bbc.com/future/article/20190313-why-more-men-kill-themselves-than-women)] that although women attempt suicide more frequently, men commit suicide at a higher rate than women. This additionally shows that the country that a person lives in has a similar impact regardless of sex. That is, the ordering of male suicide rates by country is the same as female. This would suggest that there is an approximate scalar proportion between the number of men vs women committing, and therefore that cultural, religious, political, ideological, and other societal factors do not affect men and women _qualitatively_ differently or, alternatively, that those effects roughly cancel out in aggregate. (Caveat: this reading is only valid if both panels list the countries in the same order along the shared axis, which is worth verifying before relying on it.)

---

### b) by year?
Have there been different behaviors between the sexes globally over years?
If we break this down by countries, it seems likely that some years lows or highs could be related to recorded historical events.

In [None]:
# Mean of the per-demographic rate for each year, separately per sex.
# Equality is the intended test; the earlier >= "male" / <= "female" string
# comparisons only worked because of how those two words happen to sort.
# numeric_only=True is required on pandas >= 2.0, where mean() over string
# columns raises a TypeError.
male = suicides[suicides["sex"] == "male"]
male = male.groupby(by="year").mean(numeric_only=True)
# Keep the frames in chronological order so they match the time axis.
male = male.sort_index()

female = suicides[suicides["sex"] == "female"]
female = female.groupby(by="year").mean(numeric_only=True)
female = female.sort_index()


fig, (ax1, ax2) = plt.subplots(nrows=2, sharey=True, figsize = (30,20))
sns.barplot(x=male.index, y=male["suicides/100k pop"], ax =ax1)
sns.barplot(x=female.index, y=female["suicides/100k pop"], ax =ax2)

# Same y scale on both panels for easy comparison.
mx = max(max(male["suicides/100k pop"]), max(female["suicides/100k pop"]))
ax1.set_ylim(0, 1.1*mx)
ax2.set_ylim(0, 1.1*mx)

ax1.set_title("MALE")
ax2.set_title("FEMALE")

# plt.show() rather than fig.show(): the latter can emit a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

## 2.5. What is the relation between suicide rate and GDP?
My first thought is that increasing GDP would decrease suicide rates, simply because wealthier countries _tend_ to have higher quality of life. That said, a study in Finland, where quality of life is very high, suggested that a major influencing factor is the _difference_ between the suicidal person's level of happiness compared to the happiness of those around them. As I understand it, this applies probably most frequently to those who are depressed, since their issues are less directly caused by the external factors that make up quality of life.

In [None]:
gdp = suicides

# Scatter of the per-row suicide rate against the natural log of yearly GDP.
fig, ax = plt.subplots(figsize=(10,5))
log_gdp_for_year = np.log(gdp["gdp_for_year ($)"])
sns.scatterplot(x=log_gdp_for_year, y=gdp["suicides/100k pop"], ax=ax)
ax.set_title("showing the relation between gdp and no of suicides cases")
plt.xticks(rotation=45)

In [None]:
gdp = suicides

# Scatter of the per-row suicide rate against GDP per capita.
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(
    x=gdp["gdp_per_capita ($)"],
    y=gdp["suicides/100k pop"],
    ax=ax,
)
ax.set_title("showing the relation between gdp and no of suicides cases")
plt.xticks(rotation=45)

Based on both of these plots, it appears that there is not a consistent relationship between gdp or gdp per capita and the rate of suicides. It does appear that there is more variance among those entries with lower gdp, but this could be solely an artifact of the quantity of countries at those levels. To control for this, we can investigate the distribution of gdps.

In [None]:
# One row per country-year, holding the mean of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where mean() over the
# string columns raises a TypeError instead of silently dropping them.
gdp = suicides.groupby(by="country-year").mean(numeric_only=True)

fig, ax = plt.subplots(figsize=(10,5))
sns.distplot(a=gdp["gdp_for_year ($)"], ax=ax)
ax.set_title("showing the distribution of gdp for countries and years")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# One row per country-year, holding the mean of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where mean() over the
# string columns raises a TypeError instead of silently dropping them.
gdp = suicides.groupby(by="country-year").mean(numeric_only=True)

fig, ax = plt.subplots(figsize=(10,5))
sns.distplot(a=np.log(gdp["gdp_for_year ($)"]), ax=ax)
# This panel shows the natural log of GDP; say so in the title so it cannot
# be confused with the untransformed distribution.
ax.set_title("showing the distribution of log(gdp) for countries and years")
plt.show()

In [None]:
# Distribution of GDP per capita, read from the `gdp` frame already in the namespace.
fig, ax = plt.subplots(figsize=(10,5))
per_capita = gdp["gdp_per_capita ($)"]
sns.distplot(a=per_capita, ax=ax)
ax.set_title("showing the distribution of gdp per capita")
plt.xticks(rotation=45)

Suspicion is confirmed, there are many more entries at lower gdps. This does not necessarily mean that the variance in the lower gdp range is solely because of the number of entries, but it does account for a large amount of it.

## 2.6. Does suicide rate correlate to country population?


In [None]:
# One row per country-year with the TOTAL population and TOTAL suicides over
# all of its demographic rows, and the overall rate per 100k derived from
# those totals. The earlier version averaged every column, so the x axis was
# the mean size of a demographic group rather than the country population the
# question is about. It also called mean() over string columns, which raises
# a TypeError on pandas >= 2.0.
pop = suicides.groupby(by="country-year").agg({"population": "sum", "suicides_no": "sum"})
pop["suicides/100k pop"] = pop["suicides_no"] * 100_000 / pop["population"]

fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x=pop["population"], y=pop["suicides/100k pop"], ax=ax)
# The plotted quantity is a rate per 100k, not a number of cases.
ax.set_title("suicide rate (per 100k) against country population, one point per country-year")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

No.