In [None]:
#Takes in country name, converts to the continent it resides on
def country_to_continent(country_name):
    """Return the continent name for ``country_name``.

    The lookup chains three pycountry_convert (``pc``) calls:
    country name -> alpha-2 country code -> continent code -> continent name.
    """
    alpha2 = pc.country_name_to_country_alpha2(country_name)
    return pc.convert_continent_code_to_continent_name(
        pc.country_alpha2_to_continent_code(alpha2)
    )

Foreword: I mostly did this project to practice finding and using different data visualization techniques. I used Liam Morgan's (R) breakdown of the same data as inspiration for some of the visualizations.

In [None]:
#Finds countries under a certain population size as well as those with too few reported rows
def find_invalid(df, years=35, min_avg_population=150000, min_rows=150):
    """Return the countries that should be dropped from the analysis.

    A country is listed when either of these holds:

    * its summed ``population`` (over every row of the country) divided by
      ``years`` is below ``min_avg_population``;
    * it has fewer than ``min_rows`` rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``country``, ``population`` and ``year``.
    years : int, default 35
        Divisor applied to the summed population.
    min_avg_population : int, default 150000
        Threshold for the population figure described above.
    min_rows : int, default 150
        Minimum number of rows a country needs to be kept.

    Returns
    -------
    pandas.Index
        Country names, small-population countries first, then sparse ones.
        A country failing both checks appears twice (harmless for ``isin``).
    """
    per_country = df.groupby("country")
    avg_population = per_country["population"].sum() / years
    # "year" is only used as a column to count rows with; any column would do
    row_counts = per_country["year"].count()

    too_small = avg_population[avg_population < min_avg_population].index
    too_sparse = row_counts[row_counts < min_rows].index

    return too_small.append(too_sparse)

In [None]:
# Install pycountry_convert, which the import cell loads as `pc`.
# NOTE(review): consider `%pip install` with a pinned version so the install
# targets the running kernel's environment and stays reproducible.
!pip install pycountry_convert

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry_convert as pc
import numpy as np
from scipy.stats import pearsonr
from numpy import mean

rates_data = pd.read_csv("../input/suicide-rates-overview-1985-to-2016/master.csv")

#To obtain the continent some countries needed to be renamed
rates_data["country"] = rates_data["country"].replace({
    "Republic of Korea": "Korea, Republic of",
    "Saint Vincent and Grenadines": "Saint Vincent and the Grenadines",
})

#Get continent for each country and add it to the dataset.
#Each distinct country is resolved once instead of once per row.
continent_by_country = {
    name: country_to_continent(name) for name in rates_data["country"].unique()
}
rates_data["Continent"] = rates_data["country"].map(continent_by_country)

#Drop the countries flagged by find_invalid and keep only years before 2016.
#.copy() makes the result an independent frame so the column assignment below
#does not act on a slice of the unfiltered data.
list_of_invalid = find_invalid(rates_data)
rates_data = rates_data[
    ~rates_data["country"].isin(list_of_invalid) & (rates_data["year"] < 2016)
].copy()

#Zero-pad the youngest age label so the age groups sort correctly as strings
rates_data["age"] = rates_data["age"].replace({"5-14 years": "05-14 years"})

print("Setup Complete")

- I removed countries that reported data in less than half the years available as well as those with very low populations as they lead to misleading results.
- Countries with too few reported years would have required too much synthetic data in many cases.
- Low population countries often had 0-2 cases for certain age groups and would lead to misleading large spikes.
- Used pycountry_convert to add a Continent column, which was not present in the original dataset.

In [None]:
# Count the null entries in every column; the single resulting column is
# named directly so it can serve as the x variable of the bar plot.
missing_vals = rates_data.isnull().sum().to_frame(name = "# Values")

plt.figure(figsize = (8, 4))
plt.title("Number of Missing Values in Data Fields")
sns.barplot(data = missing_vals, x = "# Values", y = missing_vals.index)
plt.show()

- The value of HDI (Human Development Index) is missing far too many values and will not be used
- Generation and GDP for year values are also not used in this breakdown
- The Generation value is questionable in its utility because it is a non-continuous variable with arbitrary cutoffs. It also suffers because it is derived from the age groups, which causes massive spikes in any graph when certain age groups 'leave' that generation.
- GDP per capita is a far better use to estimate a standard of living compared to GDP per year.

In [None]:
# Pool population and case counts over all countries for each year, then
# express the cases as a rate per 100,000 people.
by_year = rates_data.groupby("year")
yearly_pop = by_year["population"].sum()
yearly_raw = by_year["suicides_no"].sum()
yearly_rate = yearly_raw.div(yearly_pop).mul(100000)

# Mean of the yearly rates, drawn below as the horizontal reference line
average = yearly_rate.mean()

plt.figure(figsize = (14, 6))
plt.title("Global Yearly Trend (Suicides per 100,000. (1985-2015))")
sns.set(style = "whitegrid")

g = sns.lineplot(data = yearly_rate, marker = "o")
g.set_xlim(left = 1985, right = 2015)
g.axhline(y = average, c = "black")
plt.show()

- We can see an obvious almost absolute **increasing trend** in suicide rates **from 1985-1995**.
- This upward trend starts from a local low of 11.1 cases per 100,000 in the year 1985
- **1995** is the year when suicide **rates peaked**, reaching an average rate of **15.3 cases** per 100,000 people.

- **After 1995** we see a **slower, less steady decrease** in suicide rates
- With 1990 and 2005 both being closest to the average line it took nearly twice the time for the 1995 peak to decrease to that point
- Local minimum in 2015 with a rate of 11.9

- Due to a lack of information before 1985 we do not know where this starting trend began, or where the true long-term average is.

In [None]:
#Country Rates
# Totals per country over the whole period; Continent is kept in the key so it
# is available for colouring the bars.
country_keys = ["country", "Continent"]
country_pops = rates_data.groupby(country_keys)["population"].sum()
country_raw = rates_data.groupby(country_keys)["suicides_no"].sum()

country_rate = (
    ((country_raw / country_pops) * 100000)
    .rename("rate")
    .reset_index()
    .sort_values(by = "rate", ascending = False)
)

# Unweighted mean of the per-country rates, drawn as the vertical reference line
average = country_rate["rate"].mean()

plt.figure(figsize = (10, 28))
plt.title("Average Rate of Suicides per 100k by Country (1985-2015)")
sns.set(style = "whitegrid")

sns.barplot(data = country_rate, y = "country", x = "rate", hue = "Continent", dodge = False)
plt.axvline(x = average, c = "black")
plt.show()

- The countries with above average rates/100k are predominantly European
- **Europe is overrepresented** in the list as well, with 27 of the listed countries, the majority of them being **above** the global **average rate**
- **Africa is underrepresented** in this dataset with only 2 African countries which **leads to** some **misleading results** in later visualizations

- Some notable results include:
- Lithuania: 41.4 suicides per 100k (Highest in Europe as well as highest in the world)
- Kazakhstan (26.9), Japan (21.9), Korea (19.3) are significantly higher than the other Asian countries
- The United States (12.8), Australia (12.9) and Canada (13.0) all have very similar overall average rates
    
    

In [None]:
# Overall rate per continent (all years pooled), highest first
by_continent = rates_data.groupby("Continent")
cont_pop = by_continent["population"].sum()
cont_raw = by_continent["suicides_no"].sum()
cont_rate = (
    ((cont_raw / cont_pop) * 100000)
    .to_frame(name = "Average_Rate")
    .sort_values(by = "Average_Rate", ascending = False)
)

# Rate per continent and year, used for the small trend panels
by_continent_year = rates_data.groupby(["Continent", "year"])
cont_year_pop = by_continent_year["population"].sum()
cont_year_raw = by_continent_year["suicides_no"].sum()
cont_year_rate = ((cont_year_raw / cont_year_pop) * 100000).rename("Average_Rate").reset_index()

# Unweighted mean of the continent rates, drawn as the reference line
average = cont_rate["Average_Rate"].mean()

sns.set_style("dark")
fig = plt.figure(constrained_layout=True, figsize = (12,10))
gs = fig.add_gridspec(4, 3)

# Large bar chart in the top-left 3x2 region of the grid
ax1 = fig.add_subplot(gs[:3, :2])
ax1.set_title("Suicides per 100k Averaged by Continent (1985-2015)")

sns.barplot(ax = ax1, data = cont_rate, y = cont_rate.index, x = cont_rate["Average_Rate"], dodge = False)
ax1.set_ylabel(" ")
ax1.set_xlabel("Suicides per 100k")
ax1.axvline(x = average, c = "black")

# One small yearly-trend panel per continent: (grid slot, y label, continent)
trend_panels = [
    (gs[0, -1], "Africa", "Africa"),
    (gs[1, -1], "Asia", "Asia"),
    (gs[2, -1], "Europe", "Europe"),
    (gs[3, -1], "Oceania", "Oceania"),
    (gs[-1, -2], "N. America", "North America"),
    (gs[-1, -3], "S. America", "South America"),
]

trend_axes = []
for slot, axis_label, continent in trend_panels:
    panel = fig.add_subplot(slot)
    # Labels are set before plotting so the plot call keeps them as they are
    panel.set_ylabel(axis_label)
    panel.set_xlabel(" ")
    sns.lineplot(
        ax = panel,
        data = cont_year_rate.loc[cont_year_rate.Continent == continent],
        y = "Average_Rate",
        x = "year",
        marker = "o",
    )
    panel.get_xaxis().set_ticks([])
    trend_axes.append(panel)

ax2, ax3, ax4, ax5, ax6, ax7 = trend_axes

- North America sits almost perfectly at the world average rate (10.5 suicides per 100k)
- The major change in Africa's rate is due to a very small sample size (2 countries) which did not consistently report throughout the entire 1985-2015 period.
- There are some **concerning trends** in **South America, North America, and Oceania**.
- Though its trend is alarming, **South America's peak** (6.4 in 2015) is still **less** than the **North American minimum** yearly average (9.6)
- Despite their very different trends Asia and Europe are at a similar rate in 2015

In [None]:
# Total cases and population per age group, then the rate per 100,000.
# numeric_only=True restricts the sum to numeric columns; without it recent
# pandas versions "sum" the text columns by concatenating their strings.
age_groups = (
    rates_data.groupby("age", as_index = False)
    .sum(numeric_only = True)
    .eval("age_rate = (suicides_no / population) * 100000")
    .sort_values(by = "age")
)

plt.figure(figsize = (10,4))
plt.title("Global Yearly Trend for Age Groups (Suicides per 100,000. (1985-2015))")
g = sns.barplot(data = age_groups, x = "age", y = "age_rate", dodge = False)
g.set_ylabel(" ")
g.set_xlabel(" ")
plt.show()

- There is a clear relationship between age and suicide rates globally.

- The 5-14 rate is significantly lower than the nearest age group's rate (We see a jump of .62 to 9.3).

- This jump is 2-3 times as big as any other age group difference.

In [None]:
# Rate per 100k for every (continent, age group) pair, all years pooled
continent_age = ["Continent", "age"]
age_cont_pop = rates_data.groupby(continent_age)["population"].sum()
age_cont_raw = rates_data.groupby(continent_age)["suicides_no"].sum()

age_cont_rate = ((age_cont_raw / age_cont_pop) * 100000).rename("rate").reset_index()

# Unweighted mean over all continent/age cells, drawn as the reference line
cont_avg = age_cont_rate["rate"].mean()

plt.figure(figsize = (14,8))
g = sns.barplot(data = age_cont_rate, x = "Continent", y = "rate", hue = "age")
g.set(title = "Suicides Rate per 100k by Age Group and Continent (1985-2015)", ylabel = "", xlabel = "")

g.axhline(y = cont_avg, c = "black")
plt.show()

- **Africa and Oceania break the trend** of having the highest suicide rates **for the 75+ age group**. Both also have higher rates for (15-24) than their (55-74)
- South America has the most similar rate for the middle 4 age groups
- South America is almost entirely below the world's average rate, only surpassing it in the 75+ age group.
- **Asia and North America** have **similar (15-24)** age group rates. **Asia** has a far **more dramatic increase** as **age increases**.
- The **(5-14) age group** remains **consistently low** between all continents.

In [None]:
# Rate per 100k for every (year, age group) pair
year_age = ["year", "age"]
age_groups_pops = rates_data.groupby(year_age)["population"].sum()
age_groups_raw = rates_data.groupby(year_age)["suicides_no"].sum()

age_groups_rate = ((age_groups_raw / age_groups_pops) * 100000).rename("Age_Year_Rate").reset_index()

# One yearly series per age bracket, youngest to oldest
children_rate, ya_rate, adult_rate, middle_rate, senior_rate, old_af = (
    age_groups_rate[age_groups_rate["age"] == bracket]
    for bracket in (
        "05-14 years",
        "15-24 years",
        "25-34 years",
        "35-54 years",
        "55-74 years",
        "75+ years",
    )
)

fig, axs = plt.subplots(2, 3, sharex = True, sharey = True, figsize = (14,6))
fig.suptitle("Suicide Rate per 100k Trends by Age Bracket (1985-2015)")

# Fill the 2x3 grid row by row, one panel per bracket
bracket_series = (children_rate, ya_rate, adult_rate, middle_rate, senior_rate, old_af)
bracket_names = ("5-14", "15-24", "25-34", "35-54", "55-74", "75+")
for panel, bracket_rate, bracket_name in zip(axs.flat, bracket_series, bracket_names):
    panel.plot(bracket_rate["year"], bracket_rate["Age_Year_Rate"], label = bracket_name + " Age Group")
    panel.set_title(bracket_name + " age group")

plt.show()

- The **1995 peak is visible** in all age groups **except the 5-14 and 75+ age groups**.
- The 4 **age groups where** the **peak is visible** all follow the same general **decreasing trend since 1995**
- We see a **significant decreasing trend** in the **75+ age group** from a rate of almost 30 to 20. (The most significant change of any age group)

In [None]:
# Rate per 100k for every individual row (one sex/age group in one
# country-year), sorted so the highest rates come first.
highest_groups = (
    rates_data[["year", "country-year", "suicides_no", "population", "sex", "age"]]
    .copy()
    .assign(rate = lambda rows: (rows["suicides_no"] / rows["population"]) * 100000)
    .sort_values(by = "rate", ascending = False)
)

# Keep the 20 highest rows.
# NOTE(review): y_label (sex + country-year) is built but the plot below uses
# "country-year" on the y axis, so the sex of each group is not shown.
highest_top = highest_groups.head(20).reset_index()
highest_top["y_label"] = highest_top["sex"] + " " + highest_top["country-year"]

g = sns.catplot(data = highest_top, x = "rate", y = "country-year", hue = "age", s = 10, height = 8)
g.set_axis_labels("Suicides per 100k Population. (Top 20 at Most Risk Age Groups Historically)", " ")
plt.show()

- 19 of the top 20 highest-risk groups at any point in time are in the 75+ age group
- Lithuanians aged 35-54 in 1996 are the only other group in the top 20
- Koreans and Hungarians aged 75+ dominate the list, holding 17 of the 20 spots, and are almost consistently the highest-risk age group globally

In [None]:
# Population and case totals for every (sex, continent) pair
sex_continent = ["sex", "Continent"]
cont_sex_pop = rates_data.groupby(sex_continent)["population"].sum()
cont_sex_cases = rates_data.groupby(sex_continent)["suicides_no"].sum()

# Put both totals side by side (they share the same index), add the rate per
# 100k, then flatten the index and order the rows from highest to lowest rate.
rate_sex = pd.concat([cont_sex_pop, cont_sex_cases], axis = 1)
rate_sex["per_100k"] = (rate_sex["suicides_no"] / rate_sex["population"]) * 100000
rate_sex = rate_sex.reset_index().sort_values(by = "per_100k", ascending = False)

plt.figure(figsize=(12,8))
g = sns.barplot(data = rate_sex, hue = "sex", x = "per_100k", y = "Continent")
g.set(xlabel = " ", ylabel = "", xlim = (0, 30))
g.set_title("Reported Suicide Cases Per 100k People by Continent/Sex (1985-2015)")
plt.show()

- In **Europe** we see the most **significant difference** in male and female suicide rates **(over 4x as high for males)**
- Europe and Asia have similar female rates but Europe's male rate is much higher
- Asia has the highest female suicide rates (8.56) which are almost equal to South America's male rate (8.63)

In [None]:
# Totals per sex over the whole period
mf_pop = rates_data.groupby(["sex"])["population"].sum()
mf_case = rates_data.groupby(["sex"])["suicides_no"].sum()

# Totals per sex and year
mf_pop_year = rates_data.groupby(["sex", "year"])["population"].sum()
mf_case_year = rates_data.groupby(["sex", "year"])["suicides_no"].sum()

mf_rate = (mf_case / mf_pop) * 100000

# Each sex's share of all cases, in percent. Dividing by the overall total
# avoids integer-position indexing on the string-labelled ("female"/"male")
# Series, which pandas has deprecated.
mf_proportion = ((mf_case / mf_case.sum()) * 100).round(1)

# Yearly rate per 100k, split into one frame per sex
mf_rate_year = ((mf_case_year / mf_pop_year) * 100000).to_frame().reset_index()
mf_rate_year = mf_rate_year.rename(columns={mf_rate_year.columns[2]:'rate'})
m_rate_year = mf_rate_year.loc[mf_rate_year['sex']=='male']
f_rate_year = mf_rate_year.loc[mf_rate_year["sex"]=='female']

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Labels are taken from the Series index so they always match the wedge order
labels = [sex.capitalize() for sex in mf_proportion.index]
explode = (.01, 0)
patches = ax1.pie(mf_proportion, explode = explode, labels = labels, labeldistance = .4, autopct='%1.1f%%')

ax1.set_title('Global proportion of suicide cases (1985-2015)')

ax2.plot(m_rate_year["year"], m_rate_year["rate"], color = "blue", label = "Male", marker = "o")
ax2.plot(f_rate_year["year"], f_rate_year["rate"], color = "orange", label = "Female", marker = "o")
ax2.set_title("Yearly suicides per 100k by sex. (1985-2015)")
ax2.set_xlabel(" ")
ax2.set_ylabel("Cases per 100k")
ax2.legend()

# Axis limits apply to the yearly-trend panel
ax2.set_xlim(left = 1985)
ax2.set_ylim(bottom = 0, top = 25.0)
plt.show()

- Globally the **male** suicide **rate** is **significantly higher** than that of females. (over 4x as high)
- We see the **increase -> 1995 -> decrease** trend for **both sexes**
- The **trends/changes** are far **more dramatic** for the **male rate** which is clearly visible on the graph
- The ratio of male to female suicides was lower in the 80's but has remained consistently higher since the sharp male increase in 1990

In [None]:
#Get grouped series for population and cases, per sex and country
coun_sex_pop = rates_data.groupby(["sex", "country"])["population"].sum()
coun_sex_cases = rates_data.groupby(["sex", "country"])["suicides_no"].sum()

#Same totals per country with both sexes combined.
#NOTE(review): country_rate holds case counts here, not a rate, and reuses the
#name of the rate table built in the country bar-chart cell.
country_pop = rates_data.groupby(["country"])["population"].sum()
country_rate = rates_data.groupby(["country"])["suicides_no"].sum()

#Merge the grouped series into a single frame, adding a column for rate per 100k
coun_rate_sex = (pd.merge(left = coun_sex_pop, right = coun_sex_cases, on = ["sex","country"])).reset_index()
coun_rate_sex["per_100k"] = (coun_rate_sex["suicides_no"] / coun_rate_sex["population"]) * 100000

#For combined numbers
country_combined = pd.merge(left = country_pop, right = country_rate, on = "country")
country_combined["Combined Rate"] = (country_combined["suicides_no"] / country_combined["population"]) * 100000

#Pivot so male/female are columns, sort by country
coun_rate_sex_pivot = coun_rate_sex.pivot(index = "country", values = "per_100k", columns = "sex")
coun_rate_sex_pivot = coun_rate_sex_pivot.sort_values(by = "country", ascending = False)

#Attach the combined rate and order the countries by it (lowest first, so the
#highest ends up at the top of the chart)
coun_rate_sex_pivot = pd.merge(left = coun_rate_sex_pivot, right = country_combined["Combined Rate"], on = "country")
coun_rate_sex_pivot = coun_rate_sex_pivot.sort_values(by = "Combined Rate")

#Get range for Y values for our lines/points
my_range=range(1,len(coun_rate_sex_pivot.index)+1)

#Figuresize and lines: one grey segment per country from the female to the male rate
plt.figure(figsize = (12,16))
ax = plt.hlines(y=my_range, xmin=coun_rate_sex_pivot['female'], xmax=coun_rate_sex_pivot['male'], color='grey', alpha=0.4)

#Male/Female/Overall dots
plt.scatter(coun_rate_sex_pivot['male'], my_range, color='navy', alpha=1, label='male')
plt.scatter(coun_rate_sex_pivot['female'], my_range, color='gold', alpha=1 , label='female')
plt.scatter(coun_rate_sex_pivot["Combined Rate"], my_range, color="green", alpha = 1, label = "Overall Rate")

plt.legend()
plt.yticks(my_range, coun_rate_sex_pivot.index)
#The setup cell keeps only years before 2016, so the period ends in 2015
plt.title("Suicide rates of Males/Females by country. (Rate Per 100k) (1985-2015)", loc='left')
plt.xlabel('# Suicides per 100k people.')
plt.ylabel(' ')
plt.grid(False, axis = "y")
plt.xlim(left = 0)
plt.margins(0.01)

plt.show()

- Males having a higher suicide rate is universal
- The countries with the highest overall rates appear to be more heavily skewed toward male suicide rates. This datapoint is further examined below. 

In [None]:
# Rate per 100k for every (sex, country) pair
sex_country = ["sex", "country"]
count_sex_pop = rates_data.groupby(sex_country)["population"].sum()
count_sex_raw = rates_data.groupby(sex_country)["suicides_no"].sum()

# Country x sex table. "male" becomes the male share (in percent) of the two
# rates added together, and "female" is set to a constant 100 so that its bar
# forms the full-width background behind the male bar.
count_sex_rate = (
    ((count_sex_raw / count_sex_pop) * 100000)
    .rename("rate")
    .reset_index()
    .pivot(index = "country", columns = "sex", values = "rate")
    .assign(
        male = lambda rates: (rates["male"] / (rates["male"] + rates["female"])) * 100,
        female = 100,
    )
    .sort_values(by = "male", ascending = False)
)

plt.figure(figsize = (8, 16))

# Draw the full-width orange bars first, then the blue male share on top
g = sns.barplot(data = count_sex_rate, x = "female", y = count_sex_rate.index, color = "orange")
sns.barplot(data = count_sex_rate, x = "male", y = count_sex_rate.index, color = "blue")
g.set_title("Male/Female Proportion of Suicide Per Country (1985-2015)")
g.set_xlabel("Blue = Male Proportion, Orange = Female Proportion")
g.set_ylabel(" ")
g.set_xlim(0,100)

plt.show()

In [None]:
#Group our Data
mf_pop_country = rates_data.groupby(["sex", "country"])["population"].sum()
mf_case_country = rates_data.groupby(["sex", "country"])["suicides_no"].sum()
pop_country = rates_data.groupby(["country"])["population"].sum()
case_country = rates_data.groupby(["country"])["suicides_no"].sum()

#Rates per 100k. Pivoting with values=0 yields the country x sex table
#directly, so the "difference" column below is added to a frame of its own
#rather than to a selection taken out of a nested-column pivot.
mf_rate_country = (
    ((mf_case_country / mf_pop_country) * 100000)
    .to_frame()
    .reset_index()
    .pivot(index = "country", columns = "sex", values = 0)
)
rate_country = ((case_country / pop_country) * 100000).to_frame().reset_index()
rate_country = rate_country.rename(columns= {0: 'rate'})

#Calculate difference between M/F, sort dataframes
mf_rate_country["difference"] = mf_rate_country["male"] - mf_rate_country["female"]
mf_rate_country = mf_rate_country.sort_values("difference", ascending = False)
rate_country = rate_country.sort_values("rate", ascending = False)

#Top 15 by overall rate and top 15 by M/F difference; flag the overlap
rate_country_top = rate_country[:15]
mf_rate_country_top = mf_rate_country[:15].reset_index()
mf_rate_country_top["Country In Top 15 Overall Rate"] = np.where((mf_rate_country_top["country"]).isin(rate_country_top["country"]), 'Yes', 'No')

#Mean difference over all countries, drawn as the reference line
diff_avg = mf_rate_country["difference"].mean()

plt.figure(figsize = (10,6))
plt.title("Countries with the highest difference between Male and Female Suicide Rates (1985-2015)")

sns.set(style = "whitegrid")
g = sns.barplot(data = mf_rate_country_top, y = mf_rate_country_top["country"], x = mf_rate_country_top["difference"],
            hue = "Country In Top 15 Overall Rate", dodge = False)
g.set_ylabel("")
g.set_xlabel("Mean Yearly Difference between Male/Female Suicide Rates (per 100k)")
plt.axvline(x = diff_avg, c = "black")
plt.show()

- Of the 15 countries with the highest discrepancy between male and female suicide rates, 13 are also in the top 15 highest overall
- This supports our theory that higher male suicide rates are more deterministic in the countries with higher rates


In [None]:
# Per-country means of the reported rate and of GDP per capita. Continent is
# part of the key so it is available for colouring the points.
country_keys = ["country", "Continent"]
rates_country = (rates_data.groupby(country_keys)["suicides/100k pop"].mean()).reset_index()
gdp_country = (rates_data.groupby(country_keys)["gdp_per_capita ($)"].mean()).reset_index()

# Merge on both key columns so the result has a single "Continent" column
# instead of two suffixed copies of it.
rates_gdp = pd.merge(left = gdp_country, right = rates_country, on = country_keys)

x, y = rates_gdp["suicides/100k pop"], rates_gdp["gdp_per_capita ($)"]
g = sns.JointGrid(x=x,y=y, height = 10)

# Scatter in the joint axes, histograms with KDE in the margins
s = sns.scatterplot(data = rates_gdp, x=x, y=y,
                s=150,
                linewidth=1,
                ax=g.ax_joint,
                hue = "Continent",
                ec = "black",
                alpha = .8)
s.set_ylabel("GDP per Person")
s.set_xlabel("Country Average Suicides per 100k (1985-2015)")
g.plot_marginals(sns.histplot, kde=True)
plt.show()

- There appears to be a weak positive relationship between GDP per Capita and Suicide rates in a country (.0092)
- There also exist a great number of outliers in this relationship as well


In [None]:
# Rate per 100k for every (country, year) pair, from summed cases and population
diff_pop = rates_data.groupby(["country", "year"])["population"].sum()
diff_raw = rates_data.groupby(["country", "year"])["suicides_no"].sum()
diff_rate = ((diff_raw / diff_pop) * 100000).to_frame().reset_index()
diff_rate = diff_rate.rename(columns = {0: "rate"})

# Year x country table, then the change from each row (year) to the next
diff_rate_pivot = diff_rate.pivot(index = "year", columns = "country", values = "rate")
diff_rate_pivot = diff_rate_pivot.diff()

# Fill missing changes (including the first row, which diff() leaves empty)
# with that country's mean change so every column is complete for pearsonr
diff_rate_pivot = diff_rate_pivot.fillna(diff_rate_pivot.mean())

# Per country: Pearson correlation between the year and the yearly change.
# The two returned values are expanded into rows 0 and 1, which are then
# labelled "r-val" / "p-val"; only the p-value row is kept.
p_r_vals = diff_rate_pivot.apply(lambda vals: pearsonr(diff_rate_pivot.index, vals), result_type='expand')
p_r_vals = p_r_vals.rename(index = {0:"r-val", 1:"p-val"})
p_r_vals = p_r_vals.drop(labels = "r-val")

# Long form: one p-value per (country, "p-val") entry
long_p_vals = p_r_vals.unstack()
# NOTE(review): despite its name, stat_insig holds the entries with p < .05;
# the following cell excludes these countries. Confirm that this is the
# intended selection.
stat_insig = (long_p_vals.loc[long_p_vals < .05]).reset_index()
# Remaining entries: p > .05
long_p_vals = long_p_vals.loc[long_p_vals > .05]

In [None]:
# Rate per 100k for every (country, year) pair
diff_pop = rates_data.groupby(["country", "year"])["population"].sum()
diff_raw = rates_data.groupby(["country", "year"])["suicides_no"].sum()

diff_rate = ((diff_raw / diff_pop) * 100000).to_frame().reset_index()
diff_rate = diff_rate.rename(columns = {0: "rate"})

# Year-over-year change per country and its mean over the period
diff_rate_pivot = diff_rate.pivot(index = "year", columns = "country", values = "rate")
diff_rate_pivot = diff_rate_pivot.diff()
diff_rate_mean = diff_rate_pivot.mean()

# Drop the countries listed in stat_insig (built in the p-value cell), then
# order the remaining ones from the largest to the smallest mean change.
# NOTE(review): stat_insig holds the p < .05 entries, so the countries kept
# here are those with p >= .05; confirm this matches the plot title.
diff_rate_mean = diff_rate_mean[~diff_rate_mean.index.isin(stat_insig["country"])]
diff_rate_mean = diff_rate_mean.to_frame()
diff_rate_mean = diff_rate_mean.rename(columns = {0:"Mean Difference"})
diff_rate_mean = diff_rate_mean.sort_values(by = "Mean Difference", ascending = False)

my_range=range(1,len(diff_rate_mean.index)+1)

#Figuresize and lines: one grey segment per country from 0 to its mean change.
#The column is passed explicitly rather than the whole one-column frame.
plt.figure(figsize = (8,16))
ax = plt.hlines(y=my_range, xmin=0, xmax=diff_rate_mean["Mean Difference"], color='grey', alpha=0.4)

#One dot per country at its mean yearly change
plt.scatter(diff_rate_mean["Mean Difference"], my_range, color="green", s = 50)

plt.yticks(my_range, diff_rate_mean.index)
plt.title("Statistically Significant Linear Suicide Rate Average Change/Year (1985-2015)")
plt.xlabel("Average Rate Change per year (Suicides per 100k)")
plt.grid(False, axis = "y")
plt.axvline(x = 0, c = "black")
plt.margins(0.01)

plt.show()

- Used Pearsonr to determine **countries which have** at least a **relatively linear change over time** in suicide trends
- Well over half of the countries looked at have a negative slope, (47/78) (60.3%)
- This means **over 60% of countries** with a relatively linear relationship between time and suicide rate have **decreasing rates**.
- Highest positive rates (Guyana: .74), (Republic of Korea: .61), (Cyprus: .31)
- Highest negative rates (Estonia: -1.49), (Latvia: -1.20), (Hungary: -.63)
- With our highest decreasing rates being far above our highest increasing rates, and a majority of countries having negative slopes, the outlook for future trends is very good.

In [None]:
# Rate per 100k for every (country, year) pair
country_year = ["country", "year"]
yearly_pop = rates_data.groupby(country_year)["population"].sum()
yearly_raw = rates_data.groupby(country_year)["suicides_no"].sum()

yearly_rate = ((yearly_raw / yearly_pop) * 100000).to_frame().reset_index()

# diff_rate_mean is sorted from highest to lowest mean change, so its last 15
# rows are the countries with the lowest mean yearly change
lowest_sloped = diff_rate_mean.tail(15)

yearly_rate_low = (
    yearly_rate.loc[yearly_rate["country"].isin(lowest_sloped.index)]
    .rename(columns = {0: "Suicides per 100k"})
)

# One small panel per country, five panels per row
g = sns.relplot(data=yearly_rate_low, x="year", y="Suicides per 100k", hue="country", col="country", col_wrap = 5, legend = False, height = 2.5)
g.tight_layout(w_pad = 0)

In [None]:
# Rate per 100k for every (country, year) pair
country_year = ["country", "year"]
yearly_pop = rates_data.groupby(country_year)["population"].sum()
yearly_raw = rates_data.groupby(country_year)["suicides_no"].sum()

yearly_rate = ((yearly_raw / yearly_pop) * 100000).to_frame().reset_index()

# diff_rate_mean is sorted from highest to lowest mean change, so its first 15
# rows are the countries with the highest mean yearly change
highest_sloped = diff_rate_mean.head(15)

# NOTE(review): the name says "low", but this frame holds the highest-sloped
# countries selected above.
yearly_rate_low = (
    yearly_rate.loc[yearly_rate["country"].isin(highest_sloped.index)]
    .rename(columns = {0: "Suicides per 100k"})
)

# One small panel per country, five panels per row
g = sns.relplot(data=yearly_rate_low, x="year", y="Suicides per 100k", hue="country", col="country", col_wrap = 5, legend = False, height = 2.5)
g.tight_layout(w_pad = 0)

In [None]:
# Countries compared against Canada in the cell that follows
canada_comparisons = (
    "Canada",
    "France",
    "United Kingdom",
    "United States",
    "Sweden",
    "Italy",
)
in_comparison = rates_data["country"].isin(canada_comparisons)
compare_data = rates_data[in_comparison]
compare_data

In [None]:
# Rate per 100k for every (country, age group) pair of the comparison subset.
# NOTE(review): the age_cont_* and cont_avg names are reused from the
# continent-level cell although the grouping here is by country.
age_cont_pop = compare_data.groupby(["country", "age"])["population"].sum()
age_cont_raw = compare_data.groupby(["country", "age"])["suicides_no"].sum()

age_cont_rate = ((age_cont_raw / age_cont_pop) * 100000).to_frame().reset_index()
age_cont_rate = age_cont_rate.rename(columns = {0: "rate"})

# Unweighted mean over all country/age cells, drawn as the reference line
cont_avg = age_cont_rate["rate"].mean()

plt.figure(figsize = (16,8))
g = sns.barplot(data = age_cont_rate, x = "country", y = "rate", hue = "age")
# The bars are grouped by country, so the title says Country
g.set_title("Suicides Rate per 100k by Age Group and Country (1985-2015)")
g.set_ylabel("Cases per 100k")
g.set_xlabel("")

plt.axhline(y = cont_avg, c = "black")
plt.show()

In [None]:
def correlate(df, val1, val2, val1_is_binary):
    """Print the correlation between two columns of ``df`` with its p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    val1, val2 : str
        Labels of the two columns to correlate.
    val1_is_binary : bool
        When true, ``scipy.stats.pointbiserialr`` is used (``val1`` is then
        expected to be the binary column); otherwise ``scipy.stats.pearsonr``.

    Returns
    -------
    None
        The coefficient and p-value are printed with three significant digits.
    """
    # Imported here so the function works no matter which cell imported scipy
    from scipy import stats

    if val1_is_binary:
        # Both tests take the column data, not the column labels
        r = stats.pointbiserialr(df[val1], df[val2])
    else:
        r = stats.pearsonr(df[val1], df[val2])

    print("Correlation between {} and {}: {:.3}, with a p-value of {:.3}".format(val1, val2, r[0], r[1]))

In [None]:
import scipy.stats
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the correlation step cannot modify rates_data
numerical_data = rates_data.copy()

# Report the correlation for the column pair in both orders
column_pairs = (
    ("suicides/100k pop", "gdp_per_capita ($)"),
    ("gdp_per_capita ($)", "suicides/100k pop"),
)
for first_column, second_column in column_pairs:
    correlate(numerical_data, first_column, second_column, False)