In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount("/content/drive/")
# Make the project folder the working directory; the CSV files below are read by relative name.
%cd '/content/drive/My Drive/DS320'
# Print the current working directory as a sanity check.
!pwd

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/DS320
/content/drive/My Drive/DS320


In [None]:
# Read the five yearly happiness CSVs and the suicide-statistics CSV from the
# current working directory. File order matches the target names one-to-one.
(
    GDP_Happiness_2015,
    GDP_Happiness_2016,
    GDP_Happiness_2017,
    GDP_Happiness_2018,
    GDP_Happiness_2019,
) = map(
    pd.read_csv,
    (
        "Happiness_2015.csv",
        "Happiness_2016.csv",
        "Happiness_2017.csv",
        "Happiness_2018.csv",
        "Happiness_2019.csv",
    ),
)
Suicide_before_modified = pd.read_csv("who_suicide_statistics.csv")

In [None]:
# Pre-processing for GDP_Happiness data.
# The yearly files label their columns differently, so each entry below pairs a
# raw frame with (a) the columns to discard and (b) the renames that map the
# kept columns onto the names used by the 2015 table.
cols_dropped_2018_19 = ['Overall rank', 'Social support', 'Healthy life expectancy',
                        'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']
renames_2018_19 = {'Country or region': 'country', 'Score': 'Happiness Score',
                   'GDP per capita': 'Economy (GDP per Capita)'}

GH2015, GH2016, GH2017, GH2018, GH2019 = (
    raw.drop(columns=unwanted).rename(columns=renames)
    for raw, unwanted, renames in (
        (
            GDP_Happiness_2015,
            ['Region', 'Happiness Rank', 'Standard Error', 'Family', 'Health (Life Expectancy)',
             'Freedom', 'Trust (Government Corruption)', 'Generosity', 'Dystopia Residual'],
            {'Country': 'country'},
        ),
        (
            GDP_Happiness_2016,
            ['Region', 'Happiness Rank', 'Lower Confidence Interval', 'Upper Confidence Interval',
             'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
             'Generosity', 'Dystopia Residual'],
            {'Country': 'country'},
        ),
        (
            GDP_Happiness_2017,
            ['Happiness.Rank', 'Whisker.high', 'Whisker.low', 'Family', 'Health..Life.Expectancy.',
             'Freedom', 'Trust..Government.Corruption.', 'Generosity', 'Dystopia.Residual'],
            {'Country': 'country', 'Happiness.Score': 'Happiness Score',
             'Economy..GDP.per.Capita.': 'Economy (GDP per Capita)'},
        ),
        (GDP_Happiness_2018, cols_dropped_2018_19, renames_2018_19),
        (GDP_Happiness_2019, cols_dropped_2018_19, renames_2018_19),
    )
)

# Tag each frame with its year as the first column.
# NOTE(review): the year is stored as a string ('2015'), whereas the suicide
# data's 'year' column is compared against the integer 2015 — confirm before
# ever joining on year.
for year_label, yearly_frame in (
    ('2015', GH2015),
    ('2016', GH2016),
    ('2017', GH2017),
    ('2018', GH2018),
    ('2019', GH2019),
):
    yearly_frame.insert(0, 'Year', year_label)

In [None]:
# Stack the selected yearly happiness frames into one table with a fresh
# 0..n-1 index. Only the 2015 frame is included at present.
frames_to_stack = [GH2015]
GH = pd.concat(frames_to_stack, ignore_index=True)
GH

Unnamed: 0,Year,country,Happiness Score,Economy (GDP per Capita)
0,2015,Switzerland,7.587,1.39651
1,2015,Iceland,7.561,1.30232
2,2015,Denmark,7.527,1.32548
3,2015,Norway,7.522,1.45900
4,2015,Canada,7.427,1.32629
...,...,...,...,...
153,2015,Rwanda,3.465,0.22208
154,2015,Benin,3.340,0.28665
155,2015,Syria,3.006,0.66320
156,2015,Burundi,2.905,0.01530


In [None]:
# Pre-processing for Suicide data.
# The 'sex' and 'age' breakdown columns are dropped so the remaining numeric
# columns can be summed per (country, year).
suicide_observed = (
    Suicide_before_modified
    .drop(columns=['sex', 'age'])
    # A missing count is "not reported", not zero. Filling NaN with 0 made
    # unreported country-years look like a genuine 0% suicide rate, and let
    # population be counted for rows whose suicides were never reported.
    # Only rows where both measures are present contribute to the totals.
    .dropna(subset=['suicides_no', 'population'])
)
# Aggregate to one row per country and year.
Suicide = suicide_observed.groupby(by=['country', 'year'], as_index=False).sum()
# Suicide rate as a percentage of the population covered by the summed rows.
suicide_rate = (Suicide['suicides_no'] / Suicide['population']) * 100
# Place the rate right after the two key columns.
Suicide.insert(2, 'suicide_rate(%)', suicide_rate)

In [None]:
# Restrict the aggregated table to the year under study.
# (Despite the "_16" in the variable name, only 2015 rows are selected.)
in_2015 = Suicide['year'].eq(2015)
Suicide_2015_16 = Suicide[in_2015]
# Discard rows without usable population data: the total must be positive.
has_population = Suicide_2015_16['population'].gt(0)
Suicide_Final = Suicide_2015_16[has_population]
Suicide_Final

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population
30,Albania,2015,0.000000,0.0,2719684.0
87,Antigua and Barbuda,2015,0.001088,1.0,91889.0
124,Argentina,2015,0.007741,3073.0,39699624.0
158,Armenia,2015,0.002647,74.0,2795335.0
179,Aruba,2015,0.009117,9.0,98712.0
...,...,...,...,...,...
3448,Ukraine,2015,0.018773,7574.0,40345446.0
3491,United Kingdom,2015,0.008038,4910.0,61082942.0
3529,United States of America,2015,0.014726,44189.0,300078511.0
3564,Uruguay,2015,0.019744,630.0,3190795.0


In [None]:
# Spot check (currently disabled): look up Moldova's row in the happiness data.
# GH.loc[GH["country"] == "Moldova"]

Unnamed: 0,Year,country,Happiness Score,Economy (GDP per Capita)
51,2015,Moldova,5.889,0.59448


In [None]:
# Left join on country name: every suicide-data row is kept, and the happiness
# columns are filled in only where the country names match exactly.
Joined_data = Suicide_Final.merge(GH, how='left', on='country')
Joined_data

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population,Year,Happiness Score,Economy (GDP per Capita)
0,Albania,2015,0.000000,0.0,2719684.0,2015,4.959,0.87867
1,Antigua and Barbuda,2015,0.001088,1.0,91889.0,,,
2,Argentina,2015,0.007741,3073.0,39699624.0,2015,6.574,1.05351
3,Armenia,2015,0.002647,74.0,2795335.0,2015,4.350,0.76821
4,Aruba,2015,0.009117,9.0,98712.0,,,
...,...,...,...,...,...,...,...,...
68,Ukraine,2015,0.018773,7574.0,40345446.0,2015,4.681,0.79907
69,United Kingdom,2015,0.008038,4910.0,61082942.0,2015,6.867,1.26637
70,United States of America,2015,0.014726,44189.0,300078511.0,,,
71,Uruguay,2015,0.019744,630.0,3190795.0,2015,6.485,1.06166


In [None]:
# Rows for which the left join produced no Happiness Score. These are the
# countries whose names need reconciling between the two data sets.
lacks_happiness_score = Joined_data['Happiness Score'].isna()
Heterogeneity = Joined_data[lacks_happiness_score]
Heterogeneity

Unnamed: 0,country,year,suicide_rate(%),suicides_no,population,Year,Happiness Score,Economy (GDP per Capita)
1,Antigua and Barbuda,2015,0.001088,1.0,91889.0,,,
4,Aruba,2015,0.009117,9.0,98712.0,,,
9,Belize,2015,0.008129,26.0,319835.0,,,
11,Brunei Darussalam,2015,0.001823,7.0,384080.0,,,
15,Cuba,2015,0.013949,1511.0,10832068.0,,,
26,Grenada,2015,0.0,0.0,96892.0,,,
28,Hong Kong SAR,2015,0.014124,990.0,7009500.0,,,
31,Iran (Islamic Rep of),2015,0.003273,2372.0,72460999.0,,,
48,Puerto Rico,2015,0.006514,226.0,3469521.0,,,
50,Republic of Korea,2015,0.027757,13510.0,48671752.0,,,


In [None]:
pip install py_stringmatching



In [None]:
# Jaccard Similarity

import py_stringmatching as sm

def jaccard_sim(x,y,k):
  """Return the Jaccard similarity of two strings over their k-gram sets.

  Both strings are tokenized with ``sm.QgramTokenizer(qval=k, suffix_pad="#")``
  and the score is ``|A & B| / |A | B|``, where A and B are the *sets* of
  distinct tokens of ``x`` and ``y``.

  Args:
    x: First string.
    y: Second string.
    k: Length of the q-grams to compare.

  Returns:
    A float in [0.0, 1.0]. Identical token sets give 1.0. If neither string
    yields any token, 0.0 is returned instead of dividing by zero.
  """
  k_gram = sm.QgramTokenizer(qval=k, suffix_pad="#")
  # De-duplicate before counting: the tokenizer output can repeat a token, and
  # mixing list lengths with a set intersection inflated the denominator.
  grams_x = set(k_gram.tokenize(x))
  grams_y = set(k_gram.tokenize(y))
  union = grams_x | grams_y
  if not union:
    return 0.0
  return float(len(grams_x & grams_y)) / len(union)



In [None]:
# Replace every unmatched suicide-data country name with the happiness-data
# country name that has the highest character-unigram Jaccard similarity.

# Work on an explicit copy: Heterogeneity was sliced out of Joined_data, and
# assigning into that slice is what triggered SettingWithCopyWarning.
Heterogeneity = Heterogeneity.copy()

candidate_names = GH["country"].tolist()
# max() returns the first candidate among ties.
closest_name = {
    name: max(candidate_names, key=lambda candidate: jaccard_sim(name, candidate, 1))
    for name in Heterogeneity["country"].unique()
}
Heterogeneity["country"] = Heterogeneity["country"].map(closest_name)

# NOTE(review): the best-scoring candidate is accepted unconditionally — there
# is no minimum-similarity cutoff — so the resulting pairings should be checked
# by hand before they are used for a join.
Heterogeneity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,country,year,suicide_rate(%),suicides_no,population,Year,Happiness Score,Economy (GDP per Capita)
1,Saudi Arabia,2015,0.001088,1.0,91889.0,,,
4,Austria,2015,0.009117,9.0,98712.0,,,
9,Brazil,2015,0.008129,26.0,319835.0,,,
11,Belarus,2015,0.001823,7.0,384080.0,,,
15,Colombia,2015,0.013949,1511.0,10832068.0,,,
26,Ireland,2015,0.0,0.0,96892.0,,,
28,Hong Kong,2015,0.014124,990.0,7009500.0,,,
31,Dominican Republic,2015,0.003273,2372.0,72460999.0,,,
48,South Africa,2015,0.006514,226.0,3469521.0,,,
50,Dominican Republic,2015,0.027757,13510.0,48671752.0,,,
