In [None]:
import pandas as pd
import io

from IPython.display import display

In [None]:
# Load the original 2019 World Happiness CSV and render it for reference.
# NOTE(review): `dataframe` is not referenced again in the visible cells; the
# rest of the notebook works on `df`, loaded from a different file below.
dataframe = pd.read_csv("../input/world-happiness/2019.csv")
display(dataframe)

In [None]:
# Load the working dataset used by every later cell.
# NOTE(review): presumably a pre-cleaned variant of the 2019 data, judging by the file name — confirm.
df = pd.read_csv("../input/worldhappiness2019clean/2019clean.csv")

In [None]:
# Render the working frame (pandas truncates long frames in the notebook view).
display(df)

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column (isna is the same method as isnull).
df.isna().sum()

In [None]:
# Replace missing Region values with the most frequent region (the mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Region'] selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and, with Copy-on-Write enabled,
# modifies a temporary object and leaves df unchanged.
df['Region'] = df['Region'].fillna(df['Region'].mode()[0])

Nilai yang hilang (*missing value*) pada kolom `Region` diganti dengan nilai modus, yaitu nilai `Region` yang paling sering muncul.

In [None]:
# Show the mode(s) of Region; element [0] of this result is the fill value used above.
df['Region'].mode()

In [None]:
# Inspect the Region column.
df['Region']

In [None]:
# Re-check whether any missing values remain.
df.isna().sum()



---

## Statistik Deskriptif

---



Region manakah yang memiliki rata-rata skor kebahagiaan tertinggi dan terendah?

In [None]:
# List the distinct regions and how many there are.
distinct_regions = df.Region.unique()

print("\n\nRegion in Dataset:\n")
print("There are {} different values\n".format(len(distinct_regions)))
print(distinct_regions)

In [None]:
# Mean happiness Score per Region, sorted from happiest to least happy.
#
# groupby replaces the original per-region Python loop:
#   * sort=False keeps regions in order of first appearance, so Region_data
#     has the same row order and 0..n-1 index as the loop-built frame;
#   * a missing Region no longer triggers a division by zero (groupby skips
#     NaN keys) and missing Scores are ignored by mean() instead of turning
#     the whole region average into NaN;
#   * no loop temporaries (e.g. a DataFrame named `Region`) are left behind
#     in the notebook namespace.
Region_data = (
    df.groupby('Region', sort=False)['Score']
      .mean()
      .rename('Region_happiness_ratio')
      .reset_index()
)
sorted_data = Region_data.sort_values('Region_happiness_ratio', ascending=False)

sorted_data

In [None]:
#Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the mean happiness score per region, in the order of sorted_data.
plt.figure(figsize=(12, 10))
region_palette = sns.cubehelix_palette(len(sorted_data['Region']))
sns.barplot(
    x=sorted_data['Region'],
    y=sorted_data['Region_happiness_ratio'],
    palette=region_palette,
)
# Region names are long, so draw the tick labels vertically.
plt.xticks(rotation=90)
plt.xlabel('Region')
plt.ylabel('Region Happiness Ratio')
plt.title('Happiness rate for regions')
plt.show()



---



## Statistik Analitik


### Clustering

In [None]:
import seaborn as sns

# Pairwise correlation between the happiness indicators.
var = df[['Score','Economy','Social','Life', 'Freedom',
          'Generosity','Trust','Region','Rank']]
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working when a column such as Region holds text: recent
# pandas versions raise on non-numeric columns instead of silently dropping them.
cor = var.select_dtypes(include='number').corr()
print(cor)
sns.heatmap(cor, square = True)

In [None]:
# Drop the variables that are not used for clustering.
unused_columns = ['Country', 'Region', 'Rank', 'Generosity']
clean_df = df.drop(columns=unused_columns)
clean_df.head()

In [None]:
# Normalisasi

from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardise every column of clean_df except the first one, after replacing
# NaN values with 0.
# NOTE(review): data_norm is not referenced by the later visible cells — the
# clustering below is fitted on the unscaled clean_df. Confirm whether that is
# intended.
X = np.nan_to_num(clean_df.values[:, 1:])
scaler = StandardScaler()
data_norm = scaler.fit_transform(X)
data_norm

In [None]:
# menentukan k optimal

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


# Elbow method: fit KMeans for k = 2..9 and plot the distortion score per k.
# random_state is fixed so the curve (and the suggested k) is reproducible
# across runs, consistent with the seeded KMeans models used further below.
visualizer = KElbowVisualizer(KMeans(random_state=42), k=(2,10), timings=False)
visualizer.fit(clean_df)        # Fit the data to the visualizer
visualizer.show()               # Finalize (title, labels, legend) and render the figure

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Silhouette plots for k=2 and k=3, side by side.
# Each visualizer gets its own axes: without an explicit `ax`, both fall back
# to the current matplotlib axes and the second plot is drawn on top of the first.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
visualizer_silhouette1 = SilhouetteVisualizer(KMeans(2, random_state=42), colors='yellowbrick', ax=axes[0])
visualizer_silhouette2 = SilhouetteVisualizer(KMeans(3, random_state=42), colors='yellowbrick', ax=axes[1])

visualizer_silhouette1.fit(clean_df)        # Fit the data to the visualizer
visualizer_silhouette1.finalize()           # Add title, axis labels and legend
visualizer_silhouette2.fit(clean_df)
visualizer_silhouette2.finalize()
plt.show()

Interpretasi nilai *Silhouette Coefficient* (SC):

0.7 < SC <= 1 Strong Structure

0.5 < SC <= 0.7 Medium Structure

0.25 < SC <= 0.5 Weak Structure

SC <= 0.25 No Structure

In [None]:
# Modeling: fit a seeded 3-cluster KMeans on clean_df and keep the per-row labels.
k_means = KMeans(n_clusters=3, init="k-means++", random_state=42).fit(clean_df)
labels = k_means.labels_
print(labels)

In [None]:
# Store each row's cluster label in the dataframe.
# NOTE(review): this mutates clean_df in place, so re-running an earlier cell
# that fits on clean_df afterwards would include the Cluster column.

clean_df["Cluster"] = labels
clean_df

In [None]:
# Check the centroid values: per-cluster mean of every column.

clean_df.groupby('Cluster').mean()

In [None]:
# visualisasi persebaran data setiap cluster

import seaborn as sns
# Scatter of Score against Social, coloured by cluster (no regression line).
facet = sns.lmplot(
    x='Social',
    y='Score',
    hue='Cluster',
    data=clean_df,
    fit_reg=False,
    legend=True,
    legend_out=True,
)

In [None]:
# viusalisasi 3D plot

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np


# viusalisasi 3D plot

# 3D scatter of Social / Economy / Score, one colour per cluster.

# unique classes/groups in the data
number_of_classes = np.unique(labels).shape[0]

# the desired legends
legends = ['Cluster 0', 'Cluster 1', 'Cluster 2']

# colors for the groups
colors = ["b","g","r"]

fig1 = plt.figure()
# Create the 3D axes through the figure. Constructing Axes3D(fig1) directly no
# longer attaches the axes to the figure on current Matplotlib releases, which
# results in an empty plot.
ax = fig1.add_subplot(projection='3d')
ax.set_xlabel('Social')
ax.set_ylabel('Economy')
ax.set_zlabel('Score')
for i in range(number_of_classes):
    in_cluster = labels == i
    # Select columns by name so the plotted data always matches the axis
    # labels, regardless of the column order in clean_df.
    ax.scatter(
        clean_df.loc[in_cluster, 'Social'],
        clean_df.loc[in_cluster, 'Economy'],
        clean_df.loc[in_cluster, 'Score'],
        c=colors[i], s=50, label=legends[i],
    )

plt.legend()
plt.show()

In [None]:
# Copy the cluster labels onto the full frame (aligned on the row index) so
# they sit next to Country and Region.
df['Cluster'] = clean_df['Cluster']
df.head()

### Classification

Hapus kolom selain Country

In [None]:
# Drop every indicator column listed below; the class label is added to this frame later.
non_class_columns = [
    'Rank', 'Score', 'Economy', 'Social', 'Life',
    'Freedom', 'Generosity', 'Trust', 'Region', 'Cluster',
]
cdf = df.drop(columns=non_class_columns)

Dari proses clustering sebelumnya, diketahui bahwa kebanyakan negara pada tiap cluster memiliki karakteristik :

*   Cluster 0 : Indeks Kebahagiaan Tinggi (HIGH)
*   Cluster 1 : Indeks Kebahagiaan Rendah (LOW)
*   Cluster 2 : Indeks Kebahagiaan Cukup (FAIR)

Maka, ubah tipe kluster menjadi nama kelas Indeks Kebahagiaan Negara

In [None]:
def label_class(row):
    """Translate a row's 'Cluster' value into a happiness class name.

    Cluster 0 -> 'High', 1 -> 'Low', 2 -> 'Fair'; any other value -> 'Other'.
    """
    class_by_cluster = {1: 'Low', 0: 'High', 2: 'Fair'}
    return class_by_cluster.get(row['Cluster'], 'Other')

# Preview the class name computed for every row.
df.apply(label_class, axis=1)

Buat kolom baru untuk tipe Kelas 

In [None]:
# Add the class name of every row as a new 'Class' column.
cdf['Class'] = df.apply(label_class, axis=1)
cdf

In [None]:
# Countries whose happiness index class is HIGH.
is_high = cdf['Class'] == 'High'
high_df = cdf[is_high]
high_df

In [None]:
# Countries whose happiness index class is FAIR.
is_fair = cdf['Class'] == 'Fair'
fair_df = cdf[is_fair]
fair_df

In [None]:
# Countries whose happiness index class is LOW.
is_low = cdf['Class'] == 'Low'
low_df = cdf[is_low]
low_df

JAWABAN :

In [None]:
# Answer (1): countries with a HIGH happiness index.
high_list = high_df['Country'].to_list()
high_list

In [None]:
# Answer (2): countries with a FAIR happiness index.
fair_list = fair_df['Country'].to_list()
fair_list

In [None]:
# Answer (3): countries with a LOW happiness index.
low_list = low_df['Country'].to_list()
low_list

In [None]:
# Count the number of countries in each class (non-null entries per column, grouped by Class).
data_concat=cdf.groupby('Class').count()
data_concat

In [None]:
# Pie chart: share of countries in each happiness class.
pie_df = pd.pivot_table(data_concat, index='Class', values="Country")
pie_df.dropna(inplace=True)

# Dedicated names are used for the chart labels and colours so this cell does
# not overwrite the cluster `labels` array and the `colors` list that the
# clustering cells above rely on when they are re-run.
pie_labels = pie_df.index
pie_colors = ['yellow','green','red']
# plt.pie requires a 1-D sequence; pie_df.values has shape (n, 1), which
# current Matplotlib rejects with "x must be 1D".
sizes = pie_df["Country"].to_numpy()
# One (zero) offset per wedge, whatever the number of classes present.
explode = [0] * len(sizes)

# visual
plt.figure(figsize = (7,7))
plt.pie(sizes, explode=explode, labels=pie_labels, colors=pie_colors, autopct='%1.1f%%')
plt.title('Persentase indeks kebahagian Negara-Negara di dunia',fontsize = 20)
plt.show()

### Regression

**Skor Kebahagiaan - Ekonomi**

Pertama, mari kita amati hubungan antara Ekonomi dan skor kebahagiaan dengan bantuan grafik.
*   variabel independen (x) : `Economy`
*   variabel dependen (y) : `Score`




In [None]:
# First rows of the full frame, which now includes the Cluster column.
df.head()

In [None]:
# Keep only the float64/int64 columns of df, as a separate frame.
numeric_dtypes = ["float64", "int64"]
data = df.copy().select_dtypes(include=numeric_dtypes)
data.head()

In [None]:
# Frame used for the regression plots: df without Region, Rank and Cluster.
df_table = df.drop(columns=['Region', 'Rank', 'Cluster'])
df_table.head()

In [None]:
# Joint distribution of Economy and Score with a fitted regression line.
sns.jointplot(data=df_table, x="Economy", y="Score", kind="reg")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Independent variable, kept as a one-column DataFrame.
X = df_table[["Economy"]]
# head must be called: the bare attribute `X.head` only shows the
# bound-method repr instead of the first rows.
X.head()

In [None]:
# Dependent variable, kept as a one-column DataFrame.
y = df_table[["Score"]]
# head must be called: the bare attribute `y.head` only shows the
# bound-method repr instead of the first rows.
y.head()

In [None]:
# Fit Score ~ Economy and report the intercept, slope and R^2 on the training data.
reg = LinearRegression()
model = reg.fit(X, y)
fit_summary = (
    ("intercept: ", model.intercept_),
    ("coef: ", model.coef_),
    ("rscore. ", model.score(X, y)),
)
for caption, value in fit_summary:
    print(caption, value)

In [None]:
# Prediksi suatu nilai sesuai inputan

def linear_reg(col,text,prdctn):
    """Plot and fit a simple linear regression of Score on one column, then predict.

    Parameters
    ----------
    col : str
        Name of the explanatory column in ``df_table``.
    text : str
        Caption printed in front of the prediction.
    prdctn : float
        Value of ``col`` for which the Score is predicted.

    Prints ``text`` followed by the predicted Score; returns nothing.
    """
    sns.jointplot(x=col,y="Score", data=df_table, kind="reg")
    plt.show()

    # Fit on the same frame that is plotted, so the model and the figures can
    # never be based on different data.
    X = df_table[[col]]
    y = df_table[["Score"]]
    model = LinearRegression().fit(X,y)

    # prediction
    plt.figure(figsize=(12,6))
    g = sns.regplot(x=df_table[col],y=df_table["Score"],ci=None,scatter_kws = {'color':'r','s':9})
    g.set_title("Model Equation")
    g.set_ylabel("Score")
    g.set_xlabel(col)
    plt.show()

    # The model was fitted on a DataFrame, so predict from a one-row frame
    # with the same column name; a bare nested list triggers scikit-learn's
    # "X does not have valid feature names" warning.
    query = pd.DataFrame({col: [prdctn]})
    print(text,": ", model.predict(query))

In [None]:
# Find the row with the highest Economy value.
df_table.nlargest(1,'Economy') 

In [None]:
# Predict the happiness score at the highest Economy value in the data.
# The value is read from df_table rather than typed in by hand from the
# previous cell's output, so it stays correct if the dataset changes.
linear_reg("Economy","Skor kebehagiaan Negera dengan Ekonomi tertinggi adalah",df_table["Economy"].max())

In [None]:
# Find the row with the lowest Economy value.
df_table.nsmallest(1,'Economy') 

In [None]:
# Predict the happiness score at the lowest Economy value in the data, read
# from df_table rather than hard-coded.
linear_reg("Economy","Skor kebehagiaan Negera dengan Ekonomi terendah adalah",df_table["Economy"].min())