In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from numpy import mean, std

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
import os

# Show what is available under the relative input directory so the dataset
# paths used further down can be checked by eye.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Notebook introduction, emitted as plain text output: one sentence per line,
# in the order listed here.
intro_lines = (
    'Given two data sets, both including information about countries quality of living and their happiness score.',
    'One data set contains data from the early 2000s up until 2020, and the other includes only data from 2021.',
    'The data set was taken from Kaggle.com with little information on how the Happiness score was calculated.',
    'Both data sets share many of the same columns but there are many differences as well.',
    'Below I created two models for predicting the Happiness score of 2021.',
    'This first model uses the most recent years data to predict the 2021 score.',
    'The second model uses the 2021 data to predict the 2021 score.',
)
for line in intro_lines:
    print(line)

In [None]:
# Multi-year report; the notebook's introduction describes it as covering the
# early 2000s up until 2020.
df = pd.read_csv(
    '/kaggle/input/world-happiness-report-2021/world-happiness-report.csv'
)

# Report that, per the introduction, holds only the 2021 data.
df2021 = pd.read_csv(
    '/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv'
)

In [None]:
# Structural sanity check of both frames: dimensions first, then column
# labels, for the multi-year frame and then for the 2021 frame.
for frame in (df, df2021):
    print(frame.shape)
    print(frame.columns)

In [None]:
# Reduce the multi-year frame to ONE real row per country: the row of that
# country's latest survey year.
#
# Why not groupby().max(): max() is evaluated column by column, so it would
# combine the highest value each feature ever reached (possibly from different
# years) into a synthetic row that never existed, instead of returning the
# most recent observation.
#
# NOTE(review): assumes the survey-year column is literally named 'year' --
# confirm against the df.columns output printed earlier.
grouped = df.groupby('Country name')
most_recent_year = df.loc[grouped['year'].idxmax()]

# dropna() and reset_index() return new frames, so the result has to be
# assigned for the clean-up to take effect (a bare expression is discarded).
# Countries whose latest row has any missing value are removed here.
# 'Country name' stays a regular column so the later merge can join on it.
df = most_recent_year.dropna().reset_index(drop=True)

In [None]:
# Model 1 data: use the earlier report's per-country row to predict the 2021
# happiness score.
#
# Attach each country's 2021 'Ladder score' to its row from the earlier
# report. pd.merge performs an inner join by default, so only countries that
# appear in both frames are kept.
merged = pd.merge(df, df2021[['Country name', 'Ladder score']], on='Country name')

# Positional selection of the modelling table: columns 2..11 of the merged
# frame, ending with the 2021 'Ladder score' target.
# NOTE(review): presumably this skips the country/year identifier columns --
# verify against merged.columns, since the slice depends on column order.
#
# fillna() is chained (not inplace) so that useful_df is an independent frame;
# filling a slice of `merged` in place triggers SettingWithCopyWarning.
# Remaining missing values are replaced with 0.
useful_df = merged.iloc[:, 2:12].fillna(0)

# Visual check of how closely the earlier score tracks the 2021 score.
plt.scatter(useful_df['Life Ladder'], useful_df['Ladder score'])
plt.title('2021 Scores Compared to Most Recent Year Score')
plt.xlabel('Most Recent Score')
plt.ylabel('2021 Score')
plt.show()

In [None]:
# Standardize every column of the modelling table (features and happiness
# score alike) and wrap the scaled array back into a DataFrame that carries
# the original column labels.
scaler = StandardScaler()
standardized_df = pd.DataFrame(
    scaler.fit_transform(useful_df),
    columns=useful_df.columns,
)

# Annotated heatmap of the pairwise correlations between all columns.
corrMatrix = standardized_df.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Model 1: linear regression on the standardized data from the earlier report,
# predicting the standardized 2021 'Ladder score'.
x = standardized_df.iloc[:, 0:9]  # first nine columns are the predictors
y = standardized_df['Ladder score']

# 80/20 hold-out split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=10)
futuremodel = LinearRegression()
futuremodel.fit(x_train, y_train)
y_predict = futuremodel.predict(x_test)

print("Train score (R-squared):")
print(futuremodel.score(x_train, y_train))

print('')

print("Test score (R-squared):")
print(futuremodel.score(x_test, y_test))

# Compare the mean cross-validated R-squared on the training set for every
# fold count from 2 to 12.
print('')

print('Using 2 through 12 KFold cross validations to improve model:')
print('')
fold_counts = list(range(2, 13))
scores = []
for k in fold_counts:
    KFolds = KFold(n_splits=k, random_state=1, shuffle=True)
    score = cross_val_score(futuremodel, x_train, y_train, scoring='r2', cv=KFolds)
    scores.append(mean(score))
folds = pd.DataFrame()
folds['K-Folds'] = fold_counts
folds['scores'] = scores
print(folds)
print('')
# Report the best fold count from the scores computed above instead of a
# hard-coded number, so the message stays correct when the data changes.
best_position = scores.index(max(scores))
print('The optimal K-Folds is %d with an R-Squared score of %2.4f' % (fold_counts[best_position], scores[best_position]))
print('')
print('Average R-Squared Score: %2.4f' % (mean(scores)))
print('Maximum R-Squared Score: %2.4f' % (max(scores)))
print('Minimum R-Squared Score: %2.4f' % (min(scores)))
# Plot against the actual fold counts; plotting `scores` alone would label the
# x-axis with list positions 0..10 rather than K = 2..12.
plt.plot(fold_counts, scores)
plt.xlabel('K-Folds')
plt.ylabel('Mean R-Squared')
plt.show()

In [None]:
# Residuals of the hold-out predictions, using the conventional
# observed-minus-predicted sign: a positive residual means the model
# under-predicted that country's score.
residuals = y_test - y_predict

plt.scatter(y_predict, residuals, alpha=0.4)
plt.title('Residual Analysis')
plt.xlabel('Predicted score (standardized)')
plt.ylabel('Residual (observed - predicted)')
plt.show()

# Fitted coefficients, in the same order as the predictor columns used to fit
# the model.
print('Coefficient List : ')
print(futuremodel.coef_)

In [None]:
# Model 2 data: the happiness score plus the six explanatory columns, all
# taken from the 2021 frame.
#
# fillna() is chained (not inplace) so useful_df2021 is an independent frame;
# filling a column-subset selection in place triggers SettingWithCopyWarning.
# Missing values are replaced with 0.
useful_df2021 = df2021[[
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]].fillna(0)

# Standardize every column (features and happiness score alike) and restore
# the column labels on the scaled array.
scaler = StandardScaler()
standardized_df2021 = scaler.fit_transform(useful_df2021)
standardized_df2021 = pd.DataFrame(standardized_df2021, columns=useful_df2021.columns)

# Annotated heatmap of the pairwise correlations between all columns.
corrMatrix = standardized_df2021.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Model 2: linear regression on the standardized 2021 data, predicting the
# standardized 2021 'Ladder score' from the 2021 features.
x = standardized_df2021.iloc[:, 1:7]  # columns 1..6: the six predictors
y = standardized_df2021['Ladder score']

# 80/20 hold-out split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=10)
presentmodel = LinearRegression()
presentmodel.fit(x_train, y_train)
y_predict = presentmodel.predict(x_test)

print("Train score (R-squared):")
print(presentmodel.score(x_train, y_train))

print('')

print("Test score (R-squared):")
print(presentmodel.score(x_test, y_test))

# Compare the mean cross-validated R-squared on the training set for every
# fold count from 2 to 12.
print('')

print('Using 2 through 12 KFold cross validations to improve model:')

print('')

fold_counts = list(range(2, 13))
scores = []
for k in fold_counts:
    KFolds = KFold(n_splits=k, random_state=1, shuffle=True)
    score = cross_val_score(presentmodel, x_train, y_train, scoring='r2', cv=KFolds)
    scores.append(mean(score))
folds = pd.DataFrame()
folds['K-Folds'] = fold_counts
folds['scores'] = scores
print(folds)

print('')

# Report the best fold count from the scores computed above instead of a
# hard-coded number, so the message stays correct when the data changes.
best_position = scores.index(max(scores))
print('The optimal K-Folds is %d with an R-Squared score of %2.4f' % (fold_counts[best_position], scores[best_position]))

print('')

print('Average R-Squared Score: %2.4f' % (mean(scores)))
print('Maximum R-Squared Score: %2.4f' % (max(scores)))
print('Minimum R-Squared Score: %2.4f' % (min(scores)))
# Plot against the actual fold counts; plotting `scores` alone would label the
# x-axis with list positions 0..10 rather than K = 2..12.
plt.plot(fold_counts, scores)
plt.xlabel('K-Folds')
plt.ylabel('Mean R-Squared')
plt.show()

In [None]:
# Residuals of the hold-out predictions, using the conventional
# observed-minus-predicted sign: a positive residual means the model
# under-predicted that country's score.
residuals = y_test - y_predict

plt.scatter(y_predict, residuals, alpha=0.4)
plt.title('Residual Analysis')
plt.xlabel('Predicted score (standardized)')
plt.ylabel('Residual (observed - predicted)')
plt.show()

# Fitted coefficients, in the same order as the predictor columns used to fit
# the model.
print('Coefficient List : ')
print(presentmodel.coef_)

In [None]:
# Closing remarks, emitted as plain text output: one sentence per line.
# NOTE(review): these sentences are fixed text, not derived from the scores
# printed above -- re-check them whenever the data or preprocessing changes.
conclusion_lines = (
    'Surprisingly, the data from the most previous year seems to be slightly better at predicting the 2021 Happiness scores than the data from 2021.',
    'This could be attributed to the additional features that are included in the data set prior to 2021.',
    'These additional features include, the previous years score, and the positive and negative affects.',
    'The forward-predictive model could have also scored better because it is possible that the happiness rating was determined with this previous data, or even through forecasting.',
)
for line in conclusion_lines:
    print(line)