In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Allow up to 100 columns to be shown when a frame is displayed.
pd.options.display.max_columns = 100

# Load the 2019 CSV using the "latin" codec.
encoding_latin = "latin"
df = pd.read_csv(
    '/kaggle/input/world-happiness/2019.csv',
    encoding=encoding_latin,
    low_memory=False,
)
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Share of missing values per column, as a percentage rounded to two decimals.
df_happy = (df.isnull().sum() / len(df) * 100).round(2)
df_happy

In [None]:
top5_counrties_2019 = df.sort_values(by='Healthy life expectancy', ascending=False).head(5)
top5_counrties_2019

In [None]:
top5_counrties_Social_2019 = df.sort_values(by='Social support', ascending=False).head(5)
top5_counrties_Social_2019

In [None]:
# Remove the rank and country-name columns before plotting and modelling.
# `df` is rebound to the result, so without errors='ignore' a second run of
# this cell would raise KeyError (the columns are already gone). Ignoring
# missing labels keeps the cell safe to re-execute.
df = df.drop(columns=['Overall rank', 'Country or region'], errors='ignore')
df.head()

In [None]:
# Grid of pairwise plots for every column still in df.
sns.pairplot(df)

In [None]:
# Pairwise correlation matrix, shaded with the 'coolwarm' colour map for display.
corr=df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Work on a separate copy so the modelling steps do not modify `df`,
# which is used again for the final fit on the full data.
df_2019 = df.copy()
df_2019.head()

Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split; random_state fixes the shuffle so the split is reproducible.
df_2019_train, df_2019_test = train_test_split(
    df_2019, train_size=0.7, test_size=0.3, random_state=100
)

# Both parts are modified in place later (column-wise scaling and
# .pop('Score')). Taking explicit copies avoids pandas'
# SettingWithCopyWarning on those assignments and makes it unambiguous
# that df_2019 itself is never altered.
df_2019_train = df_2019_train.copy()
df_2019_test = df_2019_test.copy()

Rescaling the Features

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
num_vars = ['Score', 'GDP per capita', 'Social support']
df_2019_train[num_vars] = scaler.fit_transform(df_2019_train[num_vars])
df_2019_train.head()

In [None]:
# .pop removes 'Score' from df_2019_train in place and returns it as the target.
# Not re-runnable: a second execution fails because 'Score' is already gone.
y_train = df_2019_train.pop('Score')
# X_train is the same object as df_2019_train (no copy), now without 'Score'.
X_train = df_2019_train

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
lm = LinearRegression()
lm.fit(X_train, y_train)

# Recursive feature elimination.
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally, so RFE(lm, 10) raises TypeError.
# The request is capped at the number of available columns. When X_train has
# 10 or fewer predictors every column is kept, which matches what asking for
# 10 produced before.
rfe = RFE(estimator=lm, n_features_to_select=min(10, X_train.shape[1]))
rfe = rfe.fit(X_train, y_train)

In [None]:
# (column name, selected flag, RFE rank) for every predictor.
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Columns that RFE kept.
col = X_train.columns[rfe.support_]
col

In [None]:
# Columns that RFE eliminated.
X_train.columns[~rfe.support_]

In [None]:
# Creating X_train dataframe with RFE selected variables
X_train_rfe = X_train[col]

In [None]:
# Adding a constant variable: the design matrix gets an explicit 'const'
# column (the same column that is dropped again further down).
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
# Fit ordinary least squares on the RFE-selected columns plus the constant.
# `lm` is rebound here from the scikit-learn estimator to the statsmodels result.
lm = sm.OLS(y_train,X_train_rfe).fit() 

In [None]:
# Print the regression summary table.
print(lm.summary())

Generosity is statistically insignificant in the presence of the other variables, so it is dropped from the model.

In [None]:
# Remove 'Generosity'. X_train_new still carries the 'const' column that was
# added to X_train_rfe.
X_train_new = X_train_rfe.drop(["Generosity"], axis = 1)

In [None]:
import statsmodels.api as sm  
# NOTE(review): X_train_new already contains 'const' at this point — confirm
# that add_constant leaves it as-is rather than adding a second constant.
X_train_lm = sm.add_constant(X_train_new)

In [None]:
# Refit OLS on the reduced design matrix; `lm` now refers to this model.
lm = sm.OLS(y_train,X_train_lm).fit()

In [None]:
# Print the summary of the refitted model.
print(lm.summary())

In [None]:
# Columns currently in X_train_new (still includes 'const').
X_train_new.columns

In [None]:
# Drop the intercept column so X_train_new holds only predictors; it is used
# for the VIF table and to select the matching test columns.
# Not re-runnable: a second execution raises KeyError because 'const' is gone.
X_train_new = X_train_new.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of each predictor in the reduced model.
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X_train_new
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
# Two decimals are enough for reading the table; show the largest VIF first.
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Residual Analysis of the train data

In [None]:
# In-sample predictions of the refitted OLS model, used for the residual plot.
y_train_score = lm.predict(X_train_lm)

In [None]:
# Importing the required libraries for plots.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Plot the histogram of the error terms (actual minus fitted values).
# sns.distplot is deprecated in seaborn; histplot on a density scale with a
# KDE overlay is its replacement and draws the same kind of figure.
fig = plt.figure()
sns.histplot(
    y_train - y_train_score,
    bins=20,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),  # extend the KDE past the data range, as distplot did
)
fig.suptitle('Error Terms', fontsize = 20)
plt.xlabel('Errors', fontsize = 18)

Making Predictions

Applying the scaling on the test sets

In [None]:
# Same columns that were rescaled on the training split.
num_vars = ['Score', 'GDP per capita', 'Social support']

# transform only: the scaler is not refitted here, so the test rows are
# rescaled with whatever the scaler already learned.
df_2019_test[num_vars] = scaler.transform(df_2019_test[num_vars])
df_2019_test.head()[num_vars]

Dividing into X_test and y_test

In [None]:
# .pop removes 'Score' from df_2019_test in place and returns it as the target.
y_test = df_2019_test.pop('Score')
# X_test is the same object as df_2019_test (no copy), now without 'Score'.
X_test = df_2019_test

In [None]:
# Now let's use our model to make predictions.

# Creating X_test_new dataframe by dropping variables from X_test
X_test_new = X_test[X_train_new.columns]

# Adding a constant variable 
X_test_new = sm.add_constant(X_test_new)

In [None]:
# Making predictions
y_pred = lm.predict(X_test_new)
y_pred.head()

Model Evaluation

In [None]:
# Scatter of actual (y_test) against predicted (y_pred) values to see the spread.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_pred', fontsize=16)

In [None]:
df.columns

In [None]:
from sklearn import linear_model
X = df[['GDP per capita', 'Social support', 'Healthy life expectancy','Freedom to make life choices', 'Generosity','Perceptions of corruption']]
y = df['Score']

lm = linear_model.LinearRegression()
model = lm.fit(X,y)
lm.coef_

In [None]:
lm.intercept_