In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is my first project on Kaggle. I want to practice some useful techniques that I've learned from the first two chapters of "Hands-On Machine Learning" by Aurélien Géron.

First I compare columns in train and test data.

In [None]:
# Load the file that serves as training data and preview its first rows.
train_csv_path = "/kaggle/input/world-happiness-report-2021/world-happiness-report.csv"
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Load the 2021 report, which serves as test data, and preview its first rows.
test_csv_path = "/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv"
test = pd.read_csv(test_csv_path)
test.head()

I drop some columns to make my train data and test data consistent. Also I don't think that country names and years can provide us with useful information.

In [None]:
# Target ('Ladder score') followed by the predictors kept from the 2021 file.
test_cols = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
data_test = test.loc[:, test_cols]
data_test.head()

In [None]:
# Remove the identifier columns and the two affect columns that are not
# selected from the test data.
unused_columns = ["Country name", "year", "Positive affect", "Negative affect"]
data = data.drop(columns=unused_columns)
data.head()

In [None]:
# Summary statistics of the remaining training columns.
data.describe()

In [None]:
# Dtypes and non-null counts per column, to see which attributes have gaps.
data.info()

In [None]:
# Plot the boolean "is missing" mask of the training data; row labels are hidden.
sns.heatmap(data.isnull(), yticklabels=False)

As we can see, almost all attributes are missing some values, although not many. We can keep all attributes and fill the gaps with the mean or median value calculated for each attribute.

In [None]:
# Switch seaborn to the 'darkgrid' style, then draw one histogram per column.
sns.set_style('darkgrid')
data.hist(figsize=(15, 12))

In [None]:
# Annotated correlation matrix of the training columns; the colour scale is
# pinned to the full correlation range [-1, 1].
sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1)

In [None]:
# Pairwise relationship plots for all training columns.
sns.pairplot(data)

I've noticed that some features are strongly related not only to our target variable but also to other features. For example, "Healthy life expectancy at birth" is strongly correlated with "Life Ladder", but it is even more strongly correlated with "Log GDP per capita". I wonder whether such relationships between features reduce the performance of our prediction model. To test this idea I will create an alternative dataset, data_red, that leaves out two of these correlated features.

In [None]:
# Reduced variants of the train and test frames without the two correlated
# predictors (the life-expectancy column is named differently in each file).
data_red = data.drop(columns=["Social support", "Healthy life expectancy at birth"])
data_test_red = data_test.drop(columns=['Social support', 'Healthy life expectancy'])

In [None]:
# Correlation matrix of the reduced training data.  Both ends of the colour
# scale are pinned to [-1, 1], matching the heatmap of the full data, so the
# colours of the two plots can be compared directly.
sns.heatmap(data_red.corr(), annot=True, vmin=-1, vmax=1)

Now it's time to prepare our datasets for machine learning algorithms.
First of all, I'll separate the predictors from the responses.

In [None]:
# Split the target column off from the predictors; both the full and the
# reduced feature sets share the same labels.
target_column = "Life Ladder"
data_labels = data[target_column].copy()
data_predictors = data.drop(columns=target_column)
data_red_predictors = data_red.drop(columns=target_column)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

I create a pipeline that fills missing values with the median and standardizes the variables in one step. The same fitted pipeline can then be applied to the test data.

In [None]:
# Preprocessing for the full feature set: median imputation, then standardization.
full_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
pipeline = Pipeline(steps=full_steps)

In [None]:
# Preprocessing for the reduced feature set: the same two stages, kept as a
# separate pipeline object so it is fitted independently.
reduced_steps = [
    ('imputer_red', SimpleImputer(strategy="median")),
    ('std_scaler_red', StandardScaler()),
]
pipeline_red = Pipeline(steps=reduced_steps)

In [None]:
# Fit the imputer and scaler on the training predictors and transform them in one call.
data_train = pipeline.fit_transform(data_predictors)

In [None]:
# Same fit-and-transform step for the reduced predictors, using their own pipeline.
data_red_train = pipeline_red.fit_transform(data_red_predictors)

Now our train data is properly scaled and can be used to train the model. I choose linear regression to start with something simple.

In [None]:
# Inspect the preprocessed (imputed and standardized) full training features.
data_train

In [None]:
# Inspect the preprocessed (imputed and standardized) reduced training features.
data_red_train

I use two separate models. One for data and another one for data_red. After that I'm going to compare their performance in order to find out whether reduction was necessary or not.

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression on the full preprocessed feature set; fit() returns the
# estimator itself, which is displayed as the cell output.
lin = LinearRegression().fit(data_train, data_labels)
lin

In [None]:
# Linear regression on the reduced preprocessed feature set (same labels);
# the fitted estimator is displayed as the cell output.
lin_red = LinearRegression().fit(data_red_train, data_labels)
lin_red

In [None]:
# The 2021 file names two of the shared predictors differently from the
# training file.  The pipeline was fitted on a DataFrame, and recent
# scikit-learn versions reject a DataFrame whose column names differ from the
# ones seen during fit, so align the names before transforming.
test_to_train_columns = {
    "Logged GDP per capita": "Log GDP per capita",
    "Healthy life expectancy": "Healthy life expectancy at birth",
}
some_data = (
    data_test
    .drop("Ladder score", axis=1)
    .rename(columns=test_to_train_columns)
)
some_labels = data_test["Ladder score"]
# Reuse the imputer/scaler statistics learned on the training data (no refit).
some_data_prepared = pipeline.transform(some_data)
predictions = lin.predict(some_data_prepared)

In [None]:
# The reduced 2021 frame still carries the 2021 spelling of the GDP column.
# The reduced pipeline was fitted on a DataFrame, and recent scikit-learn
# versions reject a DataFrame whose column names differ from the ones seen
# during fit, so align the name before transforming.
some_data_red = (
    data_test_red
    .drop("Ladder score", axis=1)
    .rename(columns={"Logged GDP per capita": "Log GDP per capita"})
)
some_labels_red = data_test_red["Ladder score"]
# Reuse the imputer/scaler statistics learned on the reduced training data (no refit).
some_data_prepRed = pipeline_red.transform(some_data_red)
predictions_red = lin_red.predict(some_data_prepRed)

According to RMSE (root mean squared error) the reduced model is slightly worse than the "full" one.

In [None]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the full model: mean squared error first, then its square root.
lin_mse = mean_squared_error(y_true=some_labels, y_pred=predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Test-set RMSE of the reduced model: mean squared error first, then its square root.
lin_red_mse = mean_squared_error(y_true=some_labels_red, y_pred=predictions_red)
lin_red_rmse = np.sqrt(lin_red_mse)
lin_red_rmse

After computing the RMSE, I also applied 10-fold cross-validation to the 2021 data, and according to this method the reduced model shows worse results as well.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation for the full feature set.  The scorer returns
# negated MSE values, so flip the sign before taking the square root.
# NOTE(review): the folds are drawn from the prepared 2021 test rows, so each
# fold refits a fresh copy of `lin` on test data instead of evaluating the
# model trained above -- confirm this is intended.
scores = cross_val_score(
    estimator=lin,
    X=some_data_prepared,
    y=some_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(scores))

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    Nothing is returned; the three lines are written to standard output.
    """
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed right before its line is printed.
    for label, compute in report:
        print(label, compute(scores))

# Report the per-fold RMSE values of the full model with their mean and standard deviation.
display_scores(lin_rmse_scores)

In [None]:
# 10-fold cross-validation for the reduced feature set.  The reduced features
# are paired with the labels taken from the same reduced frame
# (some_labels_red) rather than with a variable owned by the full-model cell.
# NOTE(review): the folds are drawn from the prepared 2021 test rows, so each
# fold refits a fresh copy of `lin_red` on test data -- confirm this is intended.
scores_red = cross_val_score(lin_red, some_data_prepRed, some_labels_red,
                             scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE values; flip the sign before the square root.
linRed_rmse_scores = np.sqrt(-scores_red)
display_scores(linRed_rmse_scores)

Even though both models are far from perfect at predicting happiness, the most straightforward model is slightly better and could have been used directly without any reduction. That is an interesting result for me, because I had thought that strong relationships between predictors are generally bad for model performance.