# Upload Data, Set the Environment

In [None]:
import os
# Print the full path of every file below the Kaggle input directory,
# then the directory's top-level entries.
input_root = '/kaggle/input'
all_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(input_root)
    for name in names
)
for path in all_paths:
    print(path)

print(os.listdir(input_root))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
# Read the two raw CSV sources into DataFrames.
life_sat = pd.read_csv(
    "/kaggle/input/life-satisfaction-dataset/oecd_bli_2015.csv",
    thousands=',',
)
gdp_per_capita = pd.read_csv(
    "/kaggle/input/life-satisfaction-dataset/gdp_per_capita.csv",
    sep='\t',             # the file is tab-separated
    thousands=',',
    encoding='latin1',
    na_values="n/a",      # treat the literal "n/a" as a missing value
    index_col="Country",  # use the country name as the row index
)

# Clean the Data

Next, I cleaned the data using the code provided on GitHub.

In [None]:
# Keep only the rows whose INEQUALITY code is "TOT".
life_sat_total = life_sat.loc[life_sat["INEQUALITY"].eq("TOT")]

In [None]:
# Reshape to one row per country and one column per indicator.
# The frame is rebuilt from the raw `life_sat` data instead of from
# `life_sat_total` itself, so this cell can be re-run safely: pivoting an
# already-pivoted frame would fail because the "Indicator" and "Value"
# columns no longer exist.
life_sat_total = (
    life_sat[life_sat["INEQUALITY"] == "TOT"]
    .pivot(index="Country", columns="Indicator", values="Value")
)

In [None]:
# Preview the first five rows of the pivoted life-satisfaction table.
life_sat_total.head(n=5)

Then I cleaned the GDP per capita data so that it can be merged with the life satisfaction data.

In [None]:
# Give the "2015" column a descriptive name.
gdp_per_capita = gdp_per_capita.rename(columns={"2015": "GDP per capita"})

In [None]:
# Preview the first five rows of the GDP table.
gdp_per_capita.head(n=5)

# Merge 2 sources

I merged the two data sources, GDP per capita and life satisfaction, on their shared "Country" attribute so that their values can be correlated and compared.

In [None]:
# Inner-join the GDP and life-satisfaction tables on "Country" and sort the
# result by that key. Only the non-default options are spelled out; the
# keyword arguments that were removed merely restated pandas' defaults.
country_stats = pd.merge(
    gdp_per_capita,
    life_sat_total,
    how='inner',
    on="Country",
    sort=True,
)


In [None]:
# Keep just the two columns used for the regression.
data = country_stats.loc[:, ["GDP per capita", "Life satisfaction"]]

In [None]:
# Preview the first five rows of the two-column modelling table.
data.head(n=5)

# Visualize the Data

Now I can convert the data to NumPy arrays for numerical computation and visualize it.

In [None]:
# Selecting with a one-element list keeps each column two-dimensional, so
# both arrays come out as single-column 2-D arrays (one row per country).
X = data[["GDP per capita"]].to_numpy()
y = data[["Life satisfaction"]].to_numpy()

In [None]:
# Scatter plot of life satisfaction against GDP per capita for every country.
# The title lets the figure stand on its own when the notebook is skimmed.
data.plot(
    kind='scatter',
    x="GDP per capita",
    y='Life satisfaction',
    title="Life satisfaction vs. GDP per capita (all countries)",
)
plt.show()

I fitted a linear regression model to this data, even though it contains outliers that can affect the model's performance.

In [None]:
# Create an ordinary linear regression estimator and fit it on X, y.
# The fit call is kept as the last expression so the cell still displays
# the fitted estimator.
lin_reg_model = linear_model.LinearRegression()
lin_reg_model.fit(X, y)

In [None]:
# Display the fitted coefficient(s) of the linear model.
lin_reg_model.coef_

In [None]:
# Predict the life satisfaction score for a single new observation.
X_new = [[22587]]  # Cyprus' GDP per capita
cyprus_prediction = lin_reg_model.predict(X_new)
print(cyprus_prediction)

I used Cyprus' GDP per capita as input to the model to predict its life satisfaction score. According to the model, the expected life satisfaction score of Cyprus is 6.28. Next, I remove the outliers from the data to see whether the model's prediction changes.

In [None]:
# Boolean mask over `data`: True where GDP per capita is below 70000.
# Note that True marks the rows that are KEPT, i.e. the non-outliers.
filtered_outliers = data["GDP per capita"].lt(70000)

In [None]:
# Inspect the boolean mask: True marks rows with GDP per capita below 70000.
filtered_outliers

In [None]:
# Keep only the rows for which the mask is True.
new_df = data.loc[filtered_outliers]

In [None]:
# Display the rows that remain after applying the GDP-per-capita mask.
new_df

# Outliers are Removed

I excluded the countries with a GDP per capita of 70,000 or more to get a better model. As a result, Norway, Switzerland and Luxembourg are removed from the data. Now I can fit my model again to see whether anything changes.

In [None]:
# Scatter plot of the filtered data (rows with GDP per capita below 70000).
# The title distinguishes this figure from the plot of the unfiltered data.
new_df.plot(
    kind='scatter',
    x="GDP per capita",
    y='Life satisfaction',
    title="Life satisfaction vs. GDP per capita (GDP per capita < 70,000)",
)
plt.show()

In [None]:
# Rebuild the single-column feature/target arrays from the filtered frame
# and fit a fresh linear regression on them.
X = new_df[["GDP per capita"]].to_numpy()
y = new_df[["Life satisfaction"]].to_numpy()
lin_reg_model = linear_model.LinearRegression()
lin_reg_model.fit(X, y)

In [None]:
# Display the coefficient(s) of the model refitted on the filtered data.
lin_reg_model.coef_

In [None]:
# Repeat the single-observation prediction with the refitted model.
X_new = [[22587]]  # Cyprus' GDP per capita
cyprus_prediction = lin_reg_model.predict(X_new)
print(cyprus_prediction)

After removing the outliers, my model now predicts a life satisfaction score of 6.22 for Cyprus.

In [None]:
# Column-wise means of the filtered data.
new_df.mean(axis=0)