In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import matplotlib as mpl

datapath = os.path.join('datasets', 'lifesat', '')
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
# oecd_bill_url = 'https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/lifesat/oecd_bli_2015.csv'
# gdp_per_capita_url = 'https://raw.githubusercontent.com/ageron/handson-ml/refs/heads/master/datasets/lifesat/oecd_bli_2015.csv'
# oecd_bil_df = pd.read_csv(oecd_bill_url, thousands=',')
# gdp_per_capita_df = pd.read_csv(gdp_per_capita_url, thousands=',')

In [None]:
# Download the two raw CSV files into `datapath`.
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# exist_ok=True lets this cell be re-run without failing on the existing directory.
os.makedirs(datapath, exist_ok=True)
for filename in ["oecd_bli_2015.csv", "gdp_per_capita.csv"]:
    url = "{}datasets/lifesat/{}".format(DOWNLOAD_ROOT, filename)
    print("Downloading...", filename)
    # `datapath` already ends with a path separator, so plain concatenation is a valid path.
    urllib.request.urlretrieve(url, datapath + filename)

In [None]:
# Read the raw files downloaded above.
# Both use "," as the thousands separator in numeric fields.
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
# The GDP file is tab-separated, latin1-encoded and marks missing values as "n/a".
gdp_per_capita = pd.read_csv(
    datapath + "gdp_per_capita.csv",
    sep='\t',
    thousands=',',
    na_values="n/a",
    encoding='latin1',
)

In [None]:
# A notebook cell only renders its *last* bare expression, so two consecutive
# `.head()` calls would silently drop the first preview. `display` (provided by
# the IPython kernel) renders both frames.
display(oecd_bli.head(5), gdp_per_capita.head(5))

In [None]:
# gdp_per_capita.head(5)

## Pivot -> rearranging or reshaping the data

# df = pd.DataFrame({
#     'Date': ['2023-01', '2023-01', '2023-02', '2023-02'],
#     'Product': ['A', 'B', 'A', 'B'],
#     'Sales': [100, 150, 200, 250]
# })
# df = df.pivot(index="Date", columns="Product", values="Sales")
# df = df.pivot(index="Product", columns="Date", values="Sales")
# df = df.pivot(index='Date', columns="Sales", values="Product")
# df
# df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
# df.rename(columns={"Date": "dates", "Product": '_products_', "Sales": '--sale--'}, inplace=False)

In [None]:

def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Join OECD life-satisfaction data with GDP per capita, one row per country.

    Neither input frame is modified, so the function can be called repeatedly
    with the same arguments.

    Parameters
    ----------
    oecd_bli : pandas.DataFrame
        Long-format table with the columns "INEQUALITY", "Country",
        "Indicator" and "Value". Only rows with INEQUALITY == "TOT" are used.
    gdp_per_capita : pandas.DataFrame
        Table with a "Country" column and a "2015" column; the latter is
        exposed as "GDP per capita" in the result.

    Returns
    -------
    pandas.DataFrame
        Columns "GDP per capita" and "Life satisfaction", indexed by country
        and sorted by ascending GDP per capita, with the rows at sorted
        positions 0, 1, 6, 8, 33, 34 and 35 left out.

    Raises
    ------
    IndexError
        If the merged table has fewer than 36 rows (the positional selection
        addresses positions up to 35).
    """
    oecd_total = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    oecd_wide = oecd_total.pivot(index="Country", columns="Indicator", values="Value")
    # Rename / re-index a copy: doing this in place would alter the caller's
    # frame and make a second call fail because "Country" is no longer a column.
    gdp_by_country = (
        gdp_per_capita
        .rename(columns={"2015": "GDP per capita"})
        .set_index("Country")
    )
    full_country_stats = pd.merge(left=oecd_wide, right=gdp_by_country,
                                  left_index=True, right_index=True)
    full_country_stats = full_country_stats.sort_values(by="GDP per capita")
    # Positions (in GDP-sorted order) that are held out of the returned sample.
    remove_indices = {0, 1, 6, 8, 33, 34, 35}
    keep_indices = [pos for pos in range(36) if pos not in remove_indices]
    return full_country_stats[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]

In [None]:
# Build the two-column table (GDP per capita, life satisfaction) from the raw frames.
country_stat = prepare_country_stats(oecd_bli, gdp_per_capita)

In [None]:
# Preview the first five rows of the prepared table.
country_stat.head(5)

In [None]:
# Feature matrix and target as (n_samples, 1) column arrays, taken from the
# prepared table one column at a time.
X = country_stat[['GDP per capita']].to_numpy()
y = country_stat[['Life satisfaction']].to_numpy()

# temp = np.c_[country_stat['GDP per capita'], country_stat['Life satisfaction']]

In [None]:
# Visualize the data: scatter plot of life satisfaction against GDP per capita.
country_stat.plot(kind='scatter', x='GDP per capita', y='Life satisfaction')
plt.show()

# Select the model: ordinary linear regression.
model = sklearn.linear_model.LinearRegression()

# Train the model on the column arrays X and y built in the previous cell.
model.fit(X, y)

# Make a new prediction for Cyprus; X_new is reused by the k-nearest-neighbours cell below.
X_new = [[22587]] # cyprus GDP per capita
print(model.predict(X_new))

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Alternative model: k-nearest-neighbours regression using the 3 closest samples.
model1 = KNeighborsRegressor(n_neighbors=3)

# Train the model on the same X and y used for the linear regression above.
model1.fit(X, y)

# Make a prediction for the same input (X_new) as the linear model.
print(model1.predict(X_new))

# Load and prepare life satisfaction data

In [None]:
# Reload the OECD file, keep only the "TOT" inequality rows and reshape to one
# row per country with one column per indicator.
oecd_bli = (
    pd.read_csv(datapath + 'oecd_bli_2015.csv', thousands=',')
    .loc[lambda frame: frame['INEQUALITY'] == 'TOT']
    .pivot(index="Country", columns="Indicator", values="Value")
)
oecd_bli['Life satisfaction'].head()

# Load and prepare GDP per capita data

In [None]:
# Reload the GDP file, expose the "2015" column as "GDP per capita" and index by country.
gdp_per_capita = (
    pd.read_csv(datapath + 'gdp_per_capita.csv', sep='\t', thousands=',',
                encoding='latin1', na_values='n/a')
    .rename(columns={"2015": "GDP per capita"})
    .set_index('Country')
)

# Join the two country-indexed tables on their index.
full_country_stats = oecd_bli.merge(gdp_per_capita, left_index=True, right_index=True)

In [None]:
# Spot-check the merged table: show both values for the United States.
full_country_stats[["GDP per capita", 'Life satisfaction']].loc['United States']

In [None]:
# Positional (iloc) row numbers to hold out; every other position in range(36) is kept.
# NOTE(review): these positions refer to the row order of `full_country_stats` as
# produced by the merge above, which is not sorted by GDP per capita in this cell
# (prepare_country_stats sorts before applying its own list) — confirm these are
# the intended rows. range(36) also assumes the merged table has 36 rows.
remove_indices = [2, 1, 6, 8, 33, 34, 32]
keep_indices = list(set(range(36)) - set(remove_indices))

# sample_data: rows used for fitting and plotting; missing_data: the held-out rows.
sample_data = full_country_stats[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]
missing_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[remove_indices]

In [None]:
# Show the held-out rows.
missing_data

In [None]:
# Scatter plot of the sample with a handful of countries highlighted and labelled.
sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5,3))
# set the range of x and y
plt.axis([0, 60000, 0, 10])
# Country name -> (x, y) position of its text label in data coordinates.
position_text = {
    "Hungary": (5000, 1),
    "Korea": (18000, 1.7),
    "France": (29000, 2.4),
    "Australia": (40000, 3.0),
    "United States": (52000, 3.8),
}
for country, pos_text in position_text.items():
    # The row has exactly two values: GDP per capita (x) and life satisfaction (y).
    pos_data_x, pos_data_y = sample_data.loc[country]
    # Use a shorter label for the United States; the lookup above needs the full name.
    country = "U.S." if country == "United States" else country
    # Draw an arrow from the label position to the country's data point.
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
            arrowprops=dict(facecolor='blue', width=0.5, shrink=0.1, headwidth=5))
    # Mark the annotated point with a red dot on top of the scatter marker.
    plt.plot(pos_data_x, pos_data_y, "ro")
plt.xlabel("GDP per capita (USD)")
# save_fig('money_happy_scatterplot')
plt.show()

In [None]:
# Save the sample (country index plus both columns) as datasets/lifesat/lifesat.csv.
sample_data.to_csv(os.path.join('datasets', 'lifesat', 'lifesat.csv'))

In [None]:
# Show the sample rows for the countries that were annotated on the plot
# (iterating a dict yields its keys).
sample_data.loc[[*position_text]]
# position_text.keys()

In [None]:
# np.linspace returns an evenly spaced range of values, useful for plotting or numerical analysis.
# It can also be used for plotting a continuous line.
# np.linspace(0, 60000, 1000)

# Plot lines on the scatter plot

In [None]:

# Scatter plot of the sample with three hand-picked candidate lines
# (theta_0 + theta_1 * x), each labelled with its parameters in a matching colour.
sample_data.plot(kind='scatter', x='GDP per capita', y='Life satisfaction', figsize=(5, 3))
plt.xlabel('GDP per capita')
# set the range of x and y
plt.axis([0, 60000, 0, 10])
# NOTE: this rebinds X (previously the training feature matrix) to the x-values of the lines.
X = np.linspace(0, 60000, 1000)
# Red line: intercept 0, slope 2e-5.
plt.plot(X, 2*X/100000, 'r')
plt.text(40000, 2.7, r"$\theta_0 = 0$", fontsize=14, color='r')
plt.text(40000, 1.8, r"$\theta_1 = 2 \times 10^{-5}$", fontsize=14, color="r")

# Blue line: intercept 4, slope 5e-5.
plt.plot(X, 4+(5*X/10**5), 'b')
plt.text(8000, 3.5, r"$\theta_0 = 4$", fontsize=14, color='b')
plt.text(8000, 2.8, r"$\theta_1 = 5 \times 10^{-5}$", fontsize=14, color="b")

# Green line: intercept 8, slope -5e-5.
plt.plot(X, 8-(5*X/10**5), 'g')
plt.text(6000, 9, r"$\theta_0 = 8$", fontsize=14, color='g')
plt.text(6000, 8.3, r"$\theta_1 = -5 \times 10^{-5}$", fontsize=14, color="g")


plt.show()

Before we can use our model, we need to define the parameter values $\theta_0$ and $\theta_1$. How can we know which values will make the model perform best? For that, we need a performance measure.
We can either define a ***utility function*** (or ***fitness function***) that measures how good the model is, or a ***cost function*** that measures how bad it is.

This is where the **Linear Regression** algorithm comes in. We feed our training set to the algorithm, and it finds the parameters ***($\theta_0$, $\theta_1$)*** that make the linear model fit our data best. This procedure is called **training** the model.

In [None]:
from sklearn import linear_model

# Feature and target as (n_samples, 1) column arrays taken from the sample.
X_sample = sample_data[['GDP per capita']].to_numpy()
y_sample = sample_data[['Life satisfaction']].to_numpy()

# Fit an ordinary linear regression on the sample.
lin1 = linear_model.LinearRegression()
lin1.fit(X_sample, y_sample)

# With a 2-D target, intercept_ is 1-D and coef_ is 2-D; pull out the scalars.
theta_0 = lin1.intercept_[0]
theta_1 = lin1.coef_[0][0]

theta_0, theta_1

### Visualize the best fit line

In [None]:
# Scatter plot of the sample with the fitted regression line drawn on top.
sample_data.plot(kind="scatter", x="GDP per capita", y="Life satisfaction", figsize=(5, 3))
plt.xlabel('GDP per capita (USD)')
# set the range of x and y
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 100)
plt.plot(X, theta_0+(theta_1*X), "b")
# Build the labels from the fitted parameters so they cannot drift out of sync
# with the model (theta_1 is shown as a multiple of 1e-5).
plt.text(5000, 3.1, r"$\theta_0=%.2f$" % theta_0, fontsize=14, color='b')
plt.text(5000, 2.2, r"$\theta_1=%.2f \times 10^{-5}$" % (theta_1 * 1e5), fontsize=14, color='b')
plt.show()

In [None]:
# Look up Cyprus' GDP per capita in the country-indexed GDP table.
cyprus_gdp_per_capita = gdp_per_capita.loc['Cyprus']['GDP per capita']
print(cyprus_gdp_per_capita)

# predict() takes a 2-D input and (lin1 was fitted on a 2-D target) returns a
# 2-D array; [0][0] extracts the single predicted value.
cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]
print(cyprus_predicted_life_satisfaction)

# Same lookup and prediction for Canada; both results are plotted in the next cell.
canada_gdp_per_capita = gdp_per_capita.loc['Canada']['GDP per capita']
canda_predicted_Life_satisfaction = lin1.predict([[canada_gdp_per_capita]])[0][0]
print(canda_predicted_Life_satisfaction)

In [None]:
# Scatter plot of the sample, the fitted line, and the predictions for Cyprus and Canada.
sample_data.plot(kind='scatter', x='GDP per capita', y='Life satisfaction', figsize=(5, 3), s=10)
plt.xlabel('GDP per capita (USD)')
X = np.linspace(0, 100000, 1000)
plt.plot(X, theta_0+(theta_1*X), 'b')
# set the range of x and y
plt.axis([0, 100000, 0, 10])

# All labels are formatted from the fitted / predicted values rather than typed
# in by hand, so they always match the model (theta_1 shown as a multiple of 1e-5).
plt.text(5000, 9.5, r"$\theta_0 = %.2f$" % theta_0, fontsize=14, color='b')
plt.text(5000, 8.6, r"$\theta_1 = %.2f \times 10^{-5}$" % (theta_1 * 1e5), fontsize=14, color='b')

# Cyprus: dashed vertical line from the x-axis up to the predicted point.
plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], 'r--')
plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, 'ro')
plt.text(25000, 5.0, "Prediction = %.2f" % cyprus_predicted_life_satisfaction, fontsize=14, color="b")

# plot predicted life satisfaction for canada
plt.plot([canada_gdp_per_capita, canada_gdp_per_capita], [0, canda_predicted_Life_satisfaction], 'r--')
plt.plot(canada_gdp_per_capita, canda_predicted_Life_satisfaction, 'ro')
plt.text(canada_gdp_per_capita, canda_predicted_Life_satisfaction+1,
         "Prediction = %.2f" % canda_predicted_Life_satisfaction, fontsize=10, color='r')


plt.show()

### Use KNeighborsRegressor

In [None]:
# Integer positions of the three countries within the sample's index.
portugal_index = sample_data.index.get_loc('Portugal')
slovenia_index = sample_data.index.get_loc('Slovenia')
spain_index = sample_data.index.get_loc('Spain')

# Three-row frame with those countries, in this order (a list of labels keeps
# the result a DataFrame).
test_df = sample_data.loc[['Portugal', 'Slovenia', 'Spain']]
# test_df.set_index("Country", inplace=True)
test_df

In [None]:
# Feature and target as (n_samples, 1) column arrays from the full merged table
# (i.e. including the rows held out of sample_data).
Xfull = full_country_stats[["GDP per capita"]].to_numpy()
yfull = full_country_stats[["Life satisfaction"]].to_numpy()

# Fit a linear regression on all countries.
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(Xfull, yfull)

# Intercept and slope of the full-data fit as scalars.
t0full = lin_reg_full.intercept_[0]
t1full = lin_reg_full.coef_[0][0]

In [None]:
from sklearn import preprocessing
from sklearn import pipeline

# Scatter plot of all countries with a degree-30 polynomial regression curve on top.
full_country_stats.plot(kind='scatter', x='GDP per capita', y='Life satisfaction', figsize=(8, 3))
plt.axis([0, 110000, 0, 10])

# Pipeline stages: expand GDP into polynomial terms up to degree 30, standardise
# each term, then fit a linear regression on the result.
# NOTE(review): such a high degree is presumably meant to illustrate overfitting — confirm.
poly = preprocessing.PolynomialFeatures(degree=30, include_bias=False)
scaler = preprocessing.StandardScaler()
lin_reg2 = linear_model.LinearRegression()

# x-values at which the fitted curve is evaluated for plotting.
X=np.linspace(0, 110000, 1000)
pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])
pipeline_reg.fit(Xfull, yfull)
# predict() needs a 2-D input, so turn the 1-D grid into a single column.
curve = pipeline_reg.predict(X[:, np.newaxis])
plt.plot(X, curve)
plt.xlabel('GDP per capita (USD)')

plt.show()
