In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

* Hello! In this notebook we practice Linear Regression for self-improvement. I hope it will be useful to you. Let's get started :)

# Contents
1. [Load and Check Data](#1)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from collections import Counter

<a id = "1"></a><br>
## Load and Check Data

In [None]:
# Load both survey years; the paths are relative to the Kaggle input directory.
y_2018 = pd.read_csv("../input/world-happiness/2018.csv")
y_2019 = pd.read_csv("../input/world-happiness/2019.csv")

# ignore_index=True gives every row a unique 0..n-1 label. Without it each
# year keeps its own 0-based labels, so any later label-based lookup
# (df.loc[...], index-driven replacement) would match rows from both years.
data = pd.concat([y_2018, y_2019], sort = False, ignore_index = True)
data

In [None]:
data.isnull().sum()

### Variable Description

1. Overall rank: Ranking of countries by happiness level
1. Country or region: Country or region names
1. Score: Happiness scores
1. GDP per capita: Value representing the country's income and expense levels
1. Social support
1. Healthy life expectancy
1. Freedom to make life choices
1. Generosity
1. Perceptions of corruption 

In [None]:
data.describe().T #statistical information about the data set 

In [None]:
data.info() 

* Let's change the column names for convenience.

In [None]:
# Shorten the column names and drop the rank column.
# Written as a non-mutating chain so the cell can be re-run safely: rename()
# ignores keys that are already gone and drop(errors="ignore") tolerates a
# missing "rank" column, whereas `del data["rank"]` raises KeyError on re-run.
data = data.rename(columns={
    "Overall rank": "rank",
    "Country or region": "country",
    "Score": "score",
    "GDP per capita": "gdp",
    "Social support": "social",
    "Healthy life expectancy": "healthy",
    "Freedom to make life choices": "freedom",
    "Generosity": "generosity",
    "Perceptions of corruption": "corruption"
}).drop(columns=["rank"], errors="ignore")

### Missing Value

In [None]:
data.columns[data.isnull().any()]

In [None]:
data.isnull().sum()

In [None]:
data[data["corruption"].isnull()]

In [None]:
# Fill the missing corruption value(s) with the mean corruption of the rows
# whose score is above 6.774.
# NOTE(review): 6.774 is presumably the score of the row with the missing
# value - confirm against the output of the previous cell.
# The column is selected *before* calling mean(): averaging the whole frame
# also tries to average the string "country" column, which raises on
# pandas >= 2.0.
high_score_rows = data["score"] > 6.774
avg_data_corruption = data.loc[high_score_rows, "corruption"].mean()
data.loc[data["corruption"].isnull(), "corruption"] = avg_data_corruption
# Should now display an empty frame.
data[data["corruption"].isnull()]

### Data Preparation

In [None]:
# Independent copy of `data` restricted to its float64/int64 columns.
df = data.select_dtypes(include=["float64", "int64"]).copy()
df.head()

In [None]:
column_list = ["score","gdp","social","healthy","freedom","generosity","corruption"]

In [None]:
column_list = ["score","gdp","social","healthy","freedom","generosity","corruption"]
corr_matrix = df[column_list].corr()
# annot=True writes the correlation score inside each cell, which makes the
# heatmap easier to read.
sns.heatmap(corr_matrix, annot = True, fmt = ".2f")
plt.show()

In [None]:
# Bar plot of gdp per score value.
# catplot/height replace factorplot/size, which were deprecated and later
# removed from seaborn.
g = sns.catplot(x = "score", y = "gdp", data = df, kind = "bar", height = 5)
g.set_ylabels("GDP per capita")
plt.show()

In [None]:
# One box plot per feature, each in its own figure.
for col in column_list:
    ax = sns.boxplot(x = df[col])
    ax.set_xlabel(col)
    plt.show()

* The box plots show outliers in the corruption and social features. We can examine them more rigorously with IQR-based outlier detection.

In [None]:
def detect_outliers(df, features, min_count=2, iqr_factor=1.5):
    """Return the index labels of rows that are outliers in several features.

    A value is an outlier for a feature when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR`` of that
    feature's column (Tukey fences).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    features : iterable of str
        Column names to inspect.
    min_count : int, default 2
        A label is returned only when it is flagged in strictly more than
        ``min_count`` features (the default keeps the original "> 2" rule).
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    list
        Index labels, in order of first detection. Labels are counted, not
        positions, so rows sharing a label are counted together.
    """
    outlier_counts = Counter()

    for c in features:
        # 1st and 3rd quartile of this feature
        Q1 = np.percentile(df[c], 25)
        Q3 = np.percentile(df[c], 75)
        # distance beyond the quartiles at which a value counts as an outlier
        outlier_step = (Q3 - Q1) * iqr_factor
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        # tally one hit per flagged row label
        outlier_counts.update(df[is_outlier].index)

    return [label for label, hits in outlier_counts.items() if hits > min_count]

# Column-wise quartiles and interquartile range for every feature in df.
Q1, Q3 = (df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [None]:
# Tukey fences per feature, built from the column-wise Q1/Q3/IQR above.
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR
# Trailing space added so the label no longer runs into the printed values.
print("lower bound is " + str(lower_bound))
print("upper bound is " + str(upper_bound))
print("Q1: ", Q1)
print("Q3: ", Q3)

In [None]:
# Rows flagged as outliers in more than two of the listed features.
outlier_labels = detect_outliers(
    df,
    ["score", "gdp", "social", "healthy", "freedom", "generosity", "corruption"],
)
df.loc[outlier_labels]

In [None]:
# Tukey fences for the corruption column only.
# From here on Q1/Q3/IQR and the bounds are scalars, no longer per-column Series.
df_table = df["corruption"]

Q1, Q3 = df_table.quantile(0.25), df_table.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f"lower bound is {lower_bound}")
print(f"upper bound is {upper_bound}")
print("Q1: ", Q1)
print("Q3: ", Q3)

In [None]:
# Boolean mask: True where corruption falls outside the fences computed above.
below_fence = df_table < lower_bound
above_fence = df_table > upper_bound
outliers_vector = below_fence | above_fence
outliers_vector

In [None]:
# The name is rebound here: outliers_vector now holds the outlying corruption
# values themselves (indexed by row label), not the boolean mask.
outliers_vector = df_table.loc[outliers_vector]
outliers_vector.index.to_numpy()

In [None]:
# Replace outlying corruption values with the column mean.
# lower_bound / upper_bound are the scalar fences computed for the corruption
# column earlier in the notebook.
# A boolean mask with .loc is used instead of feeding index *labels* to .iloc
# (which treats them as positions and picks the wrong rows whenever labels and
# positions differ), and instead of chained assignment, which may write to a
# temporary copy and leave df_table unchanged.
df_table = data.copy()
corruption = df_table["corruption"]
is_outlier = (corruption < lower_bound) | (corruption > upper_bound)
df_table.loc[is_outlier, "corruption"] = corruption.mean()
df_table.loc[is_outlier, "corruption"]


In [None]:
data = df_table

# Simple Linear Regression

### score -- gdp

In [None]:
sns.jointplot(x = "gdp", y = "score", data = df_table, kind = "reg")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Single-feature design matrix; double brackets keep it two-dimensional,
# as scikit-learn expects.
X = data[["gdp"]]
# head must be called - a bare `X.head` only shows the bound-method repr.
X.head()

In [None]:
# Target kept as a one-column DataFrame.
y = data[["score"]]
# head must be called - a bare `y.head` only shows the bound-method repr.
y.head()

In [None]:
# Fit score ~ gdp and report the fitted line plus its R^2 on the same data.
reg = LinearRegression()
model = reg.fit(X,y)
print("intercept: ", model.intercept_)
print("coef: ", model.coef_)
# LinearRegression.score returns the coefficient of determination (R^2).
print("r2 score: ", model.score(X,y))

* The gdp feature alone explains about 63% of the variance in the happiness score (R² ≈ 0.63).

In [None]:
# Scatter of score against gdp with the fitted regression line (no confidence band).
plt.figure(figsize = (10,8))
g = sns.regplot(x = "gdp", y = "score", data = data, ci = None, scatter_kws = {'color':'r','s':9})
g.set(title = "Model Equation", xlabel = "gdp", ylabel = "score")
plt.show()

* If the gdp value is 1.50, the predicted happiness score is about 6.74.

In [None]:
model.predict([[1.50]])

In [None]:
# Predict the happiness score for a range of gdp values.
# The earlier bare `model.predict(gdb_list)` was removed: its result was
# discarded (not the cell's last expression). The loop variable is no longer
# `g`, which also names the plot handle in other cells.
gdb_list = [[0.25],[0.50],[0.75],[1.00],[1.25],[1.50]]
for gdp_value in gdb_list:
    print("The happiness value of the country with a gdp value of ",gdp_value,": ",model.predict([gdp_value]))

### score -- social

In [None]:
sns.jointplot(x = "social", y = "score", data = df_table, kind = "reg")
plt.show()

* Let's wrap these steps in a function to make the job easier.

In [None]:
def linear_reg(col,text,prdctn):
    """Plot, fit and query a one-feature linear regression of ``score`` on ``col``.

    Reads the notebook-level ``data`` DataFrame. Draws a joint plot and a
    regression plot of ``score`` against ``col``, fits a ``LinearRegression``
    on that single feature, and prints the prediction for ``prdctn``.

    Parameters
    ----------
    col : str
        Name of the feature column in ``data``.
    text : str
        Label printed in front of the prediction.
    prdctn : float
        Feature value to predict the score for.

    Returns
    -------
    None
    """
    # Plot and fit use the same frame, so the figure always shows the data
    # the model was trained on.
    sns.jointplot(x=col,y="score",data=data,kind="reg")
    plt.show()

    X = data[[col]]
    y = data[["score"]]
    model = LinearRegression().fit(X,y)

    # scatter with the fitted regression line
    plt.figure(figsize=(12,6))
    ax = sns.regplot(x=data[col],y=data["score"],ci=None,scatter_kws = {'color':'r','s':9})
    ax.set_title("Model Equation")
    ax.set_ylabel("score")
    ax.set_xlabel(col)
    plt.show()

    # Query with a frame carrying the training column name, so scikit-learn
    # sees the same feature name it was fitted with.
    query = pd.DataFrame({col: [prdctn]})
    print(text,": ", model.predict(query))

In [None]:
linear_reg("social","The happiness value of the country whose sociability value is 2:",2)

### score -- healthy

In [None]:
linear_reg("healthy","The happiness value of the country whose healthiest value is 1.20:",1.20)

### score -- freedom

In [None]:
linear_reg("freedom","The happiness value of the country whose freedom value is 1.20:",1.20)

# Multiple Linear Regression

In [None]:
import statsmodels.api as sms

X = df.drop("score", axis = 1)
y = df["score"]

# OLS(dependent, independent).
# statsmodels does not add an intercept by itself; without add_constant the
# fit is forced through the origin and the reported R-squared is the
# uncentered one, which is not comparable with scikit-learn's model below.
# X itself is left untouched because later cells reuse it.
lm = sms.OLS(y, sms.add_constant(X))
model = lm.fit()
model.summary()

* R-squared   :   Proportion of the variance in the dependent variable that is explained by the independent variables.
* F-statistic :   Expresses the significance of the model.
* Coef        :   Refers to coefficients.
* Std Err     :   Standard errors.

#### Create model with scikit-learn

In [None]:
lm = LinearRegression()
model = lm.fit(X,y)
print("constant: ", model.intercept_)
print("coefficient: ", model.coef_)

In [None]:
# PREDICTION
# Score = intercept + 0.929921*gdp + 1.06504217*social + 0.94321492*healthy + 1.40426054*freedom + 0.52070628*generosity + 0.88114008*corruption
# (coefficients copied from an earlier run; read the current values from
# model.intercept_ and model.coef_)

# One hypothetical country, values given in the column order of X.
# Reusing X.columns keeps the feature names the model was fitted with.
new_data = pd.DataFrame([[1.0, 2.0, 1.25, 1.75, 1.50, 0.75]], columns = X.columns)
new_data

In [None]:
model.predict(new_data)

In [None]:
# calculating the amount of error

from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y,model.predict(X))
RMSE = np.sqrt(MSE)

print("MSE: ", MSE)
print("RMSE: ", RMSE)

# Simple Linear & Multiple Linear Regression - Model Tuning

In [None]:
from sklearn.model_selection import train_test_split
X = df.drop("score", axis = 1)
y = df["score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train.head()

In [None]:
y_test.head()

In [None]:
# Fit on the training split only and evaluate that same estimator.
# The errors were previously computed with `model`, which was fitted on the
# full data set (test rows included), so the "test" error was not an
# out-of-sample estimate.
lm = LinearRegression()
lm.fit(X_train, y_train)
print("Training error: ", np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))
print("Test Error: ", np.sqrt(mean_squared_error(y_test, lm.predict(X_test))))

* Each random_state value produces a different train/test split and therefore a different error. Rather than relying on a single split, we use k-fold cross-validation, which averages the error over several splits and gives a more stable estimate.

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")

In [None]:
# 20-fold cross-validation; the scorer returns negated MSE, so flip the sign
# before averaging.
fold_mse = -cross_val_score(model, X_train, y_train, cv = 20, scoring = "neg_mean_squared_error")
cvs_avg_mse = np.mean(fold_mse)
cvs_avg_rmse = np.sqrt(cvs_avg_mse)

print("Cross Val Score MSE : ",cvs_avg_mse)
print("Cross Val Score RMSE : ",cvs_avg_rmse)

# Ridge Regression
<br>
* The aim is to find the coefficients that minimize the sum of error squares by applying a penalty to these coefficients.
<br>
* It is resistant to overfitting.
* It is biased but its variance is low.
* It is better than OLS when there are too many parameters.
* It builds a model with all variables. It does not exclude unrelated variables from the model; it shrinks their coefficients toward zero.


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import RidgeCV

In [None]:
X = df.drop("score", axis = 1)
y = df["score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

ridge_model = Ridge(alpha = 0.1).fit(X_train, y_train)
ridge_model

In [None]:
ridge_model.coef_

In [None]:
ridge_model.intercept_

In [None]:
# Deterministic, log-spaced grid of 100 penalty strengths from 0.5e10 down
# to 0.5e-2 (not random numbers).
lambdas = 10**np.linspace(10,-2,100)*0.5
ridge_model =  Ridge()
coefs = []

# Refit the same estimator for every penalty and record its coefficients.
for i in lambdas:
    ridge_model.set_params(alpha=i)
    ridge_model.fit(X_train,y_train)
    coefs.append(ridge_model.coef_)

# One line per feature: coefficient value as a function of the penalty.
ax = plt.gca()
ax.plot(lambdas, coefs)
ax.set_xscale("log")
ax.set_xlabel("alpha (log scale)")
ax.set_ylabel("coefficient value")
ax.set_title("Ridge coefficients as a function of the penalty")
plt.show()

### Ridge Regression - Prediction

In [None]:
ridge_model = Ridge().fit(X_train,y_train)

y_pred = ridge_model.predict(X_train)

print("predict: ", y_pred[0:10])
print("real: ", y_train[0:10].values)

In [None]:
# Root of the mean squared error; np.mean of the scalar MSE returned the MSE
# itself, not the RMSE the variable name promises.
RMSE = np.sqrt(mean_squared_error(y_train,y_pred))
print("train error: ", RMSE)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
Verified_RMSE = np.sqrt(np.mean(-cross_val_score(ridge_model, X_train, y_train, cv=20, scoring="neg_mean_squared_error")))
print("Verified_RMSE: ", Verified_RMSE)

In [None]:
# test error
y_pred = ridge_model.predict(X_test)
# Root of the mean squared error; np.mean of the scalar MSE returned the MSE
# itself, which is not comparable with the cross-validated RMSE.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print("test error: ", RMSE)

### Ridge Model -- Model Tuning

In [None]:
ridge_model = Ridge(10).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
ridge_model = Ridge(30).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
ridge_model = Ridge(90).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

* We can find out which value will work better by trial and error. But with the method we will use below, we can find the most appropriate value more easily and quickly.

In [None]:
# Candidate penalties: a log-spaced grid and a seeded random sample.
lambdas1 = 10**np.linspace(10,-2,100)
# Seeded for reproducibility; the lower bound is 1 because some scikit-learn
# versions reject non-positive alphas in RidgeCV.
lambdas2 = np.random.RandomState(42).randint(1,1000,100)

# `normalize=True` was dropped: the parameter was removed from scikit-learn,
# and the final Ridge model is fitted on unnormalized features, so alpha
# should be selected on the same scale.
ridgeCV = RidgeCV(alphas = lambdas1,scoring = "neg_mean_squared_error", cv=10)
ridgeCV.fit(X_train,y_train)

In [None]:
ridgeCV.alpha_

In [None]:
# final model
ridge_tuned = Ridge(alpha = ridgeCV.alpha_).fit(X_train, y_train)
y_pred = ridge_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# for lambdas2
# `normalize=True` was dropped: the parameter was removed from scikit-learn,
# and the Ridge model below is fitted on unnormalized features, so alpha
# should be selected on the same scale.
ridgeCV = RidgeCV(alphas = lambdas2, scoring = "neg_mean_squared_error", cv = 10)
ridgeCV.fit(X_train, y_train)
ridge_tuned = Ridge(alpha = ridgeCV.alpha_).fit(X_train, y_train)
y_pred = ridge_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# Lasso Regression

# Lasso Regression -- Model

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
from sklearn.linear_model import RidgeCV, LassoCV

In [None]:
# Features and target for the lasso section.
# The features were previously assigned to lowercase `x` while the split
# used `X`, so the split silently depended on whatever an earlier cell had
# left in `X`.
X = df.drop("score", axis = 1)
y = df["score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
lasso_model = Lasso().fit(X_train, y_train)

In [None]:
print("intercept: ", lasso_model.intercept_)
print("coef: ", lasso_model.coef_)

In [None]:
# coefficients for different lambda values

# Deterministic, sorted, strictly positive log-spaced grid. The previous
# unseeded random integers could include 0 (not representable on the log
# axis used for plotting) and were unsorted, so the plotted lines zig-zagged.
alphas = np.logspace(-4, 1, 50)
lasso = Lasso()
coefs = []

# Refit the same estimator for every penalty and record its coefficients.
for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(X_train,y_train)
    coefs.append(lasso.coef_)

In [None]:
ax = plt.gca()
ax.plot(alphas,coefs)
ax.set_xscale("log")

### Lasso Regression - Prediction 

In [None]:
lasso_model

In [None]:
lasso_model.predict(X_train)[0:5]

In [None]:
lasso_model.predict(X_test)[0:5]

In [None]:
y_pred = lasso_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
r2_score(y_test,y_pred)

### Lasso Regression - Model Tuning

In [None]:
lasso_cv_model = LassoCV(cv = 10, max_iter = 100000).fit(X_train, y_train)
lasso_cv_model

In [None]:
lasso_cv_model.alpha_

In [None]:
# Refit a lasso with the penalty selected by LassoCV and score it on the test split.
lasso_tuned = Lasso(alpha = lasso_cv_model.alpha_)
lasso_tuned.fit(X_train, y_train)
y_pred = lasso_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# ElasticNet Regression

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

In [None]:
X = df.drop("score",axis=1)
y = df["score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

enet_model = ElasticNet().fit(X_train,y_train)

In [None]:
enet_model.coef_

In [None]:
enet_model.intercept_

In [None]:
#prediction
enet_model.predict(X_train)[0:10]

In [None]:
enet_model.predict(X_test)[0:10]

In [None]:
y_pred = enet_model.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
r2_score(y_test, y_pred)

### ElasticNet Regression - Model Tuning

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
enet_cv_model = ElasticNetCV(cv = 10, random_state = 0).fit(X_train, y_train)

In [None]:
enet_cv_model.alpha_

In [None]:
enet_cv_model

In [None]:
enet_tuned = ElasticNet(alpha = enet_cv_model.alpha_).fit(X_train, y_train)

In [None]:
y_pred = enet_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))