# Machine Learning Study Notes


## Supervised learning

### 1. Basics of Polynomial Regression Fitting

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression #Library for linear regression
from sklearn.preprocessing import PolynomialFeatures #Library for polynomial fitting

In [2]:
# Load the training set used for the polynomial-regression example.
train_data = pd.read_csv('data.csv')

# The polynomial transformation needs the predictor as a 2-D matrix, so the
# single 'Var_X' column is reshaped into one column; -1 lets NumPy infer the
# number of rows.
X = np.reshape(train_data['Var_X'].values, (-1, 1))
y = train_data['Var_Y'].values

print(X)

[[-0.33532]
 [ 0.0216 ]
 [-1.19438]
 [-0.65046]
 [-0.28001]
 [ 1.93258]
 [ 1.2262 ]
 [ 0.74727]
 [ 3.32853]
 [ 2.87457]
 [-1.48662]
 [ 0.37629]
 [ 1.43918]
 [ 0.24183]
 [-2.7914 ]
 [ 1.08176]
 [ 2.81555]
 [ 0.54924]
 [ 2.36449]
 [-1.01925]]


In [3]:
# Polynomial expansion of degree 3: each row of X becomes [1, x, x^2, x^3].
# NOTE(review): `ploy_feat` looks like a misspelling of `poly_feat`; the name is
# kept unchanged in case other cells refer to it.
ploy_feat = PolynomialFeatures(degree=3)

# Fit the transformer on X, then expand X into its polynomial terms.
ploy_feat.fit(X)
X_poly = ploy_feat.transform(X)
print(X_poly)

[[ 1.00000000e+00 -3.35320000e-01  1.12439502e-01 -3.77032139e-02]
 [ 1.00000000e+00  2.16000000e-02  4.66560000e-04  1.00776960e-05]
 [ 1.00000000e+00 -1.19438000e+00  1.42654358e+00 -1.70383513e+00]
 [ 1.00000000e+00 -6.50460000e-01  4.23098212e-01 -2.75208463e-01]
 [ 1.00000000e+00 -2.80010000e-01  7.84056001e-02 -2.19543521e-02]
 [ 1.00000000e+00  1.93258000e+00  3.73486546e+00  7.21792628e+00]
 [ 1.00000000e+00  1.22620000e+00  1.50356644e+00  1.84367317e+00]
 [ 1.00000000e+00  7.47270000e-01  5.58412453e-01  4.17284874e-01]
 [ 1.00000000e+00  3.32853000e+00  1.10791120e+01  3.68771565e+01]
 [ 1.00000000e+00  2.87457000e+00  8.26315268e+00  2.37530108e+01]
 [ 1.00000000e+00 -1.48662000e+00  2.21003902e+00 -3.28548821e+00]
 [ 1.00000000e+00  3.76290000e-01  1.41594164e-01  5.32804680e-02]
 [ 1.00000000e+00  1.43918000e+00  2.07123907e+00  2.98088585e+00]
 [ 1.00000000e+00  2.41830000e-01  5.84817489e-02  1.41426413e-02]
 [ 1.00000000e+00 -2.79140000e+00  7.79191396e+00 -2.17503486e

In [4]:
# Build and fit the polynomial regression model: a linear regression on the
# expanded features. The intercept is disabled because X_poly already contains
# a constant column of ones as its first feature.
poly_model = (
    LinearRegression(fit_intercept=False)
    .fit(X_poly, y)
)

### 2. Perform L1 Regularization (Lasso)

In [5]:
from sklearn.linear_model import Lasso #Library for L1 regularization

In [6]:
# This CSV has no column-header row, so header=None is required; the columns
# are then labelled with integers.
train_data = pd.read_csv('data2.csv', header=None)

In [7]:
"""
Column 0 - 5 are the predictors 
Column 6 is the outcome
"""
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.25664,2.04978,-6.2364,4.71926,-4.26931,0.2059,12.31798
1,-3.89012,-0.37511,6.14979,4.94585,-3.57844,0.0064,23.67628
2,5.09784,0.9812,-0.29939,5.85805,0.28297,-0.20626,-1.53459
3,0.39034,-3.06861,-5.63488,6.43941,0.39256,-0.07084,-24.6867
4,5.84727,-0.15922,11.41246,7.52165,1.69886,0.29022,17.54122


In [8]:
# Split the frame: the last column is the outcome, every column before it is a
# predictor.
y = train_data.iloc[:, -1]
X = train_data.iloc[:, :-1]

In [9]:
# Fit an L1-regularized (Lasso) regression. alpha sets the strength of the
# penalty on the coefficients: a larger alpha punishes model complexity more.
lasso_reg = Lasso(alpha=1).fit(X, y)

# Print the fitted coefficients, one per predictor column.
reg_coef = lasso_reg.coef_
print(reg_coef)

[ 0.          2.35793224  2.00441646 -0.05511954 -3.92808318  0.        ]


We can see that predictor 0 and predictor 5 have been removed from the model after L1 regularization: their coefficients were shrunk to exactly zero, which simplifies the model and reduces the risk of overfitting. In comparison, if we just fit an ordinary linear regression, we get a non-zero coefficient for all 6 predictors. See below:

In [10]:
# Baseline for comparison: ordinary linear regression (no regularization
# penalty) fitted on the same X and y.
linear_reg = LinearRegression().fit(X, y)

# Print the fitted coefficients; this rebinds reg_coef, replacing the Lasso
# coefficients stored under the same name.
reg_coef = linear_reg.coef_
print(reg_coef)

[-6.19918532e-03  2.96325160e+00  1.98199191e+00 -7.86249920e-02
 -3.95818772e+00  9.30786141e+00]
