In [None]:
# Prediction : Linear Regression 
# Classification : Logistic Regression

# Variables (linear regression):
#   Dependent variable (y)   : quantitative (continuous)
#   Independent variable (x) : quantitative
# (For logistic regression the dependent variable is categorical instead.)
    


In [None]:
# Linear Regression : ease of interpreting results

# Y(actual) = a + bx + e   (simple linear regression; e = random error)

# Y = A + B1X1 + B2X2 + ... + BnXn + e   (multiple linear regression)

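# A (ALPHA) : Intercept : the value of Y when X = 0, i.e. where the regression line crosses the Y-axis
#             (this is not the random error; the random error is the gap between the actual Y and the line)
# B (BETA) : Regression coefficient / slope : a one-unit change in the independent variable changes the
#            dependent variable by B units (in multiple regression: with the other variables held constant)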


# 𝑓(𝐱) = 𝑏₀ + 𝑏₁𝑥₁ + ⋯ + 𝑏ᵣ𝑥ᵣ = estimated regression function (r = number of independent variables)

# 𝑦ᵢ - 𝑓(𝐱ᵢ) = residual of observation i

# Regression is about determining the best predicted weights, that is the weights corresponding to the smallest residuals.
# To get the best weights, you usually minimize the sum of squared residuals (SSR) for all observations
# 𝑖 = 1, …, 𝑛: SSR = Σᵢ(𝑦ᵢ - 𝑓(𝐱ᵢ))². This approach is called the method of ordinary least squares.

In [None]:
# Underfitting : occurs when a model can’t accurately capture the dependencies among data, 
#                usually as a consequence of its own simplicity. 
#                It often yields a low 𝑅² with known data and bad generalization capabilities 
#                when applied with new data.

# Overfitting : happens when a model learns both dependencies among data and random fluctuations(변동). 
#               In other words, a model learns the existing data too well. 
#               Complex models, which have many features or terms, are often prone to overfitting. 
#               When applied to known data, such models usually yield high 𝑅². 
#               However, they often don’t generalize well and have significantly lower 𝑅² when used with new data.

In [2]:
# Linear Regression
# Import packages and classes
import numpy as np
from sklearn.linear_model import LinearRegression

# Provide data: six observations of a single predictor.
# x is written directly as a 2-D column (one row per observation, one column
# for the only feature); y stays a flat 1-D array of responses.
x = np.array([[5], [15], [25], [35], [45], [55]])
y = np.array([5, 20, 14, 32, 22, 38])

# One call, one array per line: same text as printing x and then y.
print(x, y, sep='\n')

[[ 5]
 [15]
 [25]
 [35]
 [45]
 [55]]
[ 5 20 14 32 22 38]


In [5]:
# Create a model and fit it
# Instantiate the estimator, then train it on the data. fit() returns the
# estimator itself, so `model` is the fitted LinearRegression either way.
# The result is re-assigned (rather than left as a bare expression) so the
# cell produces no output.
model = LinearRegression()
model = model.fit(X=x, y=y)

In [6]:
# Get results
# score() is evaluated on the same x, y the model was fitted on, so this is
# the training-set R² (coefficient of determination), not an estimate of how
# well the model generalizes to new data.
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.715875613747954


In [7]:
# Fitted parameters of the line.
# intercept_ is the predicted response at x = 0; coef_ holds one slope per
# input column, so it is a single-element array here (x has one column).
print('intercept:', model.intercept_)
print('slope:', model.coef_)

intercept: 5.633333333333329
slope: [0.54]


In [8]:
# Predict response
# predict() returns a 1-D array with one fitted value per row of x.
# The same numbers by hand: model.intercept_ + model.coef_[0] * x.ravel()
# (model.intercept_ + model.coef_ * x gives those values too, but as a
# 2-D column of shape (6, 1) instead of a flat array).
y_pred = model.predict(x)

# Label first, array on the following line.
print('predicted response:')
print(y_pred)

predicted response:
[ 8.33333333 13.73333333 19.13333333 24.53333333 29.93333333 35.33333333]


In [9]:
# Predict for inputs 0..4. Like the training x, the new inputs are shaped as
# a 2-D column (one row per observation).
x_new = np.arange(5)[:, np.newaxis]
y_new = model.predict(x_new)

# Inputs first, predictions second, each on its own line(s).
print(x_new, y_new, sep='\n')

[[0]
 [1]
 [2]
 [3]
 [4]]
[5.63333333 6.17333333 6.71333333 7.25333333 7.79333333]


In [10]:
# Multiple Linear Regression
import numpy as np
from sklearn.linear_model import LinearRegression

# Eight observations with two predictors each: one row per observation,
# one column per predictor. Arrays are built directly from the literals.
x = np.array([
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

print(x, y, sep='\n')

[[ 0  1]
 [ 5  1]
 [15  2]
 [25  5]
 [35 11]
 [45 15]
 [55 34]
 [60 35]]
[ 4  5 20 14 32 22 38 43]


In [11]:
# Fit a linear model on the two-predictor data. fit() returns the estimator
# itself; re-assigning keeps the cell free of output.
model = LinearRegression()
model = model.fit(X=x, y=y)

In [12]:
# R² measured on the same x, y used for fitting (training-set score),
# followed by the fitted parameters.
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)
# coef_ has two entries here: one slope per column of x.
print('slope:', model.coef_)

coefficient of determination: 0.8615939258756775
intercept: 5.52257927519819
slope: [0.44706965 0.25502548]


In [13]:
# Fitted values for the eight training rows (1-D array, one value per row).
# The same numbers by hand: model.intercept_ + x @ model.coef_
# Note: model.coef_ * x alone only multiplies element-wise and yields an
# (8, 2) array; the products still have to be summed across each row.
y_pred = model.predict(x)

# Label first, array on the following line(s).
print('predicted response:')
print(y_pred)

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [23]:
# Polynomial Regression
# Step 1: Import packages
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Step 2a: Provide data
# Eight observations, two predictors per row; y holds the matching responses.
x = [[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]]
y = [4, 5, 20, 14, 32, 22, 38, 43]
x, y = np.array(x), np.array(y)

# Step 2b: Transform input data
# Keep the fitted transformer in a name instead of discarding it: any new
# inputs have to go through the very same transformation
# (transformer.transform(x_new)) before they can be passed to model.predict.
# With degree=2 and include_bias=False, each row [x1, x2] is expanded to
# [x1, x2, x1^2, x1*x2, x2^2]; the constant column is left out because
# LinearRegression fits its own intercept.
transformer = PolynomialFeatures(degree=2, include_bias=False)
x_ = transformer.fit_transform(x)

# Step 3: Create a model and fit it
# The model is still linear in its weights; it is simply trained on the
# expanded features x_ rather than on x.
model = LinearRegression().fit(x_, y)

# Step 4: Get results
# R² is computed on the training data (x_, y), so it only measures the fit
# to known data.
r_sq = model.score(x_, y)
intercept, coefficients = model.intercept_, model.coef_

# Step 5: Predict
# Predictions must be made on transformed inputs (x_), never on the raw x.
y_pred = model.predict(x_)

In [24]:
# Training-set R² of the degree-2 model (r_sq comes from model.score(x_, y)).
# A higher value than the plain linear fit is expected with more terms; on its
# own it does not show that the model generalizes better to new data.
print('coefficient of determination:', r_sq)

coefficient of determination: 0.9453701449127822


In [25]:
# Intercept of the polynomial model (copied from model.intercept_ when the
# model was fitted).
print('intercept:', intercept)

intercept: 0.8430556452395734


In [26]:
# Fitted weights of the polynomial model, one per column of the transformed
# inputs x_. Label on its own line, array underneath.
print('coefficients:')
print(coefficients)

coefficients:
[ 2.44828275  0.16160353 -0.15259677  0.47928683 -0.4641851 ]


In [27]:
# Fitted values of the polynomial model for the training rows.
# Label on its own line, array underneath.
print('predicted response:')
print(y_pred)

predicted response:
[ 0.54047408 11.36340283 16.07809622 15.79139    29.73858619 23.50834636
 39.05631386 41.92339046]
