<a href="https://colab.research.google.com/github/sahdahx/neverstoplearning/blob/master/SKLearn_05_%7C_Simple_Linear_Regression_dengan_Scikit_Learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SKLearn 05 | Simple Linear Regression dengan Scikit-Learn | Belajar Machine Learning Dasar, 12 Februari 2024**
[sumber belajar](https://youtu.be/lcjq7-2zMSA?si=uHxZuIaBA01VzT-T)

- Sample dataset
- Visualisasi dataset
- Transformasi dataset
- Training Simple Linear Regression Model
- Visualisasi Simple Linear Regression Model | Penjelasan persamaan garis linear
- Kalkulasi nilai slope
- Kalkulasi nilai intercept
- Prediksi harga pizza dengan Simple Linear Regression Model
- Evaluasi model dengan Coefficient of Determination | R Squared
- Kalkulasi nilai R Squared | Coefficient of Determination

In [None]:
# @title Sample Dataset
import pandas as pd

# Raw sample data: each position pairs one pizza diameter with its price.
pizza = dict(
    diameter=[6, 8, 10, 14, 18],
    harga=[7, 9, 13, 17.5, 18],
)

# Column-oriented dict -> DataFrame with one row per pizza.
pizza_df = pd.DataFrame.from_dict(pizza)
pizza_df

Unnamed: 0,diameter,harga
0,6,7.0
1,8,9.0
2,10,13.0
3,14,17.5
4,18,18.0


In [None]:
# @title Harga

from matplotlib import pyplot as plt

# Line plot of the price column against the row index.
ax = pizza_df['harga'].plot.line(figsize=(8, 4), title='harga')
# Hide the top and right frame edges for a cleaner chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Diameter

from matplotlib import pyplot as plt

# Line plot of the diameter column against the row index.
ax = pizza_df['diameter'].plot.line(figsize=(8, 4), title='diameter')
# Hide the top and right frame edges for a cleaner chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Diameter VS Harga

from matplotlib import pyplot as plt

# Scatter plot of price against diameter, one marker per pizza.
ax = pizza_df.plot.scatter(x='diameter', y='harga', s=32, alpha=0.8)
# Hide the top and right frame edges for a cleaner chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Diameter

from matplotlib import pyplot as plt

# Histogram of the diameter values using 20 bins.
ax = pizza_df['diameter'].plot.hist(bins=20, title='diameter')
# Hide the top and right frame edges for a cleaner chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Harga

from matplotlib import pyplot as plt

# Histogram of the price values using 20 bins.
ax = pizza_df['harga'].plot.hist(bins=20, title='harga')
# Hide the top and right frame edges for a cleaner chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Perbandingan Diameter dan Harga Pizza
import matplotlib.pyplot as plt

# Scatter plot of price against diameter on fixed 0-25 axes with a grid.
ax = pizza_df.plot.scatter(x='diameter', y='harga')
ax.set(
    title='Perbandingan Diameter dan Harga Pizza',
    xlabel='Diameter (inch)',
    ylabel='Harga (dollar)',
    xlim=(0, 25),
    ylim=(0, 25),
)
ax.grid(True)
plt.show()

In [None]:
# @title Penyesuaian Dataset
import numpy as np

# Pull each column out as a NumPy array with a leading axis of length 1,
# i.e. a single row holding all samples (shape (1, n)).
X = pizza_df['diameter'].to_numpy()[np.newaxis, :]
y = pizza_df['harga'].to_numpy()[np.newaxis, :]

for line in (f'X: {X}', f'y: {y}'):
    print(line)

X: [[ 6  8 10 14 18]]
y: [[ 7.   9.  13.  17.5 18. ]]


In [None]:
# Rearrange X into a single column (one row per sample); -1 lets NumPy
# infer the number of rows.
X = np.reshape(X, (-1, 1))
X.shape

(5, 1)

In [None]:
# Display X to inspect its contents after the reshape.
X

array([[ 6],
       [ 8],
       [10],
       [14],
       [18]])

In [None]:
# @title Training Simple Linear Regression Model
from sklearn.linear_model import LinearRegression

# Features must be 2-D (n_samples, 1): selecting with a list of columns keeps
# the column axis. The target stays 1-D.
X = pizza_df[['diameter']].to_numpy()
y = pizza_df['harga'].to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)
model

In [None]:
# @title Visualisasi Simple Linear Regression Model
import numpy as np

# Two x-values (both ends of the plotted range) are enough to draw the
# fitted straight line.
X_vis = np.array([[0], [25]])
y_vis = model.predict(X_vis)

fig, ax = plt.subplots()
ax.scatter(X, y)             # observed data points
ax.plot(X_vis, y_vis, '-r')  # fitted regression line in red

ax.set(
    title='Perbandingan Diameter dan Harga Pizza',
    xlabel='Diameter (inch)',
    ylabel='Harga (dollar)',
    xlim=(0, 25),
    ylim=(0, 25),
)
ax.grid(True)
plt.show()

In [None]:
# @title Formula Linear Regression: y = α + βx

# Report the fitted parameters: intercept_ is α, coef_ holds β (one entry per feature).
print(
    f'intercept: {model.intercept_}',
    f'slope: {model.coef_}',
    sep='\n',
)

intercept: 1.965517241379315
slope: [0.9762931]


In [None]:
# @title Mencari Nilai Slope: β = cov(x,y)/var(x)

# Show the inputs of the slope formula: X as a column, X flattened to 1-D, and y.
for line in (
    f'X:\n{X}\n',
    f'X flatten: {X.flatten()}\n',
    f'y: {y}',
):
    print(line)

X:
[[ 6]
 [ 8]
 [10]
 [14]
 [18]]

X flatten: [ 6  8 10 14 18]

y: [ 7.   9.  13.  17.5 18. ]


In [None]:
# @title Variance

# Sample variance of x: ddof=1 divides by (n - 1) instead of n.
variance_x = X.flatten().var(ddof=1)
print(f'variance: {variance_x}')

variance: 23.2


In [None]:
# @title Covariance

# 2x2 covariance matrix of x and y: variances on the diagonal,
# cov(x, y) on the off-diagonal.
np.cov(X.ravel(), y)

array([[23.2 , 22.65],
       [22.65, 24.3 ]])

In [None]:
# The off-diagonal entry (row 0, column 1) of the covariance matrix is cov(x, y).
covariance_xy = np.cov(X.flatten(), y)[0, 1]
print(f'covariance: {covariance_xy}')

covariance: 22.650000000000002


In [None]:
# @title Slope

# β = cov(x, y) / var(x)
slope = np.divide(covariance_xy, variance_x)
print(f'slope: {slope}')

slope: 0.976293103448276


In [None]:
# @title Mencari Nilai Intercept

# α = mean(y) - β * mean(x)
intercept = y.mean() - slope * X.mean()
print(f'intercept: {intercept}')

intercept: 1.9655172413793096


In [None]:
# @title Prediksi Harga Pizza

# New diameters to price, written directly as a column (one row per pizza).
diameter_pizza = np.array([[12], [20], [23]])
diameter_pizza

array([[12],
       [20],
       [23]])

In [None]:
# Predict a price for each row of diameter_pizza with the fitted model,
# then display the resulting array.
prediksi_harga = model.predict(diameter_pizza)
prediksi_harga

array([13.68103448, 21.49137931, 24.42025862])

In [None]:
# Print each requested diameter next to its predicted price.
for idx, harga in enumerate(prediksi_harga):
    diameter = diameter_pizza[idx]  # one-element row, so it prints with brackets
    print(f'Diameter:{diameter} prediksi harga: {harga}')

Diameter:[12] prediksi harga: 13.681034482758621
Diameter:[20] prediksi harga: 21.491379310344826
Diameter:[23] prediksi harga: 24.42025862068965


In [None]:
# @title Training & Testing Dataset

# Training set: diameters as a column (n_samples, 1), prices as a 1-D array.
X_train = np.array([[6], [8], [10], [14], [18]])
y_train = np.array([7.0, 9.0, 13.0, 17.5, 18.0])

# Test set used to evaluate the fitted model, in the same layout.
X_test = np.array([[8], [9], [11], [16], [12]])
y_test = np.array([11.0, 8.5, 15.0, 18.0, 11.0])

In [None]:
# @title Training Simple Linear Regression Model

# Fit a fresh model on the training split; fit() returns the estimator,
# so the final expression displays it.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# @title Evaluasi Linear Regression Model dengan Coefficient of Determination atau R-squared
from sklearn.metrics import r2_score

# Predict prices for the held-out test diameters. y_pred must be computed
# here so the cell runs on a fresh kernel (Restart & Run All).
y_pred = model.predict(X_test)

# R-squared compares the model's squared error on the test set with the
# squared error of always predicting the mean of y_test.
r_squared = r2_score(y_test, y_pred)
print(r_squared)

# The closer R-squared is to 1, the better the model explains the test targets.
# An R-squared near 0 means the model does no better than predicting the mean of y_test.
# R-squared is negative when the model is worse than that mean baseline.

0.6620052929422553


In [None]:
# @title Mencari nilai R-squared (R^2) dengan SSres dan SStot

# SS_res: sum of squared differences between each test target and the
# model's prediction for that sample (predicted one sample at a time).
ss_res = 0
for x_i, y_i in zip(X_test, y_test):
    y_hat = model.predict(x_i.reshape(-1, 1))[0]
    ss_res += (y_i - y_hat) ** 2
print(ss_res)

# SS_tot: sum of squared differences between each test target and the
# mean of the test targets.
mean_y = y_test.mean()
ss_tot = sum((y_i - mean_y) ** 2 for y_i in y_test)
print(ss_tot)

# R^2 = 1 - SS_res / SS_tot
r_squared = 1 - ss_res / ss_tot
print(r_squared)

19.1980993608799
56.8
0.6620052929422553
