In [1]:
import pandas as pd
import numpy as np

Kita ingin membuat persamaan linear sebagai berikut

$$
\text{y} = \beta_{0}+\beta_{1} \cdot \text{x1} + \beta_{2} \cdot \text{x2}  + e
$$

In [4]:
data = {'x1': [1, 2, 3, 4, 5],
        'x2': [2, 3, 4, 5, 6],
        'y': ['A', 'B', 'C', 'B', 'A']}
df = pd.DataFrame(data)
df

Unnamed: 0,x1,x2,y
0,1,2,A
1,2,3,B
2,3,4,C
3,4,5,B
4,5,6,A


Persamaan

$$
\begin{align*}
A &=\beta_{0}+ \beta_{1} \cdot 2+ \beta_{2} \cdot 1 + e_{1}\\
B &=\beta_{0}+ \beta_{1} \cdot 3 + \beta_{2} \cdot 2 + e_{2} \\
&\vdots
\end{align*}
$$

Dinotasikan dalam variable
$$
y = X \cdot \theta + e
$$

Untuk menyelesaikan ini, kita membuat MSE
$$
\begin{align*}
\text{MSE} &= \cfrac{1}{n} \sum_{i=1}^{n} e_{i}^2\\
\text{MSE} &= \cfrac{1}{n} (y-X\cdot\theta)^{2}
\end{align*}
$$

jika ingin mencari parameter model, $\theta agar MSE minimum maka perlu diturunkan

$$
\begin{align*}
\cfrac{\partial \text{MSE}}{\partial \theta} &= 0 \\
0 &= \cfrac{\partial}{\partial \theta} \cfrac{1}{n} (y-X\cdot\theta)^{2} \\
&= \cfrac{2}{n} (-X^{T}) (y - X \cdot \theta) \\
&= X^{T}y - (X^{T}X)\theta \\
\theta_{\text{optimum}} &= (X^{T}X)^{-1} X^{T} y
\end{align*}
$$

In [7]:
df.insert(0, 'x0', 1)
df

Unnamed: 0,x0,x1,x2,y
0,1,1,2,A
1,1,2,3,B
2,1,3,4,C
3,1,4,5,B
4,1,5,6,A


In [8]:
data = pd.get_dummies(df, columns=['y'])
data

Unnamed: 0,x0,x1,x2,y_A,y_B,y_C
0,1,1,2,1,0,0
1,1,2,3,0,1,0
2,1,3,4,0,0,1
3,1,4,5,0,1,0
4,1,5,6,1,0,0


In [9]:
#Split data
# Ambil 80% dari jumlah baris dataframe
persentase = 0.8
jumlah_baris = int(persentase * len(data))
data_train = data.sample(n=jumlah_baris, random_state=123)
data_train

Unnamed: 0,x0,x1,x2,y_A,y_B,y_C
1,1,2,3,0,1,0
3,1,4,5,0,1,0
4,1,5,6,1,0,0
0,1,1,2,1,0,0


In [10]:
X_train = data_train.iloc[:, 0:(len(data.columns) - df['y'].nunique())]
X_train

Unnamed: 0,x0,x1,x2
1,1,2,3
3,1,4,5
4,1,5,6
0,1,1,2


In [11]:
Y_train = data_train.iloc[:, (len(data.columns) - df['y'].nunique()):]
Y_train

Unnamed: 0,y_A,y_B,y_C
1,0,1,0
3,0,1,0
4,1,0,0
0,1,0,0


In [12]:
data_test = df.drop(data_train.index)
data_test

Unnamed: 0,x0,x1,x2,y
2,1,3,4,C


In [13]:
X_test = data_test.iloc[:, 0:(len(data.columns) - df['y'].nunique())]
X_test

Unnamed: 0,x0,x1,x2
2,1,3,4


In [14]:
Y_test = data_test.iloc[:, (len(data.columns) - df['y'].nunique()):]
Y_test

Unnamed: 0,y
2,C


Estimasi theta
$$
\begin{align*}
\theta_{\text{optimum}} &= (X^{T}X)^{-1} X^{T} y
\end{align*}
$$

In [15]:
#Estimasi theta
theta = (pd.DataFrame(np.linalg.pinv(((X_train.T).dot(X_train)).values), ((X_train.T).dot(X_train)).columns, ((X_train.T).dot(X_train)).index).dot((X_train.T).dot(X_train))).dot((X_train.T).dot(Y_train))
theta

Unnamed: 0,y_A,y_B,y_C
x0,2.0,2.0,0.0
x1,6.0,6.0,0.0
x2,8.0,8.0,0.0


In [16]:
yhat = X_test.dot(theta)
yhat

Unnamed: 0,y_A,y_B,y_C
2,52.0,52.0,0.0


In [17]:
# Mengidentifikasi kolom dengan nilai maksimum dalam setiap baris
max_columns = yhat.idxmax(axis=1)

# Membuat DataFrame baru yang berisi 1 hanya pada kolom dengan nilai maksimum dalam setiap baris
dummy_df = pd.DataFrame(0, columns=yhat.columns, index=range(len(yhat)))
for i, col in enumerate(max_columns):
    dummy_df.at[i, col] = 1

# Menampilkan hasil DataFrame dummy
print(dummy_df)

   y_A  y_B  y_C
0    1    0    0


In [18]:
# Mengidentifikasi kolom dengan nilai maksimum dalam setiap baris
max_columns = dummy_df.idxmax(axis=1)

# Membuat DataFrame baru yang berisi 1 hanya pada kolom dengan nilai maksimum dalam setiap baris
dummy_df = pd.DataFrame(0, columns=yhat.columns, index=range(len(yhat)))
for i, col in enumerate(max_columns):
    dummy_df.at[i, col] = 1

# Menggabungkan kolom dummy menjadi satu kolom kategori 'A', 'B', atau 'C'
dummy_df['Category'] = dummy_df.apply(lambda row: 'A' if row['y_A'] == 1 else ('B' if row['y_B'] == 1 else 'C'), axis=1)

# Menampilkan hasil
print(dummy_df)

   y_A  y_B  y_C Category
0    1    0    0        A


In [19]:
Y_test['y']

2    C
Name: y, dtype: object

In [23]:
# Hitung frekuensi kemunculan setiap nilai dalam data_test dan dummy_df
frekuensi_data_test = data_test['y'].value_counts()
frekuensi_dummy_df = dummy_df['Category'].value_counts()

# Ambil nilai yang ada dalam keduanya dan jumlahkan frekuensinya
kemiripan = (frekuensi_data_test.index & frekuensi_dummy_df.index).tolist()
jumlah_kemiripan = sum(frekuensi_data_test[k] for k in kemiripan)

#Hitung persentase yang sesuai
jumlah_kemiripan*100/len(data_test)

  kemiripan = (frekuensi_data_test.index & frekuensi_dummy_df.index).tolist()


0.0

Contoh lain dari data iris

In [24]:
df = pd.read_csv('iris_data.csv')
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [25]:
# Buat dummy
df.insert(0, 'x0', 1)
data = pd.get_dummies(df, columns=['variety'])
data

Unnamed: 0,x0,sepal.length,sepal.width,petal.length,petal.width,variety_Setosa,variety_Versicolor,variety_Virginica
0,1,5.1,3.5,1.4,0.2,1,0,0
1,1,4.9,3.0,1.4,0.2,1,0,0
2,1,4.7,3.2,1.3,0.2,1,0,0
3,1,4.6,3.1,1.5,0.2,1,0,0
4,1,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...,...
145,1,6.7,3.0,5.2,2.3,0,0,1
146,1,6.3,2.5,5.0,1.9,0,0,1
147,1,6.5,3.0,5.2,2.0,0,0,1
148,1,6.2,3.4,5.4,2.3,0,0,1


In [26]:
# Split data, ambil 80% dari jumlah baris dataframe
persentase = 0.8
jumlah_baris = int(persentase * len(data))
data_train = data.sample(n=jumlah_baris, random_state=123)
data_test = df.drop(data_train.index)
X_train = data_train.iloc[:, 0:(len(data.columns) - df['variety'].nunique())]
Y_train = data_train.iloc[:, (len(data.columns) - df['variety'].nunique()):]
X_test = data_test.iloc[:, 0:(len(data.columns) - df['variety'].nunique())]
Y_test = data_test.iloc[:, (len(data.columns) - df['variety'].nunique()):]

In [27]:
beta = (pd.DataFrame(np.linalg.pinv(((X_train.T).dot(X_train)).values), ((X_train.T).dot(X_train)).columns, ((X_train.T).dot(X_train)).index).dot((X_train.T).dot(X_train))).dot((X_train.T).dot(Y_train))
beta

Unnamed: 0,variety_Setosa,variety_Versicolor,variety_Virginica
x0,44.0,39.0,37.0
sepal.length,220.6,234.3,246.1
sepal.width,150.7,108.6,109.6
petal.length,64.6,166.1,207.9
petal.width,11.1,51.8,74.5


In [28]:
yhat = X_test.dot(beta)
yhat

Unnamed: 0,variety_Setosa,variety_Versicolor,variety_Virginica
2,1649.26,1714.02,1829.56
17,1790.28,1862.11,1989.12
32,1907.0,1956.95,2085.38
39,1780.56,1862.68,1991.5
47,1633.66,1707.2,1825.74
49,1736.97,1811.78,1935.14
55,2028.51,2493.38,2779.05
57,1710.9,2047.64,2266.5
66,2038.81,2502.03,2791.26
67,2006.33,2423.97,2687.19


In [29]:
# Mengidentifikasi kolom dengan nilai maksimum dalam setiap baris
max_columns = yhat.idxmax(axis=1)

# Membuat DataFrame baru yang berisi 1 hanya pada kolom dengan nilai maksimum dalam setiap baris
dummy_df = pd.DataFrame(0, columns=yhat.columns, index=range(len(yhat)))
for i, col in enumerate(max_columns):
    dummy_df.at[i, col] = 1

# Menampilkan hasil DataFrame dummy
print(dummy_df)

    variety_Setosa  variety_Versicolor  variety_Virginica
0                0                   0                  1
1                0                   0                  1
2                0                   0                  1
3                0                   0                  1
4                0                   0                  1
5                0                   0                  1
6                0                   0                  1
7                0                   0                  1
8                0                   0                  1
9                0                   0                  1
10               0                   0                  1
11               0                   0                  1
12               0                   0                  1
13               0                   0                  1
14               0                   0                  1
15               0                   0                  1
16            

In [30]:
# Mengidentifikasi kolom dengan nilai maksimum dalam setiap baris
max_columns = dummy_df.idxmax(axis=1)

# Membuat DataFrame baru yang berisi 1 hanya pada kolom dengan nilai maksimum dalam setiap baris
dummy_df = pd.DataFrame(0, columns=yhat.columns, index=range(len(yhat)))
for i, col in enumerate(max_columns):
    dummy_df.at[i, col] = 1

# Menggabungkan kolom dummy menjadi satu kolom kategori 'A', 'B', atau 'C'
dummy_df['Category'] = dummy_df.apply(lambda row: 'Setosa' if row['variety_Setosa'] == 1 else ('Virginica' if row['variety_Virginica'] == 1 else 'Versicolor'), axis=1)

# Menampilkan hasil
print(dummy_df)

    variety_Setosa  variety_Versicolor  variety_Virginica   Category
0                0                   0                  1  Virginica
1                0                   0                  1  Virginica
2                0                   0                  1  Virginica
3                0                   0                  1  Virginica
4                0                   0                  1  Virginica
5                0                   0                  1  Virginica
6                0                   0                  1  Virginica
7                0                   0                  1  Virginica
8                0                   0                  1  Virginica
9                0                   0                  1  Virginica
10               0                   0                  1  Virginica
11               0                   0                  1  Virginica
12               0                   0                  1  Virginica
13               0                

In [32]:
# Hitung frekuensi kemunculan setiap nilai dalam data_test dan dummy_df
frekuensi_data_test = data_test['variety'].value_counts()
frekuensi_dummy_df = dummy_df['Category'].value_counts()

# Ambil nilai yang ada dalam keduanya dan jumlahkan frekuensinya
kemiripan = (frekuensi_data_test.index & frekuensi_dummy_df.index).tolist()
jumlah_kemiripan = sum(frekuensi_data_test[k] for k in kemiripan)

#Hitung persentase benar
jumlah_kemiripan*100/len(data_test)

  kemiripan = (frekuensi_data_test.index & frekuensi_dummy_df.index).tolist()


43.333333333333336