# Tugas 1: Multiple Linear Regression

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

# Load the dataset from a CSV file located in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='insurance.csv')

### 1. Identifikasi variabel-variabel yang akan digunakan sebagai variabel bebas (fitur) dan variabel target (biaya medis personal).

In [24]:
# Target is the 'charges' column; the feature frame is every remaining column.
y = data['charges']
X = data.drop('charges', axis=1)
print(X.head())

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest


### 2. Bagi dataset menjadi data latih (train) dan data uji (test) dengan proporsi yang sesuai.

In [25]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
# Column groups consumed by the preprocessing step: numeric columns are
# scaled, string-valued columns are one-hot encoded.
numerical_cols = [
    'age',
    'bmi',
    'children',
]
categorical_cols = [
    'sex',
    'smoker',
    'region',
]

### 3. Lakukan feature scaling jika diperlukan.

In [27]:
# Standardize the numeric columns.
scale_numeric = ('num', StandardScaler(), numerical_cols)
# One-hot encode the categorical columns; drop='first' omits one level per
# feature so the encoded columns are not perfectly collinear.
encode_categorical = ('cat', OneHotEncoder(drop='first'), categorical_cols)

# Apply each transformer to its own column group and concatenate the results.
preprocessor = ColumnTransformer([scale_numeric, encode_categorical])

### 4. Buat model multiple linear regression menggunakan Scikit-Learn.

In [28]:
# Chain preprocessing and linear regression so that fit/predict run both
# stages on whatever frame is passed in.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(pipeline_steps)

In [29]:
# Preview the first rows of the training and test feature frames, one after the other.
print(X_train.head(), X_test.head(), sep='\n')

      age     sex    bmi  children smoker     region
560    46  female  19.95         2     no  northwest
1285   47  female  24.32         0     no  northeast
1142   52  female  24.86         0     no  southeast
969    39  female  34.32         5     no  southeast
486    54  female  21.47         3     no  northwest
      age     sex     bmi  children smoker     region
764    45  female  25.175         2     no  northeast
887    36  female  30.020         0     no  northwest
890    64  female  26.885         0    yes  northwest
1293   46    male  25.745         3     no  northwest
259    19    male  31.920         0    yes  northwest


### 5. Latih model pada data latih dan lakukan prediksi pada data uji.

In [30]:
# Fit the preprocessing + regression pipeline on the training split created
# earlier in the notebook. The split is intentionally not recomputed here, so
# the train/test partition is defined in exactly one place.
model.fit(X_train, y_train)

# Predict charges for the held-out test rows and preview the first five values.
y_pred = model.predict(X_test)
print(y_pred[:5])

[ 8969.55027444  7068.74744287 36858.41091155  9454.67850053
 26973.17345656]


### 6. Evaluasi model dengan menghitung metrik seperti R-squared, MSE, dan MAE. Tampilkan hasil evaluasi.

In [31]:
# Score the test-set predictions with R-squared, MAE and MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report MSE, MAE and R-squared, one metric per line.
print(
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 33596915.85136148
Mean Absolute Error (MAE): 4181.194473753652
R-squared: 0.7835929767120722
