# 08 — Intro to Machine Learning (Scikit-learn)

## Learning goals
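- Understand why data is split into training and test sets
- Train a basic linear regression model
- Evaluate predictions with mean absolute error (MAE) and the coefficient of determination (R²)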

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Small hand-made dataset: hours studied vs. exam score for ten students
df = pd.DataFrame(
    {
        "hours": list(range(1, 11)),
        "score": [50, 55, 60, 63, 68, 72, 78, 84, 88, 93],
    }
)

# Feature matrix stays two-dimensional (a DataFrame); the target is a Series
X, y = df[["hours"]], df["score"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows only
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)
print("MAE:", mae)
print("R²:", r2)

MAE: 0.7508250825082546
R²: 0.9945611117397111


In [2]:
# Make a prediction for a single, previously unseen number of study hours.
study_hours = 7.5

# Wrap the value in a one-row DataFrame that reuses the training columns:
# the model was fitted on a DataFrame, and passing a bare nested list makes
# scikit-learn warn that the input lacks the feature names seen during fit.
hours = pd.DataFrame([[study_hours]], columns=X_train.columns)

# predict() returns one value per input row; take the only one.
predicted_score = model.predict(hours)[0]
print(f"Predicted score for {study_hours} study hours: {predicted_score:.2f}")

Predicted score for 7.5 study hours: 80.81




In [None]:
# Practice task:
# Try a different test_size and compare MAE/R².

In [3]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Step 2: Build a reproducible toy dataset
# Goal: predict 'Score' from 'Hours_Studied'.
# Note: the two columns are drawn independently at random, so there is no real
# relationship for the model to learn — the negative R² values are expected.
np.random.seed(42)
n_students = 20
df = pd.DataFrame({
    "Hours_Studied": np.random.randint(1, 11, n_students),  # integers 1..10
    "Score": np.random.randint(50, 101, n_students),        # integers 50..100
})

# Step 3: Separate the feature matrix from the target vector
X, y = df[["Hours_Studied"]], df["Score"]

# Step 4: Fit and score one model per hold-out fraction
for test_size in (0.2, 0.3, 0.4):
    # Same seed for every fraction, so only the split size varies
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Train on the training rows, then predict the held-out rows
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics are computed on the held-out rows only
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One report per test size, closed by a separator line
    print(
        f"Test size: {test_size}",
        f"MAE: {mae:.2f}",
        f"R²: {r2:.2f}",
        "-" * 30,
        sep="\n",
    )


Test size: 0.2
MAE: 15.00
R²: -0.13
------------------------------
Test size: 0.3
MAE: 13.04
R²: -0.19
------------------------------
Test size: 0.4
MAE: 12.41
R²: -0.15
------------------------------
