In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error

# Location of the preprocessed training table, relative to the notebook.
csv_path = "tmdb-box-office-data/train_processed.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then leave a preview of the first five rows as the
# cell's rich output.
print("Data shape:", df.shape)
df.head(5)


In [None]:

# Name of the regression target column.
TARGET_COL = "revenue_log"

# Every numeric column in the frame is a candidate feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Never feed the target to the model; also exclude "revenue" (the quantity the
# log target is derived from) and "id" whenever they show up among the numeric
# columns.
drop_cols = [TARGET_COL] + [
    name for name in ("revenue", "id") if name in numeric_cols
]

# Preserve the original column order while filtering out the excluded names.
feature_cols = [c for c in numeric_cols if c not in drop_cols]

print("Number of feature columns:", len(feature_cols))
print("Example feature columns:", feature_cols[:10])

# Independent copies so later cells cannot mutate `df` through these handles.
X = df.loc[:, feature_cols].copy()
y_log = df[TARGET_COL].copy()   # log(revenue + 1)


In [None]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_log_train, y_log_val = train_test_split(X, y_log, **split_kwargs)

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)


In [None]:


# Baseline model: standardize every feature, then fit ordinary least squares
# on the log-revenue target.
scaler_step = ("scaler", StandardScaler())
model_step = ("model", LinearRegression())
linreg_pipeline = Pipeline(steps=[scaler_step, model_step])

# Kept as a bare expression so the fitted pipeline is shown as the cell output.
linreg_pipeline.fit(X_train, y_log_train)


In [None]:

# Predict on the validation split. The model was fit on the log target, so its
# outputs are on the log(revenue + 1) scale.
y_log_pred_val = linreg_pipeline.predict(X_val)

# Fail loudly on NaN/inf inputs: the NumPy mean below would otherwise silently
# propagate them into the reported score.
if not (
    np.all(np.isfinite(y_log_val.values)) and np.all(np.isfinite(y_log_pred_val))
):
    raise ValueError("Validation targets or predictions contain NaN or infinity")

# Revenue cannot be negative. Because expm1 is increasing and expm1(0) == 0,
# clipping at 0 in log space is equivalent to clipping revenue at 0.
y_log_val_clipped = np.maximum(0, y_log_val.values)
y_log_pred_clipped = np.maximum(0, y_log_pred_val)

# Revenue-scale values, kept for inspection and for any later cell that uses
# them. NOTE: y_pred_revenue can be inf when a log-space prediction is extreme;
# the metric below does not depend on these arrays.
y_val_revenue = np.expm1(y_log_val_clipped)      # true revenue
y_pred_revenue = np.expm1(y_log_pred_clipped)    # predicted revenue

# RMSLE on revenue is exactly RMSE on log1p(revenue). Computing it directly in
# log space avoids the expm1 -> log1p round trip, which overflows to inf for
# large linear extrapolations and makes mean_squared_log_error raise.
rmsle = np.sqrt(np.mean((y_log_val_clipped - y_log_pred_clipped) ** 2))

print(f"Baseline Linear Regression RMSLE (val): {rmsle:.4f}")
