# 01 – EDA & Baseline

This notebook gives you a quick start:
1. Load data
2. Basic cleaning & EDA
3. Simple baseline model (Linear Regression)
4. Save a submission file


In [None]:
# Imports
from pathlib import Path

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from src.data_prep import load_raw, basic_clean, feature_engineer
from src.metrics import rmse

# Load
# load_raw() returns two objects, bound here to `train` and `test`.
train, test = load_raw()

# Peek
# First rows, then the info() summary, then describe() with one row per column.
display(train.head(5))
train.info()
train.describe().transpose()


In [None]:
# Clean + Feature Engineering
# Both splits go through the same basic_clean -> feature_engineer pipeline.
train_clean, test_clean = (
    feature_engineer(basic_clean(frame)) for frame in (train, test)
)

# Select features (very simple baseline)
features = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
    'Outlet_Age',
    'Visibility_Sq',
    'MRP_Log',
]
# Keep only the candidates that are actually present in the cleaned training data.
features = [col for col in features if col in train_clean.columns]

X = train_clean[features].copy()
y = train_clean['Item_Outlet_Sales']

# Simple train/val split: 20% held out, fixed seed so the split is repeatable.
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_tr, y_tr)
pred_va = model.predict(X_va)

print('Validation RMSE:', rmse(y_va, pred_va))


In [None]:
# Train on full data and predict test
# Refit the same estimator on every labelled row, then score the test features.
X_full = train_clean[features]
y_full = train_clean['Item_Outlet_Sales']
model.fit(X_full, y_full)

X_test = test_clean[features]
test_pred = model.predict(X_test)

# The identifier columns below are read from the raw `test` frame while the
# predictions come from `test_clean`; fail with a clear message if the two
# no longer have the same number of rows.
assert len(test_pred) == len(test), (
    f'{len(test_pred)} predictions for {len(test)} test rows - '
    'cleaning must not add or drop test rows'
)

# Build submission
sub = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_pred
})
sub_path = 'submissions/baseline_linear_regression.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
Path(sub_path).parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_path, index=False)

# Report where the file went and end on the frame so it gets the rich display.
print('Saved submission to', sub_path)
sub.head()
