In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# === Step 1: Load the training data ===
train_df = pd.read_csv("train.csv")

# Predictor columns and the response column used by the model.
features = ['GrLivArea', 'BedroomAbvGr', 'FullBath']
target = 'SalePrice'

# Narrow the frame to the modelling columns, then discard every row that has
# a missing value in any of them.
train_df = train_df.loc[:, [*features, target]].dropna()

# Split the cleaned frame into the feature matrix and the target vector.
X, y = train_df[features], train_df[target]

# === Step 2: Train/test split to evaluate ===
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows and report its parameters.
y_pred = model.predict(X_val)

val_mse = mean_squared_error(y_val, y_pred)
print("Validation MSE:", val_mse)

val_r2 = r2_score(y_val, y_pred)
print("Validation R² Score:", val_r2)

print("Intercept:", model.intercept_)

# Pair each learned weight with the column it belongs to.
coef_by_feature = {
    column: weight for column, weight in zip(X.columns, model.coef_)
}
print("Coefficients:", coef_by_feature)

# === Step 3: Load test data for prediction ===
test_df = pd.read_csv("test.csv")

# Impute gaps in the test features with the per-column medians of the
# (already cleaned) training frame; fillna matches the medians to columns
# by label.
train_medians = train_df[features].median()
X_test = test_df.loc[:, features].fillna(train_medians)

# Run the fitted model over the imputed test features.
test_predictions = model.predict(X_test)

# === Step 4: Prepare submission file ===
# Start from the original test IDs, then attach the predicted prices as a
# second column.
submission = pd.DataFrame({'Id': test_df['Id']})
submission['SalePrice'] = test_predictions

# Write the two-column table without the index column.
submission.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")


Validation MSE: 2806426667.247852
Validation R² Score: 0.6341189942328374
Intercept: 52261.74862694448
Coefficients: {'GrLivArea': 104.02630701226447, 'BedroomAbvGr': -26655.16535734127, 'FullBath': 30014.32410895662}
Predictions saved to submission.csv
