In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.impute import KNNImputer

# Load training data
train_df = pd.read_csv("train.csv")

# Load test data
test_df = pd.read_csv("test.csv")

# Display first few rows
train_df.head()

print(train_df.isnull().sum().sort_values(ascending=False).head(20))  # Top missing values

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
dtype: int64


In [2]:
# Separate features and target variable for training
X_train = train_df.drop(columns=['Id', 'SalePrice'], errors='ignore')
y_train = train_df['SalePrice']

# Prepare test data (ensure 'SalePrice' is dropped from the test set)
X_test = test_df.drop(columns=['Id'], errors='ignore')

# Perform one-hot encoding on both training and test sets
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align test set with train set (ensure they have the same columns)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Handle missing values using KNN Imputer (instead of SimpleImputer)
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Apply Polynomial Features to capture interaction terms (degree 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Standardize the features (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Split training data into training and validation sets
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_sub, y_train_sub)

# Validate the model
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
print("Validation MAE:", mae)

# Perform cross-validation to get a more reliable estimate of model performance
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
print("Cross-validation MAE:", -cv_scores.mean())

# Make predictions on the test set
y_test_pred = model.predict(X_test_scaled)

# Create a submission file
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_test_pred})
submission.to_csv('submission.csv', index=False)

print("Submission file saved!")

Validation MAE: 19123.809036286973
Cross-validation MAE: 24918.387121881035
Submission file saved!
