In [9]:
from google.colab import files
uploaded = files.upload()


Saving test.csv to test.csv
Saving train.csv to train.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import skew
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [12]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


In [13]:
train_ID = train['Id']
test_ID = test['Id']
y = train['SalePrice']
train.drop(['SalePrice'], axis=1, inplace=True)


In [14]:
full_data = pd.concat([train, test], axis=0).reset_index(drop=True)


In [15]:
full_data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=1, inplace=True)

In [16]:
for col in full_data.columns:
    if full_data[col].dtype == "object":
        full_data[col] = full_data[col].fillna(full_data[col].mode()[0])
    else:
        full_data[col] = full_data[col].fillna(full_data[col].median())

In [17]:
ordinal_cols = ('BsmtQual', 'BsmtCond', 'GarageFinish', 'GarageQual', 'GarageCond',
                'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'LandSlope',
                'LotShape', 'PavedDrive', 'Street', 'CentralAir',
                'Functional', 'BsmtExposure', 'GarageType', 'SaleCondition', 'SaleType')

for col in ordinal_cols:
    lbl = LabelEncoder()
    full_data[col] = lbl.fit_transform(full_data[col].astype(str))

In [18]:
full_data = pd.get_dummies(full_data)


In [19]:
full_data['TotalSF'] = full_data['TotalBsmtSF'] + full_data['1stFlrSF'] + full_data['2ndFlrSF']
full_data['TotalBath'] = (full_data['FullBath'] + 0.5 * full_data['HalfBath'] +
                          full_data['BsmtFullBath'] + 0.5 * full_data['BsmtHalfBath'])
full_data['HouseAge'] = full_data['YrSold'] - full_data['YearBuilt']
full_data['RemodelAge'] = full_data['YrSold'] - full_data['YearRemodAdd']

In [22]:
from scipy.stats import skew

numeric_feats = full_data.select_dtypes(include=['int64', 'float64']).columns

skewed_feats = full_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

skewed = skewed_feats[abs(skewed_feats) > 0.75].index

for feat in skewed:
    full_data[feat] = np.log1p(full_data[feat])


In [23]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(full_data)

In [33]:
X = scaled_data[:len(y)]
X_test = scaled_data[len(y):]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_valid shape:", X_valid.shape)
print("X_test shape:", X_test.shape)

X_train shape: (1168, 208)
X_valid shape: (292, 208)
X_test shape: (1459, 208)


In [37]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate(model, X_valid, y_valid):
    preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    return rmse

models = {}

ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)
models['Ridge'] = evaluate(ridge, X_valid, y_valid)

lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)
models['Lasso'] = evaluate(lasso, X_valid, y_valid)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
models['Random Forest'] = evaluate(rf, X_valid, y_valid)

xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
models['XGBoost'] = evaluate(xgb, X_valid, y_valid)

print("🔍 RMSE Scores:")
for name, score in models.items():
    print(f"{name}: {score:.4f}")

best_model = xgb

test_preds = best_model.predict(X_test)

submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': test_preds
})

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv created!")


  model = cd_fast.enet_coordinate_descent(


🔍 RMSE Scores:
Ridge: 30801.7612
Lasso: 30863.6437
Random Forest: 29579.0537
XGBoost: 25587.9922
✅ submission.csv created!
