In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

neighborhood_quality = {
    'NAmes': 5, 'CollgCr': 7, 'OldTown': 4, 'Edwards': 6, 'Somerst': 8, 'NridgHt': 9, 'Sawyer': 5, 'Mitchel': 6
}

train_data['NeighborhoodQuality'] = train_data['Neighborhood'].map(neighborhood_quality)
test_data['NeighborhoodQuality'] = test_data['Neighborhood'].map(neighborhood_quality)

features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GarageCars', 'GarageArea',
            'YearBuilt', 'YearRemodAdd', 'LotArea', 'Fireplaces', 'MasVnrArea', 'BsmtFinSF1', 'NeighborhoodQuality']

X = train_data[features]
y = train_data['SalePrice']
X_test = test_data[features]

imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=features)
X_test = pd.DataFrame(imputer.transform(X_test), columns=features)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1, max_iter=10000),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_pred)
    print(f'{name} Mean Squared Error: {mse}')

best_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
best_model.fit(X_train, y_train)

test_predictions = best_model.predict(X_test_scaled)
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)


Linear Regression Mean Squared Error: 1542762877.2743092
Ridge Regression Mean Squared Error: 1542496805.5189278
Lasso Regression Mean Squared Error: 1542768479.9953063
Random Forest Mean Squared Error: 884865297.4077332
Gradient Boosting Mean Squared Error: 819480224.9144384
