# PRASUNET_ML_01
### Implement a linear regression model to predict the prices of houses based on their square footage and the number of bedrooms and bathrooms.

Installing Required Modules

In [2]:
# Install scikit-learn and xgboost into the kernel's environment via the %pip magic.
# NOTE(review): versions are unpinned, so a re-run may pull different releases — consider pinning.
%pip install scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


### 1. Libraries

In [28]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### 2. Load Dataset

In [29]:
# Locations of the training and test CSVs, relative to the notebook's working directory.
train_csv = '../house-prices-advanced-regression-techniques/train.csv'
test_csv = '../house-prices-advanced-regression-techniques/test.csv'

# Load each file into its own DataFrame.
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

### 3. Feature Engineering

In [30]:
# Add the same two derived columns to both frames so train and test stay aligned.
for frame in (train_data, test_data):
    # Each half bath is weighted as 0.5 of a full bath.
    frame['TotalBathrooms'] = frame['FullBath'] + 0.5 * frame['HalfBath']
    # Sum of the GrLivArea and TotalBsmtSF columns.
    frame['TotalSqFootage'] = frame['GrLivArea'] + frame['TotalBsmtSF']

### 4. Feature Selection

In [31]:
# Column being predicted, and the predictor columns passed to every model.
target = 'SalePrice'
features = [
    'GrLivArea',
    'BedroomAbvGr',
    'FullBath',
    'TotalBathrooms',
    'TotalSqFootage',
]

# Slice the training frame into predictors (X) and response (y).
X, y = train_data[features], train_data[target]


### 5. Splitting the Data

In [32]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts


### 6. Defining the Regression Models

In [33]:
# Regularised linear regressors.
linear_models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
}

# Tree-ensemble regressors, each seeded for repeatable fits.
ensemble_models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=0),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=0),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=0),
}

# Registry of candidate models keyed by display name (linear first, then ensembles).
models = {**linear_models, **ensemble_models}

### 7. Training and Evaluating the Models (pipeline with imputation, scaling, and model fitting)

In [34]:
# Validation RMSE per model name, filled in as each candidate is scored.
results = {}
for name, model in models.items():
    # Same preprocessing for every candidate: median imputation, then standardisation.
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', model),
    ]
    pipeline = Pipeline(steps=steps)

    # Fit on the training split and predict the held-out validation rows.
    predictions = pipeline.fit(X_train, y_train).predict(X_valid)

    # Root of the mean squared error on the validation split.
    mse = mean_squared_error(y_valid, predictions)
    rmse = np.sqrt(mse)

    results[name] = rmse
    print(f'{name} RMSE: {rmse}')


Ridge RMSE: 61387.19458731824
Lasso RMSE: 61440.863737728716
ElasticNet RMSE: 59512.652284546595
RandomForest RMSE: 41461.34143977407
GradientBoosting RMSE: 38571.74851288642
XGBoost RMSE: 42773.53292010976


### 8. Best Model Selection and Prediction

In [38]:
# Name of the candidate with the lowest validation RMSE.
best_model_name = min(results, key=results.get)

# Clone the winner so the final fit trains a fresh, unfitted estimator with the
# same hyperparameters. Fitting the instance stored in `models` directly would
# overwrite its validation-time state with a fit that has seen the validation rows.
best_model = clone(models[best_model_name])

# Final pipeline: same preprocessing as during evaluation, wrapped around the clone.
final_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', best_model)
])

# Fit on the full training data (train + validation rows), then predict the test set.
final_pipeline.fit(X, y)
X_test = test_data[features]
test_predictions = final_pipeline.predict(X_test)


### 9. Submit Predictions

In [39]:
# Pair each test-row Id with its predicted sale price.
submission_columns = {
    'Id': test_data['Id'],
    'SalePrice': test_predictions,
}
submission = pd.DataFrame(submission_columns)

# Write the two-column submission file without the DataFrame index.
submission.to_csv('linear_reg2_submission.csv', index=False)