In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Load the datasets
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSI/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSI/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSI/sample_submission.csv')

# Display the first few rows of the datasets
print("Train Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())
print("\nSample Submission:")
print(sample_submission.head())

# Check for missing values and data types in the train data
print("\nTrain Data Info:")
print(train_data.info())
print("\nTest Data Info:")
print(test_data.info())


Mounted at /content/drive
Train Data:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondit

In [2]:
# Fill missing values in the training data
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(train_data['LotFrontage'].median())
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(train_data['MasVnrArea'].median())
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(train_data['GarageYrBlt'].median())

# Fill missing values in categorical columns with mode
for column in train_data.select_dtypes(include=[object]).columns:
    train_data[column] = train_data[column].fillna(train_data[column].mode()[0])

# Fill missing values in the test data using the same strategy
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(test_data['LotFrontage'].median())
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(test_data['MasVnrArea'].median())
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(test_data['GarageYrBlt'].median())

for column in test_data.select_dtypes(include=[object]).columns:
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

# Drop columns with too many missing values (example: 'Alley', 'PoolQC', etc.)
train_data = train_data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
test_data = test_data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])


In [3]:
import numpy as np

# Log transformation for skewed features
train_data['Log_GrLivArea'] = np.log1p(train_data['GrLivArea'])
test_data['Log_GrLivArea'] = np.log1p(test_data['GrLivArea'])

# Create new features (example: total number of bathrooms)
train_data['Total_Bathrooms'] = train_data['FullBath'] + 0.5 * train_data['HalfBath'] + train_data['BsmtFullBath'] + 0.5 * train_data['BsmtHalfBath']
test_data['Total_Bathrooms'] = test_data['FullBath'] + 0.5 * test_data['HalfBath'] + test_data['BsmtFullBath'] + 0.5 * test_data['BsmtHalfBath']


In [4]:
from sklearn.model_selection import train_test_split

# Define features and target
X = train_data.drop(columns=['SalePrice', 'Id'])
y = train_data['SalePrice']
X_test = test_data.drop(columns=['Id'])

# One-Hot Encoding for categorical variables
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align the train and test data to ensure they have the same columns
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the models
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)


In [6]:
# Train the Ridge model
ridge.fit(X_train, y_train)

# Train the Lasso model
lasso.fit(X_train, y_train)

# Train the Gradient Boosting Regressor model
gbr.fit(X_train, y_train)


  model = cd_fast.enet_coordinate_descent(


In [7]:
from sklearn.metrics import mean_squared_error

# Predict and evaluate the Ridge model
ridge_pred = ridge.predict(X_val)
ridge_rmse = np.sqrt(mean_squared_error(y_val, ridge_pred))

# Predict and evaluate the Lasso model
lasso_pred = lasso.predict(X_val)
lasso_rmse = np.sqrt(mean_squared_error(y_val, lasso_pred))

# Predict and evaluate the Gradient Boosting Regressor model
gbr_pred = gbr.predict(X_val)
gbr_rmse = np.sqrt(mean_squared_error(y_val, gbr_pred))

print(f"Ridge RMSE: {ridge_rmse}")
print(f"Lasso RMSE: {lasso_rmse}")
print(f"Gradient Boosting RMSE: {gbr_rmse}")


Ridge RMSE: 29453.331949610172
Lasso RMSE: 27829.0928067352
Gradient Boosting RMSE: 27496.664873266778


In [10]:
# Impute missing values in X_test using SimpleImputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Replace missing values with the mean of the column
X_test_imputed = imputer.fit_transform(X_test)

# Make predictions on the imputed test set
test_predictions = best_model.predict(X_test_imputed)




In [11]:
# Prepare the submission DataFrame
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_predictions
})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
