In [2]:
from google.colab import files
import pandas as pd

# Upload the dataset
uploaded = files.upload()

# Load the training dataset
train_data = pd.read_csv('train.csv')

# Display the first few rows of the training dataset
train_data.head()


Saving data_description.txt to data_description.txt
Saving sample_submission.csv to sample_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Checking for missing values
missing_values = train_data.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
print(missing_values)

# Fill categorical columns with "None"
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual']
train_data[none_cols] = train_data[none_cols].fillna('None')

# Fill GarageYrBlt and MasVnrArea with 0 (numerical columns)
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(0)
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(0)

# Fill LotFrontage with the median value
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(train_data['LotFrontage'].median())

# Fill MasVnrType and Electrical with the mode (most frequent value)
train_data['MasVnrType'] = train_data['MasVnrType'].fillna(train_data['MasVnrType'].mode()[0])
train_data['Electrical'] = train_data['Electrical'].fillna(train_data['Electrical'].mode()[0])

# Check if there are any remaining missing values
missing_values_after = train_data.isnull().sum()
print(missing_values_after[missing_values_after > 0])


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64
Series([], dtype: int64)


In [4]:
# Feature Engineering: Create new features based on domain knowledge
train_data['TotalLivingArea'] = train_data['GrLivArea'] + train_data['TotalBsmtSF']
train_data['BathroomsPerBedroom'] = (train_data['FullBath'] + train_data['HalfBath']) / (train_data['BedroomAbvGr'] + 1)
train_data['TotalPorchArea'] = train_data['OpenPorchSF'] + train_data['EnclosedPorch'] + train_data['3SsnPorch'] + train_data['ScreenPorch']
train_data['HouseAge'] = train_data['YrSold'] - train_data['YearBuilt']

# Display the first few rows to see the new features
train_data[['TotalLivingArea', 'BathroomsPerBedroom', 'TotalPorchArea', 'HouseAge']].head()


Unnamed: 0,TotalLivingArea,BathroomsPerBedroom,TotalPorchArea,HouseAge
0,2566,0.75,61,5
1,2524,0.5,0,31
2,2706,0.75,42,7
3,2473,0.25,307,91
4,3343,0.6,84,8


In [5]:
# One-hot encode categorical variables
train_data = pd.get_dummies(train_data)

# Display the first few rows to see the changes
train_data.head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,True,False,False,False,False,True,False


In [6]:
from sklearn.model_selection import train_test_split

# Define the target variable (SalePrice) and features (all other columns except Id and SalePrice)
X = train_data.drop(['SalePrice', 'Id'], axis=1)
y = train_data['SalePrice']

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")


X_train shape: (1168, 305)
X_val shape: (292, 305)
y_train shape: (1168,)
y_val shape: (292,)


In [7]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Display the shapes of the scaled data to ensure everything is correct
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_val_scaled shape: {X_val_scaled.shape}")


X_train_scaled shape: (1168, 305)
X_val_scaled shape: (292, 305)


In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Initialize the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)

# Define the parameter grid to search for the best hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

# Perform a grid search with cross-validation to find the best parameters
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit the grid search model
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 300}


In [10]:
# Initialize the Gradient Boosting Regressor with the best parameters
best_gb_model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
)

# Train the model with the optimal parameters on the scaled training data
best_gb_model.fit(X_train_scaled, y_train)


In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [13]:
# Predict on the validation data
y_pred_gb = best_gb_model.predict(X_val_scaled)

# Calculate the Mean Squared Error (MSE) and Mean Absolute Error (MAE) for Gradient Boosting
mse_gb = mean_squared_error(y_val, y_pred_gb)
mae_gb = mean_absolute_error(y_val, y_pred_gb)

# Print the performance metrics
print(f"Gradient Boosting Mean Squared Error (MSE): {mse_gb}")
print(f"Gradient Boosting Mean Absolute Error (MAE): {mae_gb}")


Gradient Boosting Mean Squared Error (MSE): 702417374.1551814
Gradient Boosting Mean Absolute Error (MAE): 15844.719877914804


In [14]:
# Load the test dataset
test_data = pd.read_csv('test.csv')

# Fill categorical columns with "None"
test_data[none_cols] = test_data[none_cols].fillna('None')

# Fill GarageYrBlt and MasVnrArea with 0 (numerical columns)
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(0)
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(0)

# Fill LotFrontage with the median value
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(train_data['LotFrontage'].median())

# Fill MasVnrType and Electrical with the mode (most frequent value)
test_data['MasVnrType'] = test_data['MasVnrType'].fillna(test_data['MasVnrType'].mode()[0])
test_data['Electrical'] = test_data['Electrical'].fillna(test_data['Electrical'].mode()[0])

# Feature Engineering: Create new features for test data
test_data['TotalLivingArea'] = test_data['GrLivArea'] + test_data['TotalBsmtSF']
test_data['BathroomsPerBedroom'] = (test_data['FullBath'] + test_data['HalfBath']) / (test_data['BedroomAbvGr'] + 1)
test_data['TotalPorchArea'] = test_data['OpenPorchSF'] + test_data['EnclosedPorch'] + test_data['3SsnPorch'] + test_data['ScreenPorch']
test_data['HouseAge'] = test_data['YrSold'] - test_data['YearBuilt']

# One-hot encode categorical variables in test data
test_data = pd.get_dummies(test_data)

# Ensure that the test dataset has the same columns as the training dataset
missing_cols = set(X.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0

# Ensure the test data columns are in the same order as training data
test_data = test_data[X.columns]


In [17]:
# Check for any remaining missing values in the test data
missing_test_values = test_data.isnull().sum()
print(missing_test_values[missing_test_values > 0])


BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
GarageCars         1
GarageArea         1
TotalLivingArea    1
dtype: int64


In [19]:
# Fill missing values in the test dataset with the median for numerical columns
test_data['BsmtFinSF1'] = test_data['BsmtFinSF1'].fillna(test_data['BsmtFinSF1'].median())
test_data['BsmtFinSF2'] = test_data['BsmtFinSF2'].fillna(test_data['BsmtFinSF2'].median())
test_data['BsmtUnfSF'] = test_data['BsmtUnfSF'].fillna(test_data['BsmtUnfSF'].median())
test_data['TotalBsmtSF'] = test_data['TotalBsmtSF'].fillna(test_data['TotalBsmtSF'].median())
test_data['BsmtFullBath'] = test_data['BsmtFullBath'].fillna(test_data['BsmtFullBath'].median())
test_data['BsmtHalfBath'] = test_data['BsmtHalfBath'].fillna(test_data['BsmtHalfBath'].median())
test_data['GarageCars'] = test_data['GarageCars'].fillna(test_data['GarageCars'].median())
test_data['GarageArea'] = test_data['GarageArea'].fillna(test_data['GarageArea'].median())
test_data['TotalLivingArea'] = test_data['TotalLivingArea'].fillna(test_data['TotalLivingArea'].median())


In [20]:
# Check if there are any remaining missing values
missing_test_values_after = test_data.isnull().sum()
print(missing_test_values_after[missing_test_values_after > 0])


Series([], dtype: int64)


In [23]:
# Scale the test data
X_test_scaled = scaler.transform(test_data)

# Check the shape of the test data
print(f"X_test_scaled shape: {X_test_scaled.shape}")


X_test_scaled shape: (1459, 305)


In [24]:
# Check for any remaining NaN values
print(test_data.isnull().sum())


MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
OverallCond              0
                        ..
SaleCondition_AdjLand    0
SaleCondition_Alloca     0
SaleCondition_Family     0
SaleCondition_Normal     0
SaleCondition_Partial    0
Length: 305, dtype: int64


In [26]:
# Make predictions using the best Gradient Boosting model
test_predictions = best_gb_model.predict(X_test_scaled)

try:
    # Make predictions using the best Gradient Boosting model
    test_predictions = best_gb_model.predict(X_test_scaled)
    print("Predictions successful!")
    print(test_predictions[:10])  # Print the first few predictions to verify
except Exception as e:
    print(f"An error occurred: {e}")



Predictions successful!
[127727.27202865 164820.65205789 189866.02145978 185249.69001379
 175191.53677886 175003.60187911 170456.83387417 165688.87688353
 190453.14150469 126506.4602211 ]


In [27]:
# Load the test data again to get the 'Id' column
test_data_original = pd.read_csv('test.csv')

# Create a DataFrame for submission
submission = pd.DataFrame({
    'Id': test_data_original['Id'],
    'SalePrice': test_predictions
})

# Display the first few rows of the submission DataFrame
submission.head()


Unnamed: 0,Id,SalePrice
0,1461,127727.272029
1,1462,164820.652058
2,1463,189866.02146
3,1464,185249.690014
4,1465,175191.536779


In [28]:
# Save the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

# Download the submission file (if running on Google Colab)
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>