In [1]:
pip install pandas numpy scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.




In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [3]:
# Load the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Display basic information about the data
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Fill missing values (simplified strategy)
train.fillna(train.mean(), inplace=True)
test.fillna(test.mean(), inplace=True)

  train.fillna(train.mean(), inplace=True)
  test.fillna(test.mean(), inplace=True)


In [6]:
# Feature Engineering: Dropping unnecessary columns
# (e.g., 'Id' doesn't help with the prediction)
train.drop(['Id'], axis=1, inplace=True)
test_ids = test['Id']  # Keep the test IDs for submission
test.drop(['Id'], axis=1, inplace=True)

In [7]:
# Handle categorical data (One-hot encoding)
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [8]:
# Align the train and test data (to ensure they have the same features)
train, test = train.align(test, join='left', axis=1)

In [12]:
# Split the data into features (X) and target variable (y)
X = train.drop('SalePrice', axis=1)
y = train['SalePrice']

In [14]:
#Ensure the test set doesn't contain 'SalePrice'
# If 'SalePrice' accidentally got aligned during one-hot encoding, drop it
test = test.drop('SalePrice', axis=1, errors='ignore')

In [15]:
# Train/Test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Standardize the features (optional but recommended for regression models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
test = scaler.transform(test)

In [20]:
!pip install --upgrade xgboost





In [23]:
# Build the model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)

# Fit the model with early stopping
model.fit(
    X_train, y_train,
#     early_stopping_rounds=10,  # stop if performance doesn't improve for 10 rounds
    eval_set=[(X_valid, y_valid)],  # validation set for early stopping
    verbose=False  # avoid printing logs
)

In [24]:

# Make predictions on the test set
predictions = model.predict(test)

In [25]:
# Evaluate model performance on validation set
valid_preds = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 24962.87897964714


In [27]:
# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})

submission

Unnamed: 0,Id,SalePrice
0,1461,126893.281250
1,1462,159281.234375
2,1463,183464.812500
3,1464,191597.515625
4,1465,198342.140625
...,...,...
1454,2915,84635.585938
1455,2916,82319.164062
1456,2917,168150.421875
1457,2918,121334.664062


In [33]:
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Submission file created successfully!


In [34]:
pd.read_csv('submission.csv')

Unnamed: 0,Id,SalePrice
0,1461,126893.280
1,1462,159281.230
2,1463,183464.810
3,1464,191597.520
4,1465,198342.140
...,...,...
1454,2915,84635.586
1455,2916,82319.164
1456,2917,168150.420
1457,2918,121334.664
