In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

In [None]:
# Locations of the training and test CSV files
train_data_path = '/content/train_new.csv'
test_data_path = '/content/test_new.csv'

# Read both splits into DataFrames in one pass
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
# Display the training frame (pandas truncates the rendering of large frames)
train_df

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,220000,906420020,80.0,10041,Pave,SawyerW,1Fam,2Story,8,5,...,2,1,3,8,1915,Typ,0,0,2006,WD
2193,160000,909129090,70.0,6300,Pave,SWISU,1Fam,1.5Fin,5,4,...,1,1,3,7,1268,Typ,0,0,2009,WD
2194,225000,528292060,41.0,12460,Pave,Gilbert,1Fam,2Story,7,5,...,2,1,4,8,2322,Typ,0,0,2008,WD
2195,83000,905426060,85.0,10625,Pave,Edwards,1Fam,1Story,5,5,...,1,0,2,5,835,Typ,0,0,2010,COD


In [None]:
# Display the test frame; unlike the training frame it has no 'SalePrice' column
test_df

Unnamed: 0,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,907135180,60,8070,Pave,CollgCr,1Fam,1Story,4,5,1994,...,1,0,3,5,990,Typ,0,0,2007,WD
1,528181040,40,6792,Pave,NridgHt,TwnhsE,1Story,7,5,2005,...,2,0,2,6,1368,Typ,0,0,2006,New
2,528175010,44,6371,Pave,NridgHt,TwnhsE,1Story,7,5,2009,...,2,0,2,6,1358,Typ,0,0,2010,New
3,531379030,70,8304,Pave,SawyerW,1Fam,2Story,6,5,1997,...,2,1,3,7,1837,Typ,0,0,2006,WD
4,923275090,37,6951,Pave,Mitchel,1Fam,1Story,5,5,1984,...,1,0,3,5,923,Typ,0,0,2008,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,528174060,34,5381,Pave,NridgHt,Twnhs,1Story,6,5,2005,...,2,0,1,5,1306,Typ,0,0,2009,WD
601,903400180,64,13053,Pave,BrkSide,1Fam,1.5Fin,6,7,1923,...,1,1,4,8,1848,Typ,220,0,2008,WD
602,903227150,53,6360,Pave,BrkSide,1Fam,1.5Fin,5,6,1942,...,1,1,3,7,1453,Min2,148,0,2010,WD
603,909250070,43,7000,Pave,SWISU,1Fam,2Story,7,8,1926,...,1,0,3,6,1479,Typ,0,0,2006,WD


In [None]:
# Impute 'Lot Frontage' (median) and 'Electrical' (mode) using statistics
# computed from the training data only.
median_lot_frontage = train_df['Lot Frontage'].median()
mode_electrical = train_df['Electrical'].mode()[0]

# Assign the filled columns back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify the frame at all once Copy-on-Write is enabled.
train_df['Lot Frontage'] = train_df['Lot Frontage'].fillna(median_lot_frontage)
train_df['Electrical'] = train_df['Electrical'].fillna(mode_electrical)

# Apply the same training-derived fill values to the test data. The original
# test_df is left untouched; only the filled copy is encoded below.
test_df_filled = test_df.fillna(
    value={'Lot Frontage': median_lot_frontage, 'Electrical': mode_electrical}
)

# One-hot encoding for categorical variables (column list taken from train)
categorical_columns = train_df.select_dtypes(include=['object']).columns
train_df_encoded = pd.get_dummies(train_df, columns=categorical_columns)
test_df_encoded = pd.get_dummies(test_df_filled, columns=categorical_columns)

# Keep only the columns present in both frames so train and test share the
# same feature set. The inner join also removes 'SalePrice' from the training
# side (the test frame has no such column), so it is re-attached afterwards.
train_df_encoded, test_df_encoded = train_df_encoded.align(test_df_encoded, join='inner', axis=1)
train_df_encoded['SalePrice'] = train_df['SalePrice']

In [None]:
# Split the encoded training frame into the target and the feature matrix;
# the model is trained on the natural log of the sale price.
y_train = train_df_encoded['SalePrice']
X_train = train_df_encoded.drop(columns='SalePrice')
y_train_log = np.log(y_train)

# Random Forest with a fixed seed for reproducibility
rf_model = RandomForestRegressor(random_state=42)

# 5-fold cross-validation; scikit-learn reports the negated MSE per fold,
# so flip the sign before taking the square root.
cv_scores = cross_val_score(
    rf_model,
    X_train,
    y_train_log,
    cv=5,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(-cv_scores)
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print("Mean RMSE:", mean_rmse)
print("Standard Deviation of RMSE:", std_rmse)


Mean RMSE: 0.14474104721530046
Standard Deviation of RMSE: 0.004050064183210967


In [None]:
# Refit the forest on every training row (log-price target)
rf_model.fit(X_train, y_train_log)

# Predict log prices for the test features, then undo the log transform
y_test_pred_log = rf_model.predict(test_df_encoded)
y_test_pred = np.exp(y_test_pred_log)

# Assemble the submission table: property id next to its predicted price
PID_test = test_df['PID']
submission_df = pd.DataFrame({'PID': PID_test}).assign(SalePrice=y_test_pred)

In [None]:
# Display the submission frame ('PID' and predicted 'SalePrice')
submission_df

Unnamed: 0,PID,SalePrice
0,907135180,131745.521406
1,528181040,197373.502697
2,528175010,214446.853771
3,531379030,201901.744994
4,923275090,127042.295737
...,...,...
600,528174060,194215.317774
601,903400180,191031.979909
602,903227150,134983.543293
603,909250070,154957.682205


In [None]:
# Write the Random Forest predictions to disk without the index column
submission_file_path = '/content/submission.csv'
submission_df.to_csv(path_or_buf=submission_file_path, index=False)

# Confirm where the file was written
print("Submission file saved:", submission_file_path)

Submission file saved: /content/submission.csv


2nd attempt: XGBoost with median/mode imputation, one-hot encoding and robust scaling

In [None]:
# Identify numeric and categorical feature columns. The target is excluded
# here so it is never imputed or scaled as if it were a feature.
numeric_cols = train_df.select_dtypes(include=np.number).columns.drop("SalePrice", errors="ignore")
categorical_cols = train_df.select_dtypes(include='object').columns

# Separate the target variable from the training features
y = train_df["SalePrice"]
train_features = train_df.drop(columns="SalePrice")
n_train = len(train_features)

# Learn the imputation values from the training rows only, so no statistic of
# the test set leaks into the features the model is trained on.
numeric_fill = train_features[numeric_cols].median()
categorical_fill = train_features[categorical_cols].mode().iloc[0]

# Combine the training and test features so one-hot encoding yields the same
# dummy columns for both; a fresh index keeps row positions unambiguous.
combined_data = pd.concat([train_features, test_df], axis=0, ignore_index=True)

# Fill missing values: median for numeric columns, most frequent value for
# categorical columns (both taken from the training data above)
combined_data[numeric_cols] = combined_data[numeric_cols].fillna(numeric_fill)
combined_data[categorical_cols] = combined_data[categorical_cols].fillna(categorical_fill)

# One-hot encode categorical columns
combined_data = pd.get_dummies(combined_data, columns=categorical_cols, drop_first=True)

# Separate the combined data back into training and test sets by position
X_train = combined_data.iloc[:n_train]
X_test = combined_data.iloc[n_train:]

# Create a robust scaler and fit it on the training features only
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X_train)

# Train on log1p(price); predictions are mapped back with expm1
y_log = np.log1p(y)

# Choose a model (XGBoost Regressor in this example)
model_xgb = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)

# Use shuffled 5-fold cross-validation to get out-of-fold predictions
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred_log = cross_val_predict(model_xgb, X_train_scaled, y_log, cv=kf)

# Transform predictions back to the original scale
y_pred_original_scale = np.expm1(y_pred_log)

# RMSE in log space between the target and the out-of-fold predictions.
# Although it is labelled "training set", each prediction comes from a model
# that did not see that row, so this is a cross-validated estimate.
rmse_train = np.sqrt(mean_squared_error(y_log, y_pred_log))
print(f"RMSE on the training set: {rmse_train}")

# Train the model on the entire training set
model_xgb.fit(X_train_scaled, y_log)

# Scale the test data with the scaler fitted on the training data
X_test_scaled = robust_scaler.transform(X_test)

# Make predictions on the test set (log scale)
test_preds_log = model_xgb.predict(X_test_scaled)

# Transform predictions back to the original scale
test_preds_original_scale = np.expm1(test_preds_log)

RMSE on the training set: 0.14394120209691588


In [None]:
# Pair each test property id with its predicted sale price
submission = test_df[["PID"]].assign(SalePrice=test_preds_original_scale)

# Persist the predictions as CSV, omitting the index column
submission.to_csv(path_or_buf="submission_xgb.csv", index=False)

3rd attempt: XGBoost with a scikit-learn preprocessing pipeline and grid-search hyperparameter tuning

In [None]:
# Feature columns grouped by dtype; the target must not be treated as a
# numeric feature, so it is filtered out of the numeric list.
numeric_cols = [
    col
    for col in train_df.select_dtypes(include=np.number).columns
    if col != 'SalePrice'
]
categorical_cols = list(train_df.select_dtypes(include='object').columns)

# Target variable
y = train_df["SalePrice"]

# Stack training features (target removed) on top of the test rows
n_train = len(train_df)
combined_data = pd.concat([train_df.drop(columns="SalePrice"), test_df], axis=0)

# Numeric columns: median imputation followed by robust scaling
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories not seen during fitting are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Fit the preprocessing on the training rows, then reuse it for the test rows
# NOTE(review): the preprocessor is fitted on all training rows before the
# grid search below, so each CV fold sees imputation/scaling statistics that
# include its validation rows - consider wrapping it in a Pipeline with the model.
X = preprocessor.fit_transform(combined_data.iloc[:n_train])
X_test = preprocessor.transform(combined_data.iloc[n_train:])

# Model log1p(price); predictions are inverted with expm1 below
y_log = np.log1p(y)

# Base estimator (XGBoost Regressor)
model_xgb = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Hyperparameter grid: 3 x 2 x 3 = 18 candidates
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7],
)

# Exhaustive 5-fold grid search scored by negated MSE
grid_search = GridSearchCV(
    estimator=model_xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X, y_log)

# Estimator with the best cross-validated score
best_model = grid_search.best_estimator_

# Predict log prices for the test set and map them back to the price scale
test_preds_log = best_model.predict(X_test)
test_preds_original_scale = np.expm1(test_preds_log)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
# Pair each test property id with the tuned model's predicted sale price
submission = test_df[["PID"]].assign(SalePrice=test_preds_original_scale)

# Persist the predictions as CSV, omitting the index column
submission.to_csv(path_or_buf="submission_2_xgb.csv", index=False)