In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import random
from tqdm.notebook import tqdm

# Reproducibility and display configuration for the whole notebook.
# Seed every global RNG the notebook can draw from, not just the stdlib one,
# so that Restart Kernel -> Run All produces the same results.
SEED = 42
random.seed(SEED)     # stdlib RNG
np.random.seed(SEED)  # NumPy global RNG (scikit-learn falls back to it when random_state is None)

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the pre-split train/test datasets.
train_data = pd.read_csv('final_train_data.csv')
test_data = pd.read_csv('final_test_data.csv')

print("Number of rows:", train_data.shape[0])
print("Number of columns:", train_data.shape[1])
print("Number of rows:", test_data.shape[0])
print("Number of columns:", test_data.shape[1])

# NOTE(review): 'Neighborhood' is used as-is as the class label; it looks
# integer-encoded already in the CSVs -- confirm against the preprocessing step.

# Columns that must not be used as classifier inputs:
#   'Neighborhood' -- the target of this classifier
#   'SalePrice'    -- the downstream regression target
#   'Unnamed: 0'   -- appears to be the row index saved with the CSV; a row
#                     identifier is not a real predictor (skipped if absent)
non_feature_cols = ['Neighborhood', 'SalePrice', 'Unnamed: 0']

# Derive the feature list once from the training frame and reuse it for the
# test frame so both matrices have identical columns in identical order.
feature_cols = [col for col in train_data.columns if col not in non_feature_cols]

# Preparing data for classifier
X_train = train_data[feature_cols]    # Features for training
y_train = train_data['Neighborhood']  # Target for training
X_test = test_data[feature_cols]      # Features for testing (KeyError if a column is missing)
y_test = test_data['Neighborhood']    # Target for testing

# RandomForest classifier tuned with a 5-fold grid search over forest size and depth.
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30]}
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Predict a neighborhood group for every test row with the best estimator found
# (GridSearchCV refits it on the full training set by default).
predicted_subgroups_test = grid_search.predict(X_test)

# Attach the predicted group labels to the test data for the per-group models.
test_data['Predicted_Neighborhood_Group'] = predicted_subgroups_test
# Display the updated DataFrame with the new group label column.
test_data.head()

Number of rows: 1022
Number of columns: 35
Number of rows: 438
Number of columns: 35




Unnamed: 0,MSZoning,LandContour,LandSlope,Neighborhood,Condition1,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,Foundation,BsmtQual,BsmtExposure,BsmtFinType1,GrLivArea,BedroomAbvGr,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageQual,WoodDeckSF,OpenPorchSF,PoolArea,SaleType,SaleCondition,SalePrice,LotShape_Ordinal,LotArea_Shape_Interaction,TotalBsmtSF,TotalBaths,RoofStyle_Binary,MasVnrType_BrkFace,Predicted_Neighborhood_Group
0,-0.083847,0.31305,0.200282,1,182635.867532,-1.554088,0.013158,-0.687,-0.583271,-0.649749,220069.702128,-0.883501,163645.356209,0.723398,-1.215918,0.150325,-0.733677,-0.939902,-0.338494,140244.790909,0.246309,0.324485,0.098002,-0.757681,-0.696875,-0.067637,175293.827586,178340.128492,120500,-0.707518,-0.272468,-0.354281,-1.503408,0.160503,0.342559,1
1,-3.277335,0.31305,0.200282,2,182635.867532,0.699854,0.914668,0.715591,-0.583271,1.151454,220069.702128,0.640016,163645.356209,0.723398,-0.204162,0.150325,0.803885,-0.939902,-0.338494,140244.790909,0.298782,0.324485,0.098002,-0.757681,-0.095434,-0.067637,175293.827586,178340.128492,155000,-0.707518,-0.481054,-0.764668,1.696265,0.160503,0.342559,4
2,-0.083847,0.31305,0.200282,3,182635.867532,-1.554088,-0.50199,-1.48848,0.889257,-0.649749,152256.397959,-0.883501,163645.356209,0.723398,-0.160173,1.32608,-2.271238,-0.939902,-0.338494,190594.424138,-3.896996,-2.259597,0.098002,-0.757681,-0.696875,-0.067637,175293.827586,178340.128492,118000,-0.707518,-0.171614,1.052759,-0.223539,0.160503,0.342559,3
3,-0.083847,0.31305,0.200282,4,182635.867532,0.699854,0.882471,0.715591,-0.583271,-0.649749,220069.702128,0.640016,205762.862069,-1.208603,0.360368,0.150325,0.803885,-0.939902,-0.338494,190594.424138,0.296684,0.324485,0.098002,0.201049,1.421837,-0.067637,175293.827586,178340.128492,188000,1.053237,0.014646,-0.315197,0.416396,0.160503,0.342559,1
4,-0.083847,0.31305,0.200282,5,182635.867532,-0.802774,-2.272813,-0.737092,-0.583271,-0.649749,152256.397959,-0.883501,163645.356209,0.240397,1.625063,0.150325,-0.733677,0.60196,1.447774,190594.424138,0.237913,0.324485,0.098002,0.376816,-0.696875,-0.067637,117180.0,178340.128492,160000,-0.707518,-0.214058,-0.134431,-0.223539,0.160503,0.342559,2


In [3]:
# Number of distinct neighborhood groups the classifier assigned to the test rows.
test_data.loc[:, 'Predicted_Neighborhood_Group'].nunique(dropna=True)

22

In [4]:
# Merge rarely-predicted groups into well-populated ones so that every group
# evaluated later has at least `min_samples_threshold` test rows.
min_samples_threshold = 5
group_counts = test_data['Predicted_Neighborhood_Group'].value_counts()
small_groups = group_counts[group_counts < min_samples_threshold].index
large_groups = group_counts[group_counts >= min_samples_threshold].index

if len(small_groups) > 0 and len(large_groups) == 0:
    raise ValueError(
        "Cannot merge small groups: no predicted group has at least "
        f"{min_samples_threshold} test rows."
    )

# Use a dedicated, locally seeded RNG: the merge then gives the same result
# when this cell is re-run on its own, independent of how much of the global
# `random` stream earlier cells consumed.
merge_rng = random.Random(42)
# Plain list: random's sequence helpers are not guaranteed to accept a pandas
# Index on every Python version.
large_group_pool = large_groups.tolist()

# NOTE(review): the destination group is chosen at random, not by similarity.
group_remap = {
    small_group: merge_rng.choice(large_group_pool)
    for small_group in small_groups
}

# Relabel all affected rows in one pass; labels without an entry are kept.
test_data['Predicted_Neighborhood_Group'] = (
    test_data['Predicted_Neighborhood_Group']
    .map(lambda label: group_remap.get(label, label))
)

test_data['Predicted_Neighborhood_Group'].nunique()

15

In [5]:
# Report the shape of the training frame first, then the test frame
# (the test frame now also carries the 'Predicted_Neighborhood_Group' column).
for frame in (train_data, test_data):
    row_count, column_count = frame.shape
    print("Number of rows:", row_count)
    print("Number of columns:", column_count)

Number of rows: 1022
Number of columns: 35
Number of rows: 438
Number of columns: 36


In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Train one stacked regressor per predicted neighborhood group and evaluate it
# on the test rows assigned to that group.
#
# Results kept for later cells:
#   group_models    -- {group label: {'model': fitted StackingRegressor, 'mse': test MSE}}
#   all_predictions -- predicted SalePrice for every row of test_data, in row order

# Columns that are targets, labels or bookkeeping rather than predictors.
# 'Unnamed: 0' appears to be the row index saved with the CSV; a row identifier
# should not be used as a price feature (it is simply skipped if absent).
non_feature_cols = ['SalePrice', 'Neighborhood', 'Predicted_Neighborhood_Group', 'Unnamed: 0']
# Derive the list from train_data so train and test matrices share the same
# columns in the same order.
regression_feature_cols = [col for col in train_data.columns if col not in non_feature_cols]

# Initialize a dictionary to store models for each group
group_models = {}
# NaN (rather than 0) marks "not predicted yet", so a row that is never filled
# cannot be mistaken for a genuine prediction.
all_predictions = np.full(len(test_data), np.nan)

predicted_groups = test_data['Predicted_Neighborhood_Group']

# Iterate through each group
for group in tqdm(predicted_groups.unique()):
    # Training rows whose true neighborhood equals this group label
    group_data_train = train_data[train_data['Neighborhood'] == group]
    X_train_group = group_data_train[regression_feature_cols]
    y_train_group = group_data_train['SalePrice']

    # Test rows the classifier assigned to this group. The boolean mask is
    # positional, so writing into all_predictions does not depend on
    # test_data having a default 0..n-1 index.
    test_row_mask = (predicted_groups == group).to_numpy()
    group_data_test = test_data[test_row_mask]
    X_test_group = group_data_test[regression_feature_cols]
    y_test_group = group_data_test['SalePrice']

    # StackingRegressor builds its meta-features by cross-validation (5 folds
    # by default), which fails when a group has fewer training rows than folds.
    n_train_rows = len(group_data_train)
    if n_train_rows < 2:
        raise ValueError(
            f"Group {group!r} has {n_train_rows} training row(s); "
            "at least 2 are needed to fit a stacked model."
        )
    cv_folds = min(5, n_train_rows)

    # Define base models for stacking
    estimators = [
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
        ('svr', SVR())
    ]

    # Stacking Regressor: base-model predictions feed a random-forest meta-model
    stack_reg = StackingRegressor(
        estimators=estimators,
        final_estimator=RandomForestRegressor(random_state=42),
        cv=cv_folds,
    )
    stack_reg.fit(X_train_group, y_train_group)

    # Predict and evaluate
    y_pred_group = stack_reg.predict(X_test_group)

    # Store the predictions at the positions of this group's test rows
    all_predictions[test_row_mask] = y_pred_group

    # Side-by-side table of actual vs. predicted prices with a simple 0..k-1 index
    comparison_df = pd.DataFrame({
        'Actual': y_test_group.to_numpy(),
        'Predicted': y_pred_group
    })

    # Print the DataFrame
    print('For group', group)
    print(comparison_df)

    # Store the model and its performance
    mse = mean_squared_error(y_test_group, y_pred_group)
    group_models[group] = {'model': stack_reg, 'mse': mse}

# Every test row must have received exactly one group's prediction.
assert not np.isnan(all_predictions).any(), "some test rows were never predicted"

  0%|          | 0/15 [00:00<?, ?it/s]

For group 1
    Actual  Predicted
0   120500  127828.00
1   188000  207901.00
2   236500  243283.65
3   139000  128358.00
4   230000  304109.00
..     ...        ...
68  287000  230149.98
69  224900  218778.00
70  235000  244711.11
71  201000  230078.00
72  148500  165672.81

[73 rows x 2 columns]
For group 4
    Actual  Predicted
0   155000  234467.31
1   172500  233768.66
2   197000  262546.10
3   227875  334733.56
4   303477  344262.34
5   144152  227803.49
6   216000  254154.50
7   245350  333979.78
8   172500  317450.92
9   185000  252450.00
10  160000  234572.97
11  164500  232592.97
12  270000  317577.92
13  306000  349662.56
14  233170  331246.06
15  252000  334732.56
16  232000  331310.06
17  151000  227803.49
18  176000  251922.00
19  229456  251657.50
20  252678  322957.12
21  164700  234482.97
For group 3
     Actual  Predicted
0    118000  130069.00
1    217000  222135.00
2    139000  128036.00
3    234000  184567.00
4    200624  220542.00
..      ...        ...
107  16490

In [7]:
# Per-group test error, listed in ascending order of group label.
for group in sorted(group_models):
    info = group_models[group]
    print(f"Group {group}: MSE = {info['mse']}")

# Error over the whole test set, using the predictions gathered across all groups.
overall_mse = mean_squared_error(
    y_true=test_data['SalePrice'],
    y_pred=all_predictions,
)
print("Overall MSE for the test dataset:", overall_mse)


Group 1: MSE = 547817577.0623534
Group 2: MSE = 837915440.4446155
Group 3: MSE = 501303939.74988925
Group 4: MSE = 5998855581.891991
Group 7: MSE = 1506056180.2680082
Group 10: MSE = 92841758.33333333
Group 11: MSE = 495425565.71613336
Group 12: MSE = 1235422150.642
Group 14: MSE = 11903910814.075033
Group 16: MSE = 1797841387.676491
Group 17: MSE = 540562542.1161842
Group 19: MSE = 5068924710.892716
Group 21: MSE = 2051785998.6363637
Group 22: MSE = 2993410954.214286
Group 23: MSE = 638994090.5
Overall MSE for the test dataset: 1862266850.8671465
