In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import random
from tqdm.notebook import tqdm

random.seed(42)

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
file_path = 'final_train_data.csv'
data = pd.read_csv(file_path)

##############################################################
# Creating a mapping for 'Neighborhood' to integers
neighborhood_mapping = {neighborhood: idx for idx, neighborhood in enumerate(data['Neighborhood'].unique())}
data['Neighborhood'] = data['Neighborhood'].map(neighborhood_mapping)
##############################################################

# Preparing data for classifier
X = data.drop(['Neighborhood', 'SalePrice'], axis=1)  # Features
y = data['Neighborhood']  # Target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForest Classifier
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30]}
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Predicting subgroups for the entire dataset using the trained classifier
predicted_subgroups = grid_search.predict(X)

# Adding the predicted subgroup labels to the original data
data['Predicted_Neighborhood_Group'] = predicted_subgroups

# Displaying the updated DataFrame with the new subgroup label column
data.head()



Unnamed: 0,MSZoning,LandContour,LandSlope,Neighborhood,Condition1,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType,SaleCondition,SalePrice,LotShape_Ordinal,LotArea_Shape_Interaction,TotalBsmtSF,TotalBaths,RoofStyle_Binary,MasVnrType_BrkFace,Predicted_Neighborhood_Group
0,3,0.303261,0.237226,0,0.289741,0.632187,0.853515,0.624714,2.627172,-0.70691,...,-0.266049,-0.231259,0.473795,-0.698863,-0.214897,0.09688,1.648926,0.148324,0.323828,0
1,3,0.303261,0.237226,1,-3.62284,-1.504842,-1.058671,-1.669661,-0.565387,-0.70691,...,-0.266049,-0.231259,-0.902767,-0.698863,-0.223903,-0.548591,-0.23188,0.148324,0.323828,1
2,3,0.303261,0.237226,2,-3.62284,-0.080156,-0.320634,-1.096067,1.657704,-0.70691,...,-0.266049,-0.231259,-0.063249,-0.698863,0.092126,-0.725989,-0.23188,0.148324,0.323828,2
3,1,0.303261,0.237226,3,0.289741,1.34453,1.121892,0.95931,-0.565387,1.013238,...,-0.266049,-0.231259,0.196014,1.001704,-0.299785,0.502971,0.395055,0.148324,0.323828,3
4,3,0.303261,0.237226,2,0.289741,-0.792499,-0.48837,-1.335064,-0.565387,-0.70691,...,-0.266049,-0.231259,-0.632393,1.001704,-0.065579,-0.418214,-0.23188,0.148324,0.323828,2


In [67]:
data['Predicted_Neighborhood_Group'].nunique()

24

In [68]:
# Analyze and combine smaller groups
min_samples_threshold = 5
group_counts = data['Predicted_Neighborhood_Group'].value_counts()
small_groups = group_counts[group_counts < min_samples_threshold].index
large_groups = group_counts[group_counts >= min_samples_threshold].index

# Randomly combine small groups into other larger groups
for small_group in small_groups:
    selected_large_group = random.choice(large_groups)
    data.loc[data['Predicted_Neighborhood_Group'] == small_group, 'Predicted_Neighborhood_Group'] = selected_large_group

data['Predicted_Neighborhood_Group'].nunique()

23

In [69]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Initialize a dictionary to store models for each group
group_models = {}
# Initialize an array to store predictions for the entire dataset
all_predictions = np.zeros(len(data))

# Iterate through each group
for group in tqdm(data['Predicted_Neighborhood_Group'].unique()):
    # Filter the data for the current group
    group_data = data[data['Predicted_Neighborhood_Group'] == group]
    
    # Features and target variable
    X_group = group_data.drop(['SalePrice', 'Predicted_Neighborhood_Group'], axis=1)
    y_group = group_data['SalePrice']
    
    # Split data into training and testing sets
    X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, y_group, test_size=0.2, random_state=42)
    
    # Define base models for stacking
    estimators = [
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
        ('svr', SVR())
    ]
    
    # Stacking Regressor
    stack_reg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(random_state=42))
    stack_reg.fit(X_train_group, y_train_group)
    
    # Predict and evaluate
    y_pred_group = stack_reg.predict(X_test_group)

    # Store the predictions in the all_predictions array
    all_predictions[X_test_group.index] = y_pred_group

    # Store the model and its performance
    mse = mean_squared_error(y_test_group, y_pred_group)
    group_models[group] = {'model': stack_reg, 'mse': mse}

# Display the MSE for each group
for group, info in group_models.items():
    print(f"Group {group}: MSE = {info['mse']}")

# Calculate overall MSE for the entire dataset
overall_mse = mean_squared_error(data.loc[X_test_group.index, 'SalePrice'], all_predictions[X_test_group.index])
print("Overall MSE for the entire dataset:", overall_mse)


  0%|          | 0/23 [00:00<?, ?it/s]

Group 0: MSE = 0.21344878682857454
Group 1: MSE = 0.16505048966935923
Group 2: MSE = 0.09805515079769278
Group 3: MSE = 0.06600632192790655
Group 4: MSE = 0.16180548042687876
Group 5: MSE = 0.764093904310428
Group 6: MSE = 0.2962700030504634
Group 7: MSE = 0.032495548200580454
Group 8: MSE = 0.021745869901503737
Group 9: MSE = 0.009516352945856402
Group 10: MSE = 0.23236088632427104
Group 11: MSE = 0.2847122663467363
Group 12: MSE = 1.5844931376168325
Group 13: MSE = 8.181727599464837
Group 20: MSE = 0.22061262380203656
Group 14: MSE = 0.13939946796892438
Group 15: MSE = 0.06751360537566654
Group 16: MSE = 0.04713269259015599
Group 18: MSE = 0.5630614160543517
Group 19: MSE = 0.07584266753571042
Group 21: MSE = 0.44150982573496966
Group 17: MSE = 0.21187335324438727
Group 22: MSE = 0.01700228542590251
Overall MSE for the entire dataset: 0.01700228542590251
