In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

In [4]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "austinHousingData.csv"


df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "ericpierce/austinhousingprices",
  file_path,
  # Provide any additional arguments like 
    pandas_kwargs={"sep": ","}
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(


First 5 records:          zpid          city          streetAddress  zipcode  \
0   111373431  pflugerville   14424 Lake Victor Dr    78660   
1   120900430  pflugerville     1104 Strickling Dr    78660   
2  2084491383  pflugerville    1408 Fort Dessau Rd    78660   
3   120901374  pflugerville     1025 Strickling Dr    78660   
4    60134862  pflugerville  15005 Donna Jane Loop    78660   

                                         description   latitude  longitude  \
0  14424 Lake Victor Dr, Pflugerville, TX 78660 i...  30.430632 -97.663078   
1  Absolutely GORGEOUS 4 Bedroom home with 2 full...  30.432673 -97.661697   
2  Under construction - estimated completion in A...  30.409748 -97.639771   
3  Absolutely darling one story home in charming ...  30.432112 -97.661659   
4  Brimming with appeal & warm livability! Sleek ...  30.437368 -97.656860   

   propertyTaxRate  garageSpaces  hasAssociation  ...  numOfMiddleSchools  \
0             1.98             2            True  ...     

In [5]:
usable_columns = df.select_dtypes(include=['int64', 'float64'])
usable_columns = usable_columns.drop(columns=['zpid'])

In [6]:
usable_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15171 entries, 0 to 15170
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   zipcode                     15171 non-null  int64  
 1   latitude                    15171 non-null  float64
 2   longitude                   15171 non-null  float64
 3   propertyTaxRate             15171 non-null  float64
 4   garageSpaces                15171 non-null  int64  
 5   parkingSpaces               15171 non-null  int64  
 6   yearBuilt                   15171 non-null  int64  
 7   latestPrice                 15171 non-null  float64
 8   numPriceChanges             15171 non-null  int64  
 9   latest_salemonth            15171 non-null  int64  
 10  latest_saleyear             15171 non-null  int64  
 11  numOfPhotos                 15171 non-null  int64  
 12  numOfAccessibilityFeatures  15171 non-null  int64  
 13  numOfAppliances             151

In [7]:

X = usable_columns.drop("latestPrice", axis=1)
y = usable_columns["latestPrice"]

iso = IsolationForest(contamination=.01, random_state=42)
outliers = iso.fit_predict(X) == 1
X, y = X[outliers], y[outliers]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

In [8]:
models = {
    'Linear': LinearRegression()
}

pipelines = {
    name: Pipeline([
        ('scale', scaler),
        ('model', model)
    ])
    for name, model in models.items()
}

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print(f"{name} Regression RMSE: {rmse:.4f}")

Linear Regression RMSE: 222073.4426


In [None]:
def forward_selection(X, y):
    remaining = list(X.columns)
    selected = []
    best_aic = np.inf

    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            model = sm.OLS(y, sm.add_constant(X[selected + [candidate]])).fit()
            aic_with_candidates.append((model.aic, candidate))
        aic_with_candidates.sort()
        best_new_aic, best_candidate = aic_with_candidates[0]
        if best_new_aic < best_aic:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            best_aic = best_new_aic
        else:
            break
    return selected

forwardfeartures = forward_selection(X, y)
print("Forward selected features:", forwardfeartures)

Forward selected features: ['livingAreaSqFt', 'yearBuilt', 'zipcode', 'propertyTaxRate', 'numOfBathrooms', 'numOfBedrooms', 'numOfStories', 'numOfWaterfrontFeatures', 'latest_saleyear', 'avgSchoolDistance', 'numPriceChanges', 'avgSchoolSize', 'avgSchoolRating', 'longitude', 'numOfHighSchools', 'numOfParkingFeatures', 'numOfMiddleSchools', 'latitude', 'latest_salemonth', 'numOfWindowFeatures', 'garageSpaces', 'numOfCommunityFeatures', 'numOfPatioAndPorchFeatures', 'numOfPhotos', 'numOfElementarySchools', 'numOfPrimarySchools', 'lotSizeSqFt', 'numOfAccessibilityFeatures', 'MedianStudentsPerTeacher']


In [13]:
def backward_elimination(X, y, threshold=0.05):
    X_ = sm.add_constant(X.copy())
    model = sm.OLS(y, X_).fit()
    while True:
        pvals = model.pvalues.drop('const')
        max_p = pvals.max()
        if max_p > threshold:
            worst_feature = pvals.idxmax()
            X_.drop(columns=[worst_feature], inplace=True)
            model = sm.OLS(y, X_).fit()
        else:
            break
    return X_.columns.drop('const').tolist()

backwardfeatures = backward_elimination(X, y)
print("Backward selected features:", backwardfeatures)

Backward selected features: ['zipcode', 'latitude', 'longitude', 'propertyTaxRate', 'garageSpaces', 'yearBuilt', 'numPriceChanges', 'latest_salemonth', 'latest_saleyear', 'numOfPhotos', 'numOfParkingFeatures', 'numOfPatioAndPorchFeatures', 'numOfWaterfrontFeatures', 'numOfWindowFeatures', 'numOfCommunityFeatures', 'livingAreaSqFt', 'numOfPrimarySchools', 'numOfElementarySchools', 'numOfMiddleSchools', 'numOfHighSchools', 'avgSchoolDistance', 'avgSchoolRating', 'avgSchoolSize', 'numOfBathrooms', 'numOfBedrooms', 'numOfStories']


In [None]:
X_scaled = StandardScaler().fit_transform(X)

pca_model = PCA()
X_pca = pca_model.fit_transform(X_scaled)

explained_variance = np.cumsum(pca_model.explained_variance_ratio_)
cumulative_var = np.cumsum(explained_variance)

print(explained_variance)


[0.15043204 0.24901541 0.32271726 0.38546882 0.43780383 0.48910036
 0.52579755 0.56138463 0.59538138 0.62846248 0.66020485 0.69142914
 0.7213336  0.75018604 0.7781485  0.80401081 0.82855746 0.85164471
 0.87149143 0.89125897 0.90856796 0.92408163 0.9371056  0.94914444
 0.96042856 0.97126197 0.98046403 0.98687598 0.99240246 0.99685984
 0.99995898 1.        ]


In [30]:

target_variance = 0.95
cumulative_variance = np.cumsum(pca_model.explained_variance_ratio_)
optimal_components = np.argmax(cumulative_variance >= target_variance) + 1

print(optimal_components)

25


In [None]:


n_components = 25
X_train, X_test, y_train, y_test = train_test_split(X_pca[:, :n_components], y, test_size=0.2, random_state=42)

model_pcr = LinearRegression().fit(X_train, y_train)
y_pred = model_pcr.predict(X_test)
print("PCR RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)

PCR RMSE: 239257.14633359463


In [38]:
pls_model = PLSRegression()
pls_model.fit(X_train, y_train)
y_pred = pls_model.predict(X_test)


print("PLSR RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)

PLSR RMSE: 239262.85281651482
