In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/MyDrive/STAT315_FinalProject/'

austin = pd.read_csv("listings-2.csv", index_col=0)
df = pd.DataFrame(austin)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/STAT315_FinalProject


In [None]:
# Remove columns that are not used further down. drop(..., errors='ignore')
# keeps the cell re-runnable; `del df[col]` raises KeyError on a second run.
df = df.drop(columns=['host_name', 'neighbourhood_group'], errors='ignore')

# Fill gaps in the two numeric columns with their column medians.
# NOTE(review): `price` is the regression target in a later cell; imputing
# the target with its median may bias the fit -- confirm this is intended.
df['price'] = df['price'].fillna(df['price'].median())
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].median())

# The previous bare `df.dropna()` was never assigned, so it had no effect on
# `df`. Drop rows only when a column the model actually reads is missing, so
# unrelated sparse columns cannot discard usable rows.
MODEL_INPUT_COLUMNS = [
    'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
    'number_of_reviews', 'calculated_host_listings_count',
    'availability_365', 'number_of_reviews_ltm',
]
df = df.dropna(subset=MODEL_INPUT_COLUMNS)

# Cell output: number of missing values still present in each column.
df.isnull().sum()

Unnamed: 0_level_0,name,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [None]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_x = enc.fit_transform(df[['room_type']])

# One indicator column per room_type category, named after the category.
column_names = list(enc.categories_[0])
encoded_df = pd.DataFrame(encoded_x, columns=column_names, index=df.index)

# Drop indicator columns left behind by an earlier run of this cell before
# concatenating; otherwise every re-run appends another duplicate set.
df = pd.concat([df.drop(columns=column_names, errors='ignore'), encoded_df], axis=1)

# Candidate predictors. 'latitude' used to be listed twice, which produced
# duplicate column labels in X and ambiguous label lookups downstream.
# NOTE(review): all four room-type indicators sum to 1 and are therefore
# perfectly collinear with an intercept; consider leaving one out as the
# baseline category.
FEATURE_COLUMNS = [
    'neighbourhood', 'latitude', 'longitude',
    'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
    'number_of_reviews', 'calculated_host_listings_count',
    'availability_365', 'number_of_reviews_ltm',
]
X = df[FEATURE_COLUMNS]

y = df['price'].astype(float)

In [None]:
def forward_selection(X, y, threshold_in=0.05):
    """Greedy forward stepwise selection driven by OLS p-values.

    Starts with no predictors. Each round fits one OLS model per unused
    column (intercept + already selected columns + the candidate) and records
    the candidate's p-value. The candidate with the smallest p-value is added
    when that p-value is below ``threshold_in``; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns. Repeated column labels are collapsed to
        their first occurrence.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float, default 0.05
        A candidate is admitted only if its p-value is strictly below this.

    Returns
    -------
    tuple
        ``(selected_features, final_model)``: the chosen column names in the
        order they were added, and the fitted OLS results for those columns
        plus an intercept.
    """
    # With repeated labels X[cols] returns several columns per name and
    # model.pvalues[name] is a Series rather than a scalar.
    X = X.loc[:, ~X.columns.duplicated()]

    selected_features = []
    while True:
        remaining_features = [f for f in X.columns if f not in selected_features]
        if not remaining_features:
            # Every column is already in the model; nothing left to try.
            break

        # Explicit float dtype: an index-only Series has no reliable default.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for feature in remaining_features:
            design = sm.add_constant(X[selected_features + [feature]])
            model = sm.OLS(y, design).fit()
            new_pval.loc[feature] = model.pvalues.loc[feature]

        min_pval = new_pval.min()
        # Written as "not <" so a NaN minimum also ends the search.
        if not min_pval < threshold_in:
            break
        selected_features.append(new_pval.idxmin())

    final_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()

    return selected_features, final_model

# Forward stepwise search over every candidate column in X.
selected, model = forward_selection(X=X, y=y)

print("Selected features:", selected)
# Header and summary table in one call; sep="\n" supplies the same line
# break a separate print() would have emitted after the header.
print("\nModel Summary:\n", model.summary(), sep="\n")

Selected features: ['availability_365', 'Private room', 'number_of_reviews_ltm', 'longitude', 'neighbourhood', 'latitude', 'calculated_host_listings_count', 'Entire home/apt']

Model Summary:

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     60.84
Date:                Mon, 28 Apr 2025   Prob (F-statistic):           1.91e-98
Time:                        04:50:54   Log-Likelihood:            -1.2225e+05
No. Observations:               15431   AIC:                         2.445e+05
Df Residuals:                   15422   BIC:                         2.446e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 

ValueError: Found input variables with inconsistent numbers of samples: [8, 15431]

In [None]:
def backward_elimination(X, y, threshold_in=0.05):
    """Backward stepwise elimination driven by OLS p-values.

    Starts with every column of ``X`` in the model. Each round refits OLS
    (with an intercept) and removes the predictor with the largest p-value,
    as long as that p-value exceeds ``threshold_in``. The intercept is never
    a removal candidate.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns. Repeated column labels are collapsed to
        their first occurrence.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float, default 0.05
        Predictors whose p-value is strictly above this are eligible for
        removal (the name is kept for backward compatibility).

    Returns
    -------
    tuple
        ``(selected_features, final_model)``: the surviving column names and
        the fitted OLS results for those columns plus an intercept.
    """
    # Repeated labels would make X[cols] return extra copies of a column and
    # list.remove() would only strip one of the repeated names per round.
    X = X.loc[:, ~X.columns.duplicated()]
    selected_features = list(X.columns)

    while selected_features:
        model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
        # Look the p-values up by feature label instead of slicing off row 0:
        # add_constant does not add an intercept when the data already holds
        # a constant column, in which case row 0 is a real predictor.
        pvalues = model.pvalues.loc[selected_features]

        max_pval = pvalues.max()
        # Written as "not >" so a NaN maximum also ends the loop.
        if not max_pval > threshold_in:
            break
        selected_features.remove(pvalues.idxmax())

    final_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
    return selected_features, final_model

# Backward elimination starting from every candidate column in X.
selected, model = backward_elimination(X=X, y=y)

print("Selected features:", selected)
# Header and summary table in one call; sep="\n" supplies the same line
# break a separate print() would have emitted after the header.
print("\nModel Summary:\n", model.summary(), sep="\n")

In [None]:
def regression_model_cv(model, k=5):
    """Cross-validate ``model`` on the currently selected features.

    Reads the notebook-level ``X``, ``y`` and ``selected`` variables, scores
    the estimator with k-fold cross-validation using negative mean squared
    error, converts the fold scores to RMSE and prints them.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    k : int, default 5
        Value passed as ``cv``.

    Returns
    -------
    numpy.ndarray
        RMSE of each fold.
    """
    # cross_val_score needs the feature matrix itself. Passing the list of
    # column names raised "inconsistent numbers of samples: [8, 15431]".
    # Repeated column labels in X are collapsed so each name maps to exactly
    # one column.
    features = X.loc[:, ~X.columns.duplicated()][selected]
    scores = cross_val_score(model, features, y, scoring='neg_mean_squared_error', cv=k)
    # Scores are negated MSE values, so flip the sign before the square root.
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())
    return rmse

# regression_model_cv prints its own report; wrapping the call in print()
# only echoed its return value on an extra line. Keep the result instead.
cv_rmse = regression_model_cv(LinearRegression())