In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

In [9]:
from google.colab import drive
drive.mount('/content/drive')

%cd '/content/drive/MyDrive/STAT315_FinalProject/'

austin = pd.read_csv("listings-2.csv", index_col=0)
df = pd.DataFrame(austin)


Mounted at /content/drive
/content/drive/MyDrive/STAT315_FinalProject


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

plt.figure(figsize=(10,6))
sns.histplot(df['price'].dropna(), bins=50, kde=True) # Plots the distribution of price

plt.title('Distribution of Listing Prices', fontsize=18)
plt.xlabel('Price ($)', fontsize=14)
plt.ylabel('Number of Listings', fontsize=14)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

plt.figure(figsize=(10,6))
sns.histplot(df['price'].dropna(), bins=50, kde=True, log_scale=(True, False)) #Puts the x-axis in log scale so the distribution isn't so clustered

plt.title('Distribution of Listing Prices', fontsize=18)
plt.xlabel('Price ($)', fontsize=14)
plt.ylabel('Number of Listings', fontsize=14)

plt.show()

In [None]:
sns.set(style="whitegrid")

df_filtered = df[df['price'] > 0] # Only keeps listings with positive prices

plt.figure(figsize=(12,8))
sns.histplot(
    data=df_filtered,
    x='price',
    hue='room_type', # color by room type
    bins=50,
    kde=True,
    log_scale=(True, False)
)

plt.title('Distribution of Listing Prices by Room Type (Log Scale)', fontsize=18)
plt.xlabel('Price ($) - Log Scale', fontsize=14)
plt.ylabel('Number of Listings', fontsize=14)

plt.legend(title='Room Type', labels=df_filtered['room_type'].unique())
plt.show()


In [None]:
sns.set(style="whitegrid")

df_zip = df_filtered[df_filtered['neighbourhood'].isin(top_zipcodes)].copy()
df_zip['neighbourhood'] = df_zip['neighbourhood'].astype(str)

plt.figure(figsize=(12,8))
sns.boxplot(
    data=df_zip,
    x='neighbourhood',
    y='price',        
    palette='Set2'   
)

plt.yscale('log')

plt.title('Listing Prices by ZIP Code (Top 5)', fontsize=18)
plt.xlabel('ZIP Code', fontsize=14)
plt.ylabel('Price ($) - Log Scale', fontsize=14)

plt.show()

In [None]:
sns.set(style="whitegrid")

top_zipcodes = df_filtered['neighbourhood'].value_counts().head(5).index.tolist()

df_zip = df_filtered[df_filtered['neighbourhood'].isin(top_zipcodes)].copy()

df_zip['neighbourhood'] = df_zip['neighbourhood'].astype(str)

plt.figure(figsize=(12,8))
sns.histplot(
    data=df_zip,
    x='price',
    hue='neighbourhood',
    bins=50,
    kde=True,
    log_scale=(True, False)
)

plt.title('Distribution of Listing Prices by ZIP Code (Top 5)', fontsize=18)
plt.xlabel('Price ($) - Log Scale', fontsize=14)
plt.ylabel('Number of Listings', fontsize=14)

plt.legend(title='ZIP Code')
plt.show()

In [10]:
df.isnull().any()
df = df.drop(columns=['license', 'host_name', 'name', 'host_id', 'id'], errors='ignore')

df['price'] = df['price'].fillna(df['price'].median())
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].median())

df['log_minimum_nights'] = np.log1p(df['minimum_nights'])
df['log_number_of_reviews'] = np.log1p(df['number_of_reviews'])
df['log_price'] = np.log1p(df['price'])

# found coordinates for downtown austin
downtown_lat, downtown_lon = 30.2711, -97.7437

# found the distance between two coordinates
df['distance_to_center'] = np.sqrt(
    (69 * (df['latitude'] - downtown_lat))**2 +
    (54.6 * (df['longitude'] - downtown_lon))**2
)

df.dropna()

Unnamed: 0_level_0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,log_minimum_nights,log_number_of_reviews,log_price,distance_to_center
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [11]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_x = enc.fit_transform(df[['room_type']])

column_names = enc.categories_[0]
encoded_df = pd.DataFrame(encoded_x, columns=column_names, index=df.index)

df = pd.concat([df, encoded_df], axis = 1)

X = df[['neighbourhood','latitude','longitude', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
        'latitude', 'log_number_of_reviews', 'calculated_host_listings_count', 'availability_365', 'log_minimum_nights', 'distance_to_center']]

y = df['log_price'].astype(float)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [13]:
def forward_selection(X, y):
    selected_features = []
    while True:
        remaining_features = [f for f in X.columns if f not in selected_features]
        new_pval = pd.Series(index=remaining_features)
        for feature in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[selected_features + [feature]])).fit()
            new_pval[feature] = model.pvalues[feature]
        min_pval = new_pval.min()
        if min_pval < 0.01:
            selected_features.append(new_pval.idxmin())
        else:
            break
    final_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()

    return selected_features, final_model

selected, model = forward_selection(X_train, y_train)

print("Selected features:", selected)

reg = LinearRegression()
reg.fit(X_train[selected], y_train)
y_pred = reg.predict(X_test[selected])

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))


Selected features: ['Private room', 'log_minimum_nights', 'neighbourhood', 'Shared room', 'longitude', 'log_number_of_reviews', 'availability_365', 'distance_to_center', 'latitude', 'calculated_host_listings_count', 'Entire home/apt', 'Hotel room']
Root Mean Squared Error: 0.6650229040193892


In [14]:
def backward_elimination(X, y, threshold_in=0.01):
    selected_features = list(X.columns)

    while True:
        model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
        pvalues = model.pvalues.iloc[1:]

        max_pval = pvalues.max()

        if max_pval > threshold_in:
            excluded_feature = pvalues.idxmax()
            selected_features.remove(excluded_feature)
        else:
            break
    final_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
    return selected_features, final_model

selected, model = backward_elimination(X_train, y_train)

print("Selected features:", selected)

reg = LinearRegression()
reg.fit(X_train[selected], y_train)
y_pred = reg.predict(X_test[selected])

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Selected features: ['neighbourhood', 'latitude', 'longitude', 'Entire home/apt', 'Hotel room', 'Private room', 'Shared room', 'latitude', 'log_number_of_reviews', 'calculated_host_listings_count', 'availability_365', 'log_minimum_nights', 'distance_to_center']
Root Mean Squared Error: 0.6650229040193903


In [15]:
# !pip install print-versions
# from print_versions import print_versions

# import numpy as np
# from pandas import DataFrame

# print_versions(globals())

pandas==2.2.2
seaborn==0.13.2
numpy==2.0.2
mlxtend==0.23.4
statsmodels.api==0.14.4
pip==24.1.2
statsmodels==0.14.4
