# Data Science for Biomedical Informatics: Final Project 
## The main question we are trying to answer is: What factors influence Airbnb pricing?
*The dataset we are using comes from Kaggle*


By: Varsha Shashidhar + Sriya Nimmagadda

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [30]:
# Load the raw Airbnb listings and derive the cleaned frame used for analysis.
#
# `df` keeps the raw rows exactly as parsed; `df_cleaned` is built from it in a
# single non-mutating chain so this cell can be re-run safely and never writes
# into a slice of `df` (which is what triggers SettingWithCopyWarning).
df = pd.read_csv(
    'AB_US_2023.csv',
    encoding='latin1',
    # Malformed rows are skipped silently, so the parsed row count can be
    # smaller than the number of lines in the file.
    on_bad_lines='skip',
    # NOTE(review): the original author flagged the python engine as required
    # for this file; presumably the default C parser fails on it -- confirm.
    engine='python',
)

# Rows missing any of these columns are dropped.
REQUIRED_COLUMNS = ['name', 'host_name', 'reviews_per_month', 'last_review']
# Listings priced at or below zero, or above this cap, are treated as invalid/outliers.
MAX_PRICE = 1000
# Listings requiring a longer minimum stay than this are removed.
MAX_MINIMUM_NIGHTS = 365

df_cleaned = (
    df
    # drop rows with missing critical values
    .dropna(subset=REQUIRED_COLUMNS)
    # fill missing 'neighbourhood_group' with 'Unknown'
    .assign(neighbourhood_group=lambda d: d['neighbourhood_group'].fillna('Unknown'))
    # remove listings with price <= 0 or extremely high prices
    .loc[lambda d: (d['price'] > 0) & (d['price'] <= MAX_PRICE)]
    # remove listings with minimum_nights > MAX_MINIMUM_NIGHTS
    .loc[lambda d: d['minimum_nights'] <= MAX_MINIMUM_NIGHTS]
    .reset_index(drop=True)
)

# NOTE(review): the recorded `.info()` output for this frame reports `id`,
# `host_id`, `longitude` and `number_of_reviews` as object dtype; they likely
# need pd.to_numeric(..., errors='coerce') before modelling -- confirm.

df_cleaned

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,city
0,958,"Bright, Modern Garden Unit - 1BR/1BTH",1169,Holly,Unknown,Western Addition,37.770280,-122.43317,Entire home/apt,202.0,2.0,383,2023-02-19,2.31,1.0,128.0,59,San Francisco
1,5858,Creative Sanctuary,8904,Philip And Tania,Unknown,Bernal Heights,37.744740,-122.42089,Entire home/apt,235.0,30.0,111,2017-08-06,0.66,1.0,365.0,0,San Francisco
2,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,21994,Aaron,Unknown,Haight Ashbury,37.765550,-122.45213,Private room,56.0,32.0,9,2022-10-27,0.09,13.0,365.0,1,San Francisco
3,8339,Historic Alamo Square Victorian,24215,Rosy,Unknown,Western Addition,37.775640,-122.43642,Entire home/apt,575.0,9.0,28,2019-06-28,0.17,2.0,365.0,0,San Francisco
4,8739,"Mission Sunshine, with Private Bath",7149,Ivan & Wendy,Unknown,Mission,37.760300,-122.42197,Private room,110.0,1.0,770,2023-02-25,4.65,2.0,159.0,34,San Francisco
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180002,837369168777380540,Cozy Bedroom for Traveler,417706467,Qiuying,Unknown,Unincorporated Areas,37.631403,-122.40495392301824,Private room,60.0,1.0,2,2023-03-21,2.00,4.0,26.0,2,San Mateo County
180003,837376796422514183,Lovely Bedroom close to SFO,417706467,Qiuying,Unknown,Unincorporated Areas,37.629656,-122.40447596302054,Private room,52.0,1.0,1,2023-03-17,1.00,4.0,24.0,1,San Mateo County
180004,837858369497137832,Cozy bedroom in the heart of Silicon Valley,6887379,Jimena,Unknown,Unincorporated Areas,37.459521,-122.2657135879798,Entire home/apt,59.0,1.0,1,2023-03-11,1.00,3.0,252.0,1,San Mateo County
180005,838031150138816300,"246C - Large Studio w/ Kitchenette, Laundry & ...",250584391,Victor 5,Unknown,San Mateo,37.528251,-122.29782616483044,Entire home/apt,100.0,2.0,1,2023-03-19,1.00,18.0,171.0,1,San Mateo County


In [38]:
# Sanity check: compare the raw and cleaned row/column counts, then inspect
# the cleaned frame's dtypes and non-null counts.
# A bare `df.shape` in the middle of a cell is evaluated and thrown away (only
# the last expression of a cell is displayed), so the shapes are printed
# explicitly here.
print(f"raw shape:     {df.shape}")
print(f"cleaned shape: {df_cleaned.shape}")
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180007 entries, 0 to 180006
Data columns (total 18 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              179999 non-null  object 
 1   name                            180007 non-null  object 
 2   host_id                         180007 non-null  object 
 3   host_name                       180007 non-null  object 
 4   neighbourhood_group             180007 non-null  object 
 5   neighbourhood                   180007 non-null  object 
 6   latitude                        180007 non-null  float64
 7   longitude                       180007 non-null  object 
 8   room_type                       180007 non-null  object 
 9   price                           180007 non-null  float64
 10  minimum_nights                  180007 non-null  float64
 11  number_of_reviews               180007 non-null  object 
 12  last_review     