In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
# linear modelling
import statsmodels.api as sm
from scipy.stats import linregress
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
# metrics
from sklearn.metrics import mean_squared_error
import seaborn as sn
from census import Census

# Census API Key
from config import api_key
c = Census(api_key, year=2017)

In [3]:
df=pd.read_csv("Airbnb_Texas_Rentals.csv")
df

Unnamed: 0.1,Unnamed: 0,average_rate_per_night,bedrooms_count,city,date_of_listing,description,latitude,longitude,title,url
0,1,$27,2,Humble,May-16,Welcome to stay in private room with queen bed...,30.020138,-95.293996,2 Private rooms/bathroom 10min from IAH airport,https://www.airbnb.com/rooms/18520444?location...
1,2,$149,4,San Antonio,Nov-10,"Stylish, fully remodeled home in upscale NW – ...",29.503068,-98.447688,Unique Location! Alamo Heights - Designer Insp...,https://www.airbnb.com/rooms/17481455?location...
2,3,$59,1,Houston,Jan-17,'River house on island close to the city' \nA ...,29.829352,-95.081549,River house near the city,https://www.airbnb.com/rooms/16926307?location...
3,4,$60,1,Bryan,Feb-16,Private bedroom in a cute little home situated...,30.637304,-96.337846,Private Room Close to Campus,https://www.airbnb.com/rooms/11839729?location...
4,5,$75,2,Fort Worth,Feb-17,Welcome to our original 1920's home. We recent...,32.747097,-97.286434,The Porch,https://www.airbnb.com/rooms/17325114?location...
...,...,...,...,...,...,...,...,...,...,...
18254,18255,$60,1,Dallas,Mar-13,An entire 1 bedroom 700+sqft condo in the hear...,32.892303,-96.772049,Quiet comfort living in Dallas,https://www.airbnb.com/rooms/1011576?location=...
18255,18256,$99,2,San Antonio,Jun-15,An inviting 1920's cottage home in a popular u...,29.452893,-98.486756,Midtown Cottage Near Riverwalk,https://www.airbnb.com/rooms/18766940?location...
18256,18257,$13,1,Dallas,Dec-16,Amazing 3BHK Apartment in a picturesque Commun...,33.001955,-96.777615,Room in Dallas!,https://www.airbnb.com/rooms/18719059?location...
18257,18258,$65,2,San Antonio,Oct-16,My quaint and cozy home is conveniently locate...,29.450142,-98.505333,Vibrant Spacious Loft!,https://www.airbnb.com/rooms/18179329?location...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18259 entries, 0 to 18258
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              18259 non-null  int64  
 1   average_rate_per_night  18231 non-null  object 
 2   bedrooms_count          18256 non-null  object 
 3   city                    18259 non-null  object 
 4   date_of_listing         18259 non-null  object 
 5   description             18257 non-null  object 
 6   latitude                18225 non-null  float64
 7   longitude               18225 non-null  float64
 8   title                   18256 non-null  object 
 9   url                     18259 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.4+ MB


In [6]:
df.count()

Unnamed: 0                18259
average_rate_per_night    18231
bedrooms_count            18256
city                      18259
date_of_listing           18259
description               18257
latitude                  18225
longitude                 18225
title                     18256
url                       18259
dtype: int64

In [10]:
clean_df = df.dropna()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18217 entries, 0 to 18258
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              18217 non-null  int64  
 1   average_rate_per_night  18217 non-null  object 
 2   bedrooms_count          18217 non-null  object 
 3   city                    18217 non-null  object 
 4   date_of_listing         18217 non-null  object 
 5   description             18217 non-null  object 
 6   latitude                18217 non-null  float64
 7   longitude               18217 non-null  float64
 8   title                   18217 non-null  object 
 9   url                     18217 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.5+ MB


In [17]:
# summary data frame
clean_df2 = clean_df.loc[:, ["city", "date_of_listing", "bedrooms_count", "average_rate_per_night", "latitude", "longitude"]]
clean_df2

Unnamed: 0,city,date_of_listing,bedrooms_count,average_rate_per_night,latitude,longitude
0,Humble,May-16,2,$27,30.020138,-95.293996
1,San Antonio,Nov-10,4,$149,29.503068,-98.447688
2,Houston,Jan-17,1,$59,29.829352,-95.081549
3,Bryan,Feb-16,1,$60,30.637304,-96.337846
4,Fort Worth,Feb-17,2,$75,32.747097,-97.286434
...,...,...,...,...,...,...
18254,Dallas,Mar-13,1,$60,32.892303,-96.772049
18255,San Antonio,Jun-15,2,$99,29.452893,-98.486756
18256,Dallas,Dec-16,1,$13,33.001955,-96.777615
18257,San Antonio,Oct-16,2,$65,29.450142,-98.505333


In [18]:
clean_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18217 entries, 0 to 18258
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    18217 non-null  object 
 1   date_of_listing         18217 non-null  object 
 2   bedrooms_count          18217 non-null  object 
 3   average_rate_per_night  18217 non-null  object 
 4   latitude                18217 non-null  float64
 5   longitude               18217 non-null  float64
dtypes: float64(2), object(4)
memory usage: 996.2+ KB


In [29]:
clean_df2["clean_dates"]=[x+"-2017" for x in clean_df2.date_of_listing]
clean_df2["clean_dates"]=pd.to_datetime(clean_df2["clean_dates"])

clean_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18217 entries, 0 to 18258
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   city                    18217 non-null  object        
 1   date_of_listing         18217 non-null  object        
 2   bedrooms_count          18217 non-null  object        
 3   average_rate_per_night  18217 non-null  object        
 4   latitude                18217 non-null  float64       
 5   longitude               18217 non-null  float64       
 6   clean_dates             18217 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 1.1+ MB


In [30]:
clean_df2.head()

Unnamed: 0,city,date_of_listing,bedrooms_count,average_rate_per_night,latitude,longitude,clean_dates
0,Humble,May-16,2,$27,30.020138,-95.293996,2017-05-16
1,San Antonio,Nov-10,4,$149,29.503068,-98.447688,2017-11-10
2,Houston,Jan-17,1,$59,29.829352,-95.081549,2017-01-17
3,Bryan,Feb-16,1,$60,30.637304,-96.337846,2017-02-16
4,Fort Worth,Feb-17,2,$75,32.747097,-97.286434,2017-02-17


In [32]:
clean_df2["months"]=[x.month for x in clean_df2.clean_dates]
clean_df2.head()

Unnamed: 0,city,date_of_listing,bedrooms_count,average_rate_per_night,latitude,longitude,clean_dates,months
0,Humble,May-16,2,$27,30.020138,-95.293996,2017-05-16,5
1,San Antonio,Nov-10,4,$149,29.503068,-98.447688,2017-11-10,11
2,Houston,Jan-17,1,$59,29.829352,-95.081549,2017-01-17,1
3,Bryan,Feb-16,1,$60,30.637304,-96.337846,2017-02-16,2
4,Fort Worth,Feb-17,2,$75,32.747097,-97.286434,2017-02-17,2


In [40]:
# Create the bins in which Data will be held
# Bins are  12-1-2016, 7-1-2016, 1-1-2018
bins = [0, 3, 6, 9, 12]

# Create the names for the five bins
group_names = ["First", "Second", "Third", "Fourth"]


In [47]:
clean_df2["Date_Quarter"] = pd.cut(clean_df2["months"], bins, labels=group_names, include_lowest=True)
clean_df2


Unnamed: 0,city,date_of_listing,bedrooms_count,average_rate_per_night,latitude,longitude,clean_dates,months,Date Bin,Date,Date(Quarter),Date_Quarter
0,Humble,May-16,2,$27,30.020138,-95.293996,2017-05-16,5,Second,Second,Second,Second
1,San Antonio,Nov-10,4,$149,29.503068,-98.447688,2017-11-10,11,Fourth,Fourth,Fourth,Fourth
2,Houston,Jan-17,1,$59,29.829352,-95.081549,2017-01-17,1,First,First,First,First
3,Bryan,Feb-16,1,$60,30.637304,-96.337846,2017-02-16,2,First,First,First,First
4,Fort Worth,Feb-17,2,$75,32.747097,-97.286434,2017-02-17,2,First,First,First,First
...,...,...,...,...,...,...,...,...,...,...,...,...
18254,Dallas,Mar-13,1,$60,32.892303,-96.772049,2017-03-13,3,First,First,First,First
18255,San Antonio,Jun-15,2,$99,29.452893,-98.486756,2017-06-15,6,Second,Second,Second,Second
18256,Dallas,Dec-16,1,$13,33.001955,-96.777615,2017-12-16,12,Fourth,Fourth,Fourth,Fourth
18257,San Antonio,Oct-16,2,$65,29.450142,-98.505333,2017-10-16,10,Fourth,Fourth,Fourth,Fourth


In [48]:
# summary data frame
clean_df3 = clean_df2.loc[:, ["city", "Date_Quarter", "bedrooms_count", "average_rate_per_night", "latitude", "longitude"]]
clean_df3

Unnamed: 0,city,Date_Quarter,bedrooms_count,average_rate_per_night,latitude,longitude
0,Humble,Second,2,$27,30.020138,-95.293996
1,San Antonio,Fourth,4,$149,29.503068,-98.447688
2,Houston,First,1,$59,29.829352,-95.081549
3,Bryan,First,1,$60,30.637304,-96.337846
4,Fort Worth,First,2,$75,32.747097,-97.286434
...,...,...,...,...,...,...
18254,Dallas,First,1,$60,32.892303,-96.772049
18255,San Antonio,Second,2,$99,29.452893,-98.486756
18256,Dallas,Fourth,1,$13,33.001955,-96.777615
18257,San Antonio,Fourth,2,$65,29.450142,-98.505333
