In [None]:
import pandas as pd
import numpy as np

# Data Upload and Viewing

In [None]:
# load airbnb data
gz = pd.read_csv('http://data.insideairbnb.com/united-states/ca/los-angeles/2021-04-07/data/listings.csv.gz')

In [None]:
print(gz.shape)
gz.head()

In [None]:
# list every column of the raw listings frame; `cols` is indexed by position
# in the next cell to pick the columns worth keeping
cols = gz.columns
gz.columns

In [None]:
# positions (within `cols`) of the columns to keep, resolved to their names
save = [5,6,27,29,30,35,36, 40,41]
cols_to_save = [cols[position] for position in save]

print(type(cols_to_save[0]))

cols_to_save

In [None]:
# create a new dataframe with only the needed columns
df = gz[cols_to_save].copy()
df.head()

In [None]:
# print each column name followed by the Python type of its value at label 1
for column_name in df.columns:
  sample_type = type(df[column_name][1])
  print(column_name, sample_type, sep='\n')

# Data Pre-Processing

In [None]:
df.isnull().sum()

In [None]:
# fill null values: each column gets its own placeholder
null_fills = {
    'description': "None",
    'neighborhood_overview': "None",
    'bathrooms_text': "1",
    'bedrooms': 1,
}
for column_name, fill_value in null_fills.items():
  df[column_name] = df[column_name].fillna(fill_value)

In [None]:
# see if I missed any
df.isnull().sum()

In [None]:
import re
def fix_bathroom(text):
  """
  Convert a bathrooms description (e.g. "1.5 shared baths") to a float.

  Text ending in "half-bath" (case-insensitive) maps to 0.5. Otherwise the
  first number found in the text is returned.

  Args:
    text: the bathrooms description string.

  Returns:
    The bathroom count as a float.

  Raises:
    ValueError: if the text is not a half-bath and contains no number.
  """
  text = text.lower().strip()

  if text.endswith('half-bath'):
    return 0.5

  # Take the first numeric token rather than gluing every digit and dot
  # together: "2 baths, 1 shower" must give 2.0 (not 21.0), and a trailing
  # period as in "1.5 baths." must not break float().
  match = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
  if match is None:
    raise ValueError('no bathroom count found in {!r}'.format(text))

  return float(match.group())

def to_float(num):
  """Return `num` converted to a float."""
  as_float = float(num)
  return as_float


def fix_price(price_str):
  """
  Convert a price string such as "$1,250.00" to whole dollars as an int.

  Currency symbols and thousands separators are dropped and any cents are
  truncated.

  Args:
    price_str: the price as text.

  Returns:
    The whole-dollar amount as an int.

  Raises:
    ValueError: if the string contains no digits.
  """
  # Keep the decimal point so the cents can be located explicitly; chopping
  # the last two digits blindly breaks on prices without cents ("$85").
  cleaned = re.sub(r'[^0-9.]', '', price_str)
  if not any(ch.isdigit() for ch in cleaned):
    raise ValueError('no price found in {!r}'.format(price_str))

  dollars = cleaned.split('.', 1)[0]

  # An empty dollars part means a cents-only price such as "$.50".
  return int(dollars) if dollars else 0

In [None]:
# adjust columns to appropriate data format
df['bathrooms'] = df['bathrooms_text'].apply(fix_bathroom)
df['minimum_nights'] = df['minimum_nights'].apply(to_float)
df['target'] = gz['price'].apply(fix_price)

In [None]:
# remove outliers: keep rows whose price, bathrooms, bedrooms and
# minimum_nights all fall inside the accepted ranges
price_ok = (df['target'] >= 25) & (df['target'] <= 2000)
baths_ok = (df['bathrooms'] >= 1) & (df['bathrooms'] < 4)
beds_ok = (df['bedrooms'] >= 1) & (df['bedrooms'] <= 4)
nights_ok = (df['minimum_nights'] >= 1) & (df['minimum_nights'] <= 30)

df = df[price_ok & baths_ok & beds_ok & nights_ok]
df.shape

# Model

In [None]:
df.columns

In [None]:
# Drop the free-text columns before modelling. Reassigning (instead of
# inplace=True) avoids mutating a filtered slice, and errors='ignore' keeps
# the cell safe to re-run once the columns are already gone.
# NOTE(review): later NLP cells still read df['description'] and
# df['bathrooms_text'] — confirm the intended cell order.
df = df.drop(columns = ['description', 'neighborhood_overview', 'bathrooms_text'], errors='ignore')

In [None]:
df['minimum_nights'].describe()

In [None]:
df.columns

In [None]:
X = df.drop(columns=['target','latitude','longitude', 'neighbourhood_cleansed'])
y = df['target']

In [None]:
!pip install category_encoders==2.*

In [None]:
# one hot encode
import category_encoders as ce

encoder = ce.OneHotEncoder(use_cat_names = True)

X = encoder.fit_transform(X)

In [None]:
X.columns

In [None]:


# # # define base model
# def baseline_model():

#   # create model
#   model = Sequential()
#   model.add(Dense(5, input_dim=3, kernel_initializer='normal', activation='relu'))
#   model.add(Dense(6, kernel_initializer='normal', activation='relu'))
#   model.add(Dense(1, kernel_initializer='normal'))

#   # Compile model
#   model.compile(loss='mean_squared_error', optimizer='adam')

#   return model

# # evaluate model with standardized dataset
# estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)))
# pipeline = Pipeline(estimators)
# kfold = KFold(n_splits=5)
# results = cross_val_score(pipeline, X, y, cv=kfold)
# print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def create_model(input_dim=4):
  """
  Build and compile the small feed-forward regression network.

  Args:
    input_dim: number of feature columns fed to the first layer. Defaults to
      4, matching the standalone model built later in this notebook and the
      four-value rows passed to model.predict (the previous hard-coded 3 did
      not match either).

  Returns:
    A compiled Sequential model (mean squared error loss, adam optimizer).
  """
  # create model: input -> 5 -> 6 -> 1
  model = Sequential()
  model.add(Dense(5, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
  model.add(Dense(6, kernel_initializer='normal', activation='relu'))
  model.add(Dense(1, kernel_initializer='normal'))

  # Compile model
  model.compile(loss='mean_squared_error', optimizer='adam')

  return model

# # evaluate model with standardized dataset
# estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
# pipeline = Pipeline(estimators)
# kfold = KFold(n_splits=5)
# results = cross_val_score(pipeline, X, y, cv=kfold)
# print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
model = create_model()

In [None]:
from keras.models import Sequential
from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasRegressor
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline

# regression network declared in one go: 4 inputs -> 5 -> 6 -> 1
model = Sequential([
    Dense(5, input_dim=4, kernel_initializer='normal', activation='relu'),
    Dense(6, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal'),
])
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=42)

In [None]:
type(y)

In [None]:
# Train on the training split only. Fitting on the full X / y would include
# the rows of X_test, so the reported validation loss would be measured on
# data the model has already seen.
model.fit(X_train, y_train,
          epochs = 122,
          batch_size=16,
          validation_data=(X_test, y_test)
          )

In [None]:
y[0:5]

In [None]:
X.head(5)

In [None]:
model.predict([[2,30,730,2]])

In [None]:
model.predict([[4, 30, 1, 2]])

In [None]:
# zip the model
!zip -r ./nn.zip ./nn_model/

In [None]:
# download to local machine
from google.colab import files
files.download("./nn.zip")

In [None]:
# test uploading saved model
from tensorflow import keras
model = keras.models.load_model('./nn_model')

prediction = model.predict([[2,30,730,2]])
prediction[0][0].round()

# Natural Language Processing

In [None]:
# Start NLP 
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import re
from nltk.stem import PorterStemmer
import spacy
from spacy.tokenizer import Tokenizer

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
# Initialize spacy model & tokenizer

nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Create tokenize function

def tokenize(text):
    """
    Split text into lowercase alphanumeric tokens.

    Every character that is not an ASCII letter, digit or space is treated
    as a separator.

    Args:
        text: the string to tokenize.

    Returns:
        List of lowercase tokens (empty when text has no letters or digits).
    """
    # Only the leading '^' negates a character class; a second '^' inside it
    # is a literal caret and would let '^' through into the tokens.
    cleaned = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)

    return cleaned.lower().split()

def fix_bathroom(text):
  """
  Convert a bathrooms description (e.g. "1.5 shared baths") to a float.

  Text ending in "half-bath" (case-insensitive) maps to 0.5. Otherwise the
  first number found in the text is returned.

  Args:
    text: the bathrooms description string.

  Returns:
    The bathroom count as a float.

  Raises:
    ValueError: if the text is not a half-bath and contains no number.
  """
  text = text.lower().strip()

  if text.endswith('half-bath'):
    return 0.5

  # Take the first numeric token rather than gluing every digit and dot
  # together: "2 baths, 1 shower" must give 2.0 (not 21.0), and a trailing
  # period as in "1.5 baths." must not break float().
  match = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
  if match is None:
    raise ValueError('no bathroom count found in {!r}'.format(text))

  return float(match.group())


def fix_price(price_str):
  """
  Convert a price string such as "$1,250.00" to whole dollars as an int.

  Currency symbols and thousands separators are dropped and any cents are
  truncated.

  Args:
    price_str: the price as text.

  Returns:
    The whole-dollar amount as an int.

  Raises:
    ValueError: if the string contains no digits.
  """
  # Keep the decimal point so the cents can be located explicitly; chopping
  # the last two digits blindly breaks on prices without cents ("$85").
  cleaned = re.sub(r'[^0-9.]', '', price_str)
  if not any(ch.isdigit() for ch in cleaned):
    raise ValueError('no price found in {!r}'.format(price_str))

  dollars = cleaned.split('.', 1)[0]

  # An empty dollars part means a cents-only price such as "$.50".
  return int(dollars) if dollars else 0

In [None]:
df.columns

In [None]:
# convert str prices to int prices
gz['target'] = gz['price'].apply(fix_price)

In [None]:
# token_cols is only created in the "tokenize dataframe" cell further down;
# displaying it here raised NameError on a fresh kernel, so inspect it there.

In [None]:
type(df['bathrooms_text'][0])

In [None]:
df['bathrooms']

In [None]:
# tokenize dataframe
# Select the columns to tokenize by dtype instead of dropping a hand-written
# list of numeric names: tokenize() only accepts strings, and the old list
# missed numeric columns such as target.
# - bathrooms_text is skipped (it is parsed into the numeric bathrooms column)
# - *_token columns are skipped so the cell can be re-run safely
# NOTE(review): assumes text columns are stored with object dtype — confirm.
token_cols = [
    col for col in df.select_dtypes(include='object').columns
    if col != 'bathrooms_text' and not col.endswith('_token')
]

for i in token_cols:
  df[i+"_token"] = df[i].apply(tokenize)

In [None]:
df[['bedrooms', 'bathrooms', 'latitude', 'longitude']].describe()

In [None]:
# remove outliers: re-attach the parsed price, then keep rows whose price,
# bathrooms and bedrooms all fall inside the accepted ranges
df['target'] = gz['target']

price_ok = (df['target'] >= 25) & (df['target'] <= 2000)
baths_ok = (df['bathrooms'] >= 1) & (df['bathrooms'] < 4)
beds_ok = (df['bedrooms'] >= 1) & (df['bedrooms'] <= 4)

df = df[price_ok & baths_ok & beds_ok]

In [None]:
df.head()

In [None]:
# Counter Function - takes a corpus of document and returns dataframe of word counts

from collections import Counter
 
word_counts = Counter()

def count(docs):
    """
    Summarise token frequencies over a corpus of tokenized documents.

    Args:
        docs: sized collection of documents, each an iterable of tokens.

    Returns:
        DataFrame with one row per distinct token and the columns
        word, appears_in, count, rank, pct_total, cul_pct_total,
        appears_in_pct.
    """
    n_docs = len(docs)

    term_freq = Counter()   # total occurrences of each token
    doc_freq = Counter()    # number of documents containing each token
    for tokens in docs:
        term_freq.update(tokens)
        doc_freq.update(set(tokens))

    wc = pd.DataFrame(list(term_freq.items()), columns = ['word', 'count'])

    # rank 1 = most frequent; ties are broken by order of first appearance
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    grand_total = wc['count'].sum()
    wc['pct_total'] = wc['count'].apply(lambda occurrences: occurrences / grand_total)

    # cumulative share of all tokens, accumulated in rank order
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    ac = pd.DataFrame(list(doc_freq.items()), columns=['word', 'appears_in'])
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda hits: hits / n_docs)

    return wc

In [None]:
wc_neighborhood = count(df['neighbourhood_cleansed_token'])

wc_neighborhood.sort_values(by='rank')

In [None]:
wc_neighborhood.shape

In [None]:
#Cumulative Distribution Plot

sns.lineplot(x='rank', y='cul_pct_total', data=wc_neighborhood);

In [None]:
# inspect some descriptions

df['description'].iloc[0]

In [None]:
df['description'].iloc[1]

In [None]:
df['description'][0]

In [None]:
def clean(text):
  """
  Strip listing markup from a pandas Series of strings.

  Args:
    text: Series of strings (it is accessed through the .str accessor).

  Returns:
    Series with '<br /><br />', '<b>' and '</b><br />' replaced by a space
    and '*' removed.
  """
  # regex=False makes every pattern a literal on all pandas versions; read as
  # a regular expression a lone '*' is invalid ("nothing to repeat").
  replacements = [
    ('<br /><br />', ' '),
    ('<b>', ' '),
    ('</b><br />', ' '),
    ('*', ''),
  ]
  for old, new in replacements:
    text = text.str.replace(old, new, regex=False)

  return text

def clean_description(text):
  """Lowercase a description and turn every non-letter character into a space."""
  letters_only = re.sub(r'[^a-zA-Z]', ',', text)
  return letters_only.lower().replace(',', ' ')

In [None]:
df['cleaned_desc'] = df['description'].apply(clean_description)

In [None]:
# inspect cleaned descriptions and look for stop words
df['cleaned_desc'].iloc[0]

In [None]:
# Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate Vectorizer
tfidf = TfidfVectorizer(stop_words='english', 
                        max_features=5000)

# Create a vocabulary and tf-idf score per description
dtm = tfidf.fit_transform(df['cleaned_desc'])

# Get feature names to use as dataframe column headers.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
  feature_names = tfidf.get_feature_names_out()
else:
  feature_names = tfidf.get_feature_names()

# toarray() gives a plain ndarray rather than the legacy np.matrix from todense()
general_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

general_dtm.head()

In [None]:
general_dtm.shape

In [None]:
df.shape

# Seasons

In [None]:
cal = pd.read_csv('http://data.insideairbnb.com/united-states/ca/los-angeles/2021-04-07/data/calendar.csv.gz')

In [None]:
print(cal.shape)
cal.head()

In [None]:
cal[cal['listing_id']==35922]

In [None]:
def date_to_season(date):
  """
  Takes the string of a date and returns the season it falls in.

  The month is read from characters 5-6 of the string, so an ISO-style
  "YYYY-MM-DD" value is assumed.

  Args:
    date: date string with a two-digit month at positions 5-6.

  Returns:
    'Winter' for months 12, 1 and 2, 'Spring' for 3-5, 'Summer' for 6-8,
    and 'Fall' otherwise.

  Raises:
    ValueError: if characters 5-6 are not an integer.
  """
  month = int(date[5:7])

  # December is checked explicitly because it belongs with January/February.
  if month <= 2 or month == 12:
    return 'Winter'
  if month <= 5:
    return 'Spring'
  if month <= 8:
    return 'Summer'
  return 'Fall'

In [None]:
date_to_season(cal['date'][0])

In [None]:
# Remove any Season column left over from a previous run so the next cell
# rebuilds it. errors='ignore' avoids a KeyError on a fresh kernel (the column
# does not exist yet), and the result is assigned back because drop() returns
# a new frame instead of modifying cal.
cal = cal.drop(columns='Season', errors='ignore')

In [None]:
cal['Season'] = cal['date'].apply(date_to_season)
cal.head()

In [None]:
listed = cal.listing_id.unique()
len(listed)

In [None]:
temp = cal[cal['listing_id']==35922]
temp

In [None]:
seasons = temp.Season.unique()
season_str = np.array2string(seasons)
type(season_str)

In [None]:
new_season = list()

new_season.append(season_str)

new_season

In [None]:
def define_seasons(data):
  """
  takes the dataframe and changes the 
  """