In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as ex
import plotly.graph_objects as go
from collections import Counter
from wordcloud import WordCloud, STOPWORDS

In [None]:
df=pd.read_csv("/kaggle/input/us-airbnb-open-data/AB_US_2020.csv")
df.dtypes

In [None]:
rows=2
cols=5
fig,ax=plt.subplots(rows,cols,figsize=(20,8))
d=df.select_dtypes(include=['float64','int64']).columns
for row in range(rows):
    for col in range(cols):
        ax[row,col].hist(df[d[cols*row+col]])
        ax[row,col].set_xlabel(d[cols*row+col])
plt.show()   

# Outlier removal                
Removing the outliers from the price column. As most of the listings have price less than 200 and more than 40 we remove the other listings for better predictions when trying to predict the price per night for the listings.

In [None]:
sns.distplot(df[(df['price']<180) & (df['price']>40)]['price'],kde=False,bins=10)

In [None]:
df=df[(df['price']<180) & (df['price']>40)]
len(df)

There are still enough listings for us to train a model and predict the prices.

In [None]:
rows=2
cols=5
fig,ax=plt.subplots(rows,cols,figsize=(20,8))
d=df.select_dtypes(include=['float64','int64']).columns
for row in range(rows):
    for col in range(cols):
        ax[row,col].hist(df[d[cols*row+col]])
        ax[row,col].set_xlabel(d[cols*row+col])
plt.show()   

Now the price values are much more evenly distributed and hence we can get fair predictions from our model.

In [None]:
city=df.groupby('city').agg({'id':'count','price':np.mean,'number_of_reviews':['sum',np.mean],'availability_365':np.mean})
city.head()

In [None]:
fig=plt.figure(figsize=(10,10))
ax=sns.barplot(y=city.index,x=city.id['count'])
ax.set_xlabel('Listing from respective cities')
ax.set_title('Number of Listing vs cities')
plt.show()

New York city clearly stands out in the total number of listings from a respective city.

In [None]:
fig=plt.figure(figsize=(10,10))
ax=sns.barplot(y=city.index,x=city.price['mean'])
ax.set_xlabel('Average price per night from respective cities')
ax.set_title('Price vs city')
plt.show()

The prices are fairly evenly distributed given that we have already removed the outliers.

In [None]:
fig=plt.figure(figsize=(10,10))
ax=sns.barplot(y=city.index,x=city.availability_365['mean'])
ax.set_xlabel('Average availablitiy over the year from respective cities')
ax.set_title('Availability vs city')
plt.show()

In [None]:
fig=plt.figure(figsize=(10,10))
ax=sns.barplot(y=city.index,x=city.number_of_reviews['mean'])
ax.set_xlabel('Average number of reviews of listings in city')
ax.set_title('Average number of reviews vs city')
plt.show()

In [None]:
states_dic = {'Asheville':'NC','Austin':'TX','Boston':'MA','Broward County':'FL','Cambridge':'MA','Chicago':'IL','Clark County':'NV','Columbus':'OH','Denver':'CO','Hawaii':'HI','Jersey City':'NJ',
             'Los Angeles':'SC','Nashville':'TN','New Orleans':'MS','New York City':'NY','Oakland':'CA','Pacific Grove':'CA','Portland':'OR','Rhode Island':'RI','Salem':'MA','San Clara Country':'CA',
             'Santa Cruz County':'CA','San Diego':'CA','San Francisco':'CA','San Mateo County':'CA','Seattle':'WA','Twin Cities MSA':'MN','Washington D.C.':'DC'}
df['States']=df['city'].apply(lambda x: states_dic[x])

In [None]:
states=df.groupby("States").agg({'id':'count','price':np.mean,'number_of_reviews':['sum',np.mean],'availability_365':np.mean})

In [None]:
fig=ex.choropleth(locations=states.index,color=states.price['mean'],locationmode='USA-states',title="Average Airbnb price of listings from states",scope='usa',color_continuous_scale=ex.colors.diverging.Portland)
fig.show()

In [None]:
fig=ex.choropleth(locations=states.index,color=states.number_of_reviews['mean'],locationmode='USA-states',title="Average reviews of listings from states",scope='usa',color_continuous_scale=ex.colors.diverging.Portland)
fig.show()

In [None]:
words=[]
for sentence in df['name'].astype("str"):
    s=sentence.split(" ")
    words.extend(s)
stopwords=set(STOPWORDS)
words=" ".join(words)                                    

In [None]:
wordcloud=WordCloud(stopwords=stopwords,min_font_size=10).generate(words)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud)
plt.axis('off')

In [None]:
from tensorflow.keras.preprocessing.text import one_hot,text_to_word_sequence,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,LSTM,Dropout,Bidirectional
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error
from tensorflow.keras.optimizers import RMSprop
from sklearn.ensemble import RandomForestRegressor
import nltk
import re 

In [None]:
df['nlp_text']=df['name']+" "+df['room_type']
df['nlp_text']=df['nlp_text'].astype('str').apply(lambda x:x.lower())  #converting to lower case
df['nlp_text']=[re.sub('[^A-Za-z]'," ",x) for x in df['nlp_text']] #removing characters other than alphabets

In [None]:
def remove_stopwords(x):
    h=[]
    x=x.split(" ")
    STOP=stopwords.words("english")
    h=[j for j in x if j not in STOP]
    return " ".join(h)
df['nlp_text']=df['nlp_text'].apply(lambda x:remove_stopwords(x)) #removing the most common words 

In [None]:
s=set()
for x in df['nlp_text']:
    words=x.split(" ")
    for y in words:
        s.add(y)
print("Total number of unique words :",len(s)) 

In [None]:
lengths=[len(x.split(" ")) for x in df['nlp_text']] 
sns.distplot(lengths,kde=False,bins=10)

Most of the listing contain between 0 and 25 words in the name.

In [None]:
np.mean(lengths)

In [None]:
tokenizer=Tokenizer(num_words=25000)
tokenizer.fit_on_texts(s)
sequences=tokenizer.texts_to_sequences(df['nlp_text'])
input_len=10 #keeping only the first 10 words 
sequences=pad_sequences(sequences,input_len,padding='post',truncating='post')

In [None]:
model=Sequential()
model.add(Embedding(25000,64,input_length=input_len))
model.add(Flatten())
model.add(Dense(480,activation='relu'))
model.add(Dense(1))

In [None]:
test=sequences[120000:]
train=sequences[:120000]
y_train=df['price'][:120000]
y_test=df['price'][120000:]

In [None]:
model.compile(metrics='mae',optimizer='adam',loss='mse')
model.fit(train,y_train,epochs=5,batch_size=150,validation_split=0.3)

In [None]:
test_set=df.iloc[120000:]
test_set['predicted']=model.predict(test)

In [None]:
mean_absolute_error(test_set['price'],test_set['predicted'])

In [None]:
df2=df[['minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','room_type','city']]

In [None]:
dummy=pd.get_dummies(df2[['room_type','city']])
df2=pd.merge(df2,dummy,right_index=True,left_index=True)

In [None]:
df2.drop(['room_type','city'],axis=1,inplace=True)
y=df['price']
df2.replace({np.nan:0},inplace=True)

In [None]:
X_train=df2.iloc[:120000]
X_test=df2.iloc[120000:]
y_train=df['price'][:120000]
y_test=df['price'][120000:]

In [None]:
forest=RandomForestRegressor(max_depth=10,n_estimators=250)
forest.fit(X_train,y_train)

In [None]:
test_set['forest_predicted']=forest.predict(X_test)

In [None]:
mean_absolute_error(test_set['forest_predicted'],y_test)

In [None]:
test_set['average_prediction']=0.5*test_set['forest_predicted']+0.5*test_set['predicted']
mean_absolute_error(test_set['average_prediction'],test_set['price'])

Hence we can say that the actual price per night for the listing could be 26 more or less than the predicted value.

In [None]:
test_set.iloc[:15][['price','predicted','forest_predicted','average_prediction']]

In [None]:
mean_squared_error(test_set['average_prediction'],test_set['price'])**0.5

In [None]:
test_set.to_csv("output1.csv",index=False)