In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Introduction

In [None]:
from IPython.display import display, Image

# Render the banner picture stored alongside the dataset.
banner = Image(filename='/kaggle/input/final-data-science-project/bostonairbnb.png')
display(banner)

#### Airbnb is an online marketplace for arranging and offering lodging, primarily homestays, or tourism experiences. Since 2008, guests and hosts have used Airbnb to expand their travel possibilities. It offers a unique and personalized way of experiencing the world and socializing with new people.

#### In this dataset we will look at the listing activity and metrics in Boston, MA for 2019. This data includes all needed information to find out more about hosts, geographical availability, necessary metrics to make predictions and draw conclusions.
#### We will try to model the price to try and learn about Airbnb rentals. Some of the insights from the analysis:

#### 1. What features affect the price? By how much?
#### 2. The popular description words in different price groups.

import the relevant libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import data as dataframe

In [None]:
# Load the listings table from the Kaggle input directory.
listings_path = '/kaggle/input/final-data-science-project/listings.csv'
df = pd.read_csv(listings_path)

# Exploratory Data Analysis

copy the dataframe

In [None]:
# Work on an independent (deep) copy so that `df` keeps the data exactly as loaded.
clean_listing = df.copy(deep=True)

dropping the columns that are not useful for analysis

In [None]:
# Columns this notebook treats as not useful for the analysis.
columns = [
    'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'listing_url', 'host_url',
    'host_thumbnail_url', 'host_picture_url', 'country', 'country_code', 'neighbourhood',
    'smart_location', 'street', 'market', 'first_review', 'last_review', 'state',
    'calendar_last_scraped', 'calendar_updated', 'city', 'scrape_id', 'last_scraped', 'space',
    'host_neighbourhood', 'neighborhood_overview', 'host_listings_count', 'zipcode',
    'is_location_exact', 'host_location', 'host_total_listings_count',
]
clean_listing = clean_listing.drop(columns=columns)

drop columns with more than half of the values missing

In [None]:
# Fraction of missing values per column; anything above one half is discarded.
missing_share = clean_listing.isnull().mean()
columns = clean_listing.columns[missing_share > 0.5]
clean_listing = clean_listing.drop(columns=columns)

fixing data type errors, extracting numbers and changing them to int64

In [None]:
# Rate and money columns are stored as text; keep only the leading integer part.
columns = ['host_response_rate', 'host_acceptance_rate', 'price', 'cleaning_fee', 'extra_people']
for col in columns:
    # Remove thousands separators first: otherwise a value such as "$1,250.00"
    # is cut at the comma and parsed as 1 instead of 1250.
    digits = (
        clean_listing[col]
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+)', expand=False)
    )
    # Cast via float64 (missing values become NaN) and then to nullable Int64.
    clean_listing[col] = digits.astype('float64').astype('Int64')
clean_listing[columns].dtypes

changing data type for host_since

In [None]:
# Parse the host sign-up date strings into datetimes.
clean_listing['host_since'] = pd.to_datetime(clean_listing['host_since'])
# Reference date for host tenure (31 December 2019), written in ISO format so
# the parser cannot read it month-first or warn about a day-first string.
temp = pd.Timestamp('2019-12-31')

In [None]:
# Number of days between each host's sign-up date and the reference date `temp`.
clean_listing['host_len'] = clean_listing['host_since'].map(lambda since: (temp - since).days)
clean_listing = clean_listing.drop(columns='host_since')

extracting number of amenities

In [None]:
def _count_amenities(raw):
    """Count the entries of a raw amenities string.

    Assumes the comma-separated, brace-wrapped form ``{TV,"Cable TV",Wifi}``.
    An empty set (``{}``) or a missing value yields 0.
    """
    if not isinstance(raw, str):
        return 0
    # Strip BOTH braces (the earlier code removed '{' twice and never '}'),
    # drop the quoting, then count only non-empty entries.
    entries = raw.strip('{}').replace('"', '').split(',')
    return sum(1 for entry in entries if entry.strip())


clean_listing['n_amenities'] = clean_listing['amenities'].apply(_count_amenities)
clean_listing = clean_listing.drop(columns='amenities')

In [None]:
# Keep only the numeric columns (plain ints, nullable Int64 and floats).
numeric_kinds = ['int', 'Int64', 'float']
df_num = clean_listing.select_dtypes(include=numeric_kinds)

filling NaN

In [None]:
def fill_nan(x):
    """Return column *x* with missing entries replaced by its rounded mean."""
    return x.fillna(round(x.mean()))


# Impute column by column, then discard the coordinates and work in floats.
df_num = df_num.apply(fill_nan, axis=0)
df_num = df_num.drop(columns=['latitude', 'longitude']).astype(float)

visualizing the price

In [None]:
# Histogram of listing prices with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 8))
sns.distplot(df_num['price'], bins=50, kde=True, ax=ax)
ax.set_title('Listing Price Distribution')
ax.set_xlabel('Price($)')
# With a KDE drawn, the histogram is normalised to a density, not a percentage.
ax.set_ylabel('Density');

we can see that some listings have a price of $1, which is not plausible

we will also exclude some outliers of the price

In [None]:
# Keep rows whose price is not the implausible $1 and lies below the 423 cut-off.
keep_rows = df_num['price'].ne(1) & df_num['price'].lt(423)
df_num = df_num[keep_rows]

building the correlation matrix

In [None]:
corrmat = df_num.corr()
# Mask the diagonal and everything above it so only the lower triangle is drawn.
mask = np.triu(np.ones(corrmat.shape, dtype=bool))
with sns.axes_style('white'):
    f, ax = plt.subplots(figsize=(20, 20))
    ax = sns.heatmap(
        corrmat,
        mask=mask,
        vmax=.3,
        square=True,
        annot=True,
        fmt='.2f',
    )

price according to bathrooms vs bedrooms visualized as heatmap

In [None]:
# Mean price for each (bathrooms, bedrooms) combination, laid out as a grid
# with bathrooms on the rows (largest at the top) and bedrooms on the columns.
mean_price_grid = (
    df_num.groupby(['bathrooms', 'bedrooms'])['price']  # average only the column we need
    .mean()
    .reset_index()
    # pivot arguments are passed by keyword: newer pandas no longer accepts them positionally.
    .pivot(index='bathrooms', columns='bedrooms', values='price')
    .sort_index(ascending=False)
)
plt.figure(figsize=(10, 10))
sns.heatmap(mean_price_grid, cmap='coolwarm', fmt='.0f', annot=True, linewidths=0.5);

handling the categorical features

dropping a few description lines for future analysing

In [None]:
# Free-text / identity columns set aside here; `description` is reused later for the word analysis.
text_columns = ['name', 'summary', 'description', 'experiences_offered', 'host_about',
                'host_verifications', 'host_name']
df_cat = clean_listing.select_dtypes(include=['object']).drop(columns=text_columns)

##### The interaction, house_rules, access and transit fields can add substance to a listing post, and providing this information may attract more people and potentially increase the value of the listing. Since there are a lot of missing values in these columns, we categorize each of them as True (information provided) or False (no information).

finding the not NaN rows

In [None]:
# New flag column -> free-text column whose presence it records.
flag_sources = {
    'with_inter': 'interaction',
    'with_access': 'access',
    'with_rules': 'house_rules',
    'with_transit': 'transit',
}
for flag, source in flag_sources.items():
    # True where the host filled the field in; stored as object so it is dummy-encoded later.
    df_cat[flag] = df_cat[source].notna().astype('object')
df_cat = df_cat.drop(columns=['transit', 'access', 'interaction', 'house_rules'])

In [None]:
# Append the price as the last column, then apply the same price filter used for df_num.
df_cat = pd.concat([df_cat, clean_listing['price']], axis=1)
valid_price = (df_cat['price'] != 1) & (df_cat['price'] < 423)
df_cat = df_cat[valid_price]

getting dummy values for categorical features

In [None]:
# Every column except the last one (the price appended in the previous cell) is one-hot encoded.
categorical_only = df_cat.iloc[:, :-1]
df_cat_dummies = pd.get_dummies(categorical_only, dummy_na=False)

concatenate the model

In [None]:
# Modelling table: numeric features side by side with the encoded categorical ones (aligned on the row index).
model_parts = [df_num, df_cat_dummies]
df_mod = pd.concat(model_parts, axis='columns')

# Popular Word with WordCloud

## Popular Word

extracting the dataframe for word analysis

In [None]:
# Description text together with the price, restricted to the same price range as the model data.
df_word = clean_listing[['description', 'price']]
in_range = (df_word['price'] != 1) & (df_word['price'] < 423)
df_word = df_word[in_range]

segment price into 4 groups

In [None]:
# Four price bands; each label names the upper percentile of its band.
bin_name = ['25%', '50%', '75%', '100%']
bin_edges = [0, 84, 145, 205, 417]
# NOTE(review): pd.cut leaves values outside the edges as NaN, so prices above 417
# (the filter allows up to 422) get no band - confirm 417 is the intended top edge.
price_bands = pd.cut(df_word['price'], bins=bin_edges, labels=bin_name)
df_word = df_word.assign(price_bin=price_bands)

getting the post content for each price group

In [None]:
# p_words[i] holds the concatenated description text of price band i (index into bin_name).
p_words = {}
for i, band in enumerate(bin_name):
    # Drop missing descriptions first; casting NaN to str would inject the word "nan".
    descriptions = (
        df_word.loc[df_word['price_bin'] == band, 'description']
        .dropna()
        .astype(str)
    )
    # One trailing space per description, which is exactly what appending every
    # space-separated word plus ' ' produces, but built in a single linear join.
    p_words[i] = ''.join(text + ' ' for text in descriptions)

getting the top 20 most popular words for each price group

In [None]:
from collections import Counter
import string

# Words ignored when ranking; the empty string absorbs the gaps left by repeated spaces.
stop_words = ['a', 'the', 'and', 'is', 'of', 'with', '', 'in', 'i', 'you', 'for', 'on', 'at', 'this', 'there',
              'that', 'to', 'from', 'have', 'has', 'we', 'your', 'my', 'are', 'be', 'or', 'will', 'our', 'it',
              'located', 'all', 'as']
# Translation table that deletes every ASCII punctuation character.
strip_punctuation = str.maketrans('', '', string.punctuation)

counter = {}
occur = {}
for i in range(len(bin_name)):
    tokens = p_words[i].lower().translate(strip_punctuation).split(' ')
    counter[i] = Counter(token for token in tokens if token not in stop_words)
    # The 20 most frequent (word, count) pairs of this price band.
    occur[i] = counter[i].most_common(20)

# One two-column frame (word, count) per price band, lowest band first.
df1, df2, df3, df4 = (
    pd.DataFrame(occur[band]).rename(columns={0: 'word', 1: 'count'})
    for band in range(4)
)

In [None]:
# Side-by-side table of the top (word, count) pairs, one column per price band.
band_titles = {0: '0%-25%', 1: '25%-50%', 2: '50%-75%', 3: '75%-100%'}
pd.DataFrame(occur).rename(columns=band_titles)

visualizing

In [None]:
# One bar chart of the most common words per price band, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(20, 14))
panels = [(df1, '0%-25%'), (df2, '25%-50%'), (df3, '50%-75%'), (df4, '75%-100%')]
for ax, (top_words, title) in zip(axes.flat, panels):
    sns.barplot(data=top_words, x='word', y='count', palette='RdBu', ax=ax)
    ax.tick_params(axis='x', labelrotation=60)
    # The words themselves label the axis, so the redundant 'word' label is cleared on every panel.
    ax.set_xlabel('')
    ax.set_title(title, fontsize=12)

fig.suptitle('Popular Words in Different Price Group');

#### There is a lot of overlap between the price groups. One interesting thing is that the higher-priced listings contain 'restaurants' more frequently, while lower-priced listings emphasize transportation, using words such as 'bus' and 'line'.

# Model Build

### Feature selection and Linear models

importing the relevant sklearn libraries

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures

getting feature and label

In [None]:
# Target: the listing price. Features: every other column of the modelling table.
y = df_mod['price']
X = df_mod.drop(columns='price')

### Linear Regression

we noticed that we have too many features so we decided to take the top 30 with the RFE model

In [None]:
# Candidate feature counts to try: 5, 10, ..., 95.
n = np.arange(5, 100, 5)
model = LinearRegression()
# Hold-out R^2 obtained with each candidate count, in the same order as `n`.
R2 = []

for num in n:
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # no longer accept it as a positional argument.
    rfe = RFE(model, n_features_to_select=num)

    # NOTE(review): the selector is fitted on all rows, including those held out
    # just below, so the scores are somewhat optimistic - consider fitting it on
    # the training split only.
    rfe.fit(X, y)

    # keep only the columns RFE selected
    X_new = X.iloc[:, rfe.support_]

    # split train and test
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

    # fit a fresh linear model and score it on the hold-out part
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    pred = lr.predict(X_test)
    R2.append(metrics.r2_score(y_test, pred))

In [None]:
# Position of the best hold-out R^2 and the feature count that produced it.
ind = np.array(R2).argmax()
f_n = n[ind]
# n_features_to_select is passed by keyword: current scikit-learn releases
# no longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=f_n)
X_rfe = rfe.fit_transform(X, y)
model.fit(X_rfe, y)
# Same selection expressed as a DataFrame so the column names are kept.
X_new = X.iloc[:, rfe.support_]
print('{} features are selected'.format(f_n))

split train and test

In [None]:
# 70/30 split with a fixed seed so the partition is reproducible.
final_split = train_test_split(X_new, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = final_split

fitting the data

In [None]:
# Ordinary least squares on the training part; predictions for the held-out part.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

evaluating the model

In [None]:
# Hold-out error and explained variance (R^2 reported as a percentage).
r_square = metrics.r2_score(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
print(
    'Mean squared error is {}'.format(mse),
    'R^2 is {}'.format(r_square*100),
    sep='\n',
)

creating a function to visualize the distribution of prediction and test

In [None]:
def distributionplot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel-density curves of two samples on a single figure.

    Parameters
    ----------
    RedFunction, BlueFunction : array-like
        Samples drawn as the red and the blue curve respectively.
    RedName, BlueName : str
        Legend labels for the red and the blue curve.
    Title : str
        Figure title.

    The figure is shown and then closed; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    # kdeplot is the direct replacement for the deprecated distplot(hist=False).
    sns.kdeplot(RedFunction, color='r', label=RedName, ax=ax)
    sns.kdeplot(BlueFunction, color='b', label=BlueName, ax=ax)
    ax.set_title(Title)
    ax.set_xlabel('Price($)')
    # Draw the legend explicitly so the two labels are always visible.
    ax.legend()
    plt.show()
    plt.close(fig)

visualizing the results

In [None]:
# Both curves come from the held-out split (y_test and the predictions for X_test),
# so the labels and title say "Test" rather than "Train".
distributionplot(y_test, pred, 'Actual Values (Test)', 'Predicted Values (Test)',
                 'Distribution Plot of Predicted Values Using Test Data vs Test Data Distribution')

### Ridge Regression

ridge model

In [None]:
# Regularisation strengths to compare.
alpha = [0.0001, 0.001, 0.1, 1, 10, 20]
# R^2 on the held-out and on the training data, one entry per alpha.
RR_square, RR_train = [], []
for strength in alpha:
    RidgeModel = Ridge(alpha=strength).fit(X_train, y_train)
    RR_square.append(RidgeModel.score(X_test, y_test))
    RR_train.append(RidgeModel.score(X_train, y_train))

visualizing

In [None]:
# R^2 against alpha for the held-out and the training data.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(alpha, RR_square, label='Validation data')
ax.plot(alpha, RR_train, 'r', label='Training Data')
ax.set_xlabel('alpha')
ax.set_ylabel('R^2')
ax.set_ylim(0.5, 0.8)
ax.legend();

choosing a ridge

In [None]:
# Ridge regression with the chosen strength (alpha=5), evaluated on the held-out data.
RidgeModel = Ridge(alpha=5).fit(X_train, y_train)
rr_pred = RidgeModel.predict(X_test)
mse = metrics.mean_squared_error(y_test, rr_pred)
ridge_r2_percent = RidgeModel.score(X_test, y_test)*100
print('Mean squared error is {}'.format(mse))
print('R^2 is: {}'.format(ridge_r2_percent))

visualizing the results

In [None]:
# Both curves come from the held-out split (y_test and the ridge predictions for X_test),
# so the labels and title say "Test" rather than "Train".
distributionplot(y_test, rr_pred, 'Actual Values (Test)', 'Predicted Values (Test)',
                 'Distribution Plot of Predicted Values Using Test Data vs Test Data Distribution')

#### We noticed that the ridge only slightly improved the model

# Conclusion

In [None]:
# One row per selected feature: its name ('Var') and its linear-regression coefficient ('Coef').
coef_df = pd.DataFrame({'Var': X_new.columns, 'Coef': list(lr.coef_)})

#### 1. According to our model, every additional bedroom will cost an extra 28.5 dollars, while each additional bathroom will cost an extra 2.3 dollars. Each additional guest will cost 6 dollars more.

#### 2. Real bed costs more than alternative options.

#### 3. Neighbourhood areas strongly bias the price, which will be plotted next.

#### 4. Superhosts' listings are 6 dollars more expensive on average.

#### 5. Property types strongly affect the price. For example: a camp/RV will cost 150 dollars less on average but a boat will cost 39.7 dollars more on average.

#### 6. Entire room on average will cost 76.6 more than a shared-room.

#### 7. In general, the stricter the cancellation policy is, the more expensive the listing will be. Listings with a strict cancellation policy may tend to be more expensive and popular.

visualizing predicted price VS actual price

In [None]:
# Scatter of ridge predictions against the actual held-out prices, with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(x=y_test, y=rr_pred, color=sns.color_palette()[0], ax=ax)
ax.set_xlim(0, 430)
ax.set_title('Predict Model')
ax.set_xlabel('Test Data')
ax.set_ylabel('Predictions');

getting the neighborhood

In [None]:
# Coefficients of the neighbourhood dummy columns, largest first.
is_neighbourhood = coef_df['Var'].str.contains('neighbourhood')
neighborhood = coef_df[is_neighbourhood].sort_values(by='Coef', ascending=False)
# Keep the third '_'-separated piece of each column name as the display label.
neighborhood['Var'] = [name.split('_')[2] for name in neighborhood['Var']]

visualizing the neighborhood effect on price

In [None]:
# Horizontal bars: one per neighbourhood, length = its model coefficient.
fig, ax = plt.subplots(figsize=(12, 12))
sns.barplot(data=neighborhood, x='Coef', y='Var', palette='Blues_d', ax=ax)
ax.set_title("Neighborhoods' Effects on Predicted Price")
ax.set_xlabel('Relative Price($)')
ax.set_ylabel('Neighborhood')

getting the property type

In [None]:
# Coefficients of the property-type dummy columns, largest first.
is_property_type = coef_df['Var'].str.contains('property_type')
property_type = coef_df[is_property_type].sort_values(by='Coef', ascending=False)
# Keep the third '_'-separated piece of each column name as the display label.
property_type['Var'] = [name.split('_')[2] for name in property_type['Var']]

visualizing the property type effect on price

In [None]:
# Horizontal bars: one per property type, length = its model coefficient.
fig, ax = plt.subplots(figsize=(12, 12))
sns.barplot(data=property_type, x='Coef', y='Var', palette='Blues_d', ax=ax)
ax.set_title("Property Type's Effects on Predicted Price")
ax.set_xlabel('Relative Price($)')
ax.set_ylabel('Property Type')

# Another section of data visualization

### WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# The word-cloud package's built-in stop words plus the custom list defined for the word counts.
stopwords = set(STOPWORDS).union(stop_words)

generating wordcloud for high price group

In [None]:
# Word cloud of the description text of the highest price band (p_words[3]).
cloud_options = dict(
    background_color='white',
    max_words=1000,
    contour_width=3,
    contour_color='firebrick',
    stopwords=stopwords,
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate(p_words[3])

fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

generating wordcloud for low price group

In [None]:
# Word cloud of the description text of the lowest price band (p_words[0]).
cloud_options = dict(
    background_color='white',
    max_words=1000,
    contour_width=3,
    contour_color='firebrick',
    stopwords=stopwords,
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate(p_words[0])

fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

#### Looking at the wordcloud, it is interesting that the more expensive listings contain more information about the comfort, such as access and city and restaurant, and they also emphasize the location (south end, back bay) more frequently.