In [None]:
import pandas as pd
# Use the fully-qualified option name: the bare 'max_columns' pattern matches more than
# one option on newer pandas releases and raises an OptionError there.
pd.set_option('display.max_columns', 100)
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from collections import Counter
from wordcloud import WordCloud
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from scipy.spatial.distance import cdist
import plotly.graph_objects as go
import folium
import pickle
warnings.filterwarnings('ignore')

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
# Locate the listings file explicitly instead of reusing the `dirname` / `filename`
# variables leaked by the directory-listing loop: those point at whichever file
# os.walk happened to visit last, which is not guaranteed to be this dataset.
DATA_FILENAME = 'AB_US_2020.csv'
csv_candidates = sorted(
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
    if fname == DATA_FILENAME
)
# Fall back to a copy in the working directory when the Kaggle input folder is absent.
data_path = csv_candidates[0] if csv_candidates else DATA_FILENAME
data = pd.read_csv(data_path)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Dropping Duplicate Values
# You can choose any subset of features you want.
data = data.drop_duplicates(subset = ['host_id', 'host_name', 'latitude', 'longitude', 'room_type'])

In [None]:
data.isnull().sum()

In [None]:
plt.figure(figsize = (15, 10))
sns.heatmap(data.isnull().T)
plt.show()

- There are many null values in "neighbourhood_group"
- "last_review" and "reviews_per_month" show null values in the same pattern (they are missing for the same rows)
- "name" and "host_name" have a few null values

## price column

In [None]:
# Split the listings by whether any column is missing. `axis` is passed by keyword
# because the positional form any(1) is rejected by pandas 2.x.
has_missing = data.isnull().any(axis = 1)
null_df = data[has_missing]
not_null_df = data[~has_missing]

# Raw price (left) versus log1p(price) (right): the log transform tames the long tail.
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.distplot(data['price'])

fig.add_subplot(1,2,2)
sns.distplot(np.log1p(data['price']))
plt.xlabel('Log price')

plt.show()

In [None]:
data['log_price'] = np.log1p(data['price'])

In [None]:
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.distplot(np.log1p(data['price']))
plt.title("Overall Price distribution")

fig.add_subplot(1,2,2)
sns.distplot(np.log1p(null_df['price']), label = 'data with null values')
sns.distplot(np.log1p(not_null_df['price']), label = 'data without null values')
plt.legend()
plt.title("Price Distribution among the samples")
plt.show()

In [None]:
# Column names grouped by storage type: object columns are treated as categorical,
# every other dtype as numeric.
numeric_features = [column for column, dtype in data.dtypes.items() if dtype != object]
categorical_features = [column for column, dtype in data.dtypes.items() if dtype == object]

## name column

In [None]:
# handling Null values in "name"
print("Null values before : ", data['name'].isnull().sum())
data['name'] = data['name'].fillna('NULL')
print("Null values after : ", data['name'].isnull().sum())

In [None]:
# Preprocessing Name feature
def preprocess_name(rows):
    """Normalise a listing name into lower-case, space-separated alphanumeric text.

    Parameters
    ----------
    rows : object
        A single value of the "name" column; it is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned name. Common abbreviations are expanded, punctuation is turned
        into spaces or removed, and runs of whitespace are collapsed to one space.
        The result is not stripped, so it may start or end with a single space.
    """
    sentence = str(rows).lower()
    # Expand abbreviations only when they are not part of a longer word; a plain
    # substring replace would turn "brooklyn" into "bedroomooklyn" and "adapt" into
    # "adapartment". A preceding digit is allowed so that "2br" becomes "2bedroom".
    sentence = re.sub(r'(?<![a-z])apt(?![a-z])', 'apartment', sentence)
    sentence = re.sub(r'(?<![a-z])w/', 'with ', sentence)
    sentence = re.sub(r'(?<![a-z])br(?![a-z])', 'bedroom', sentence)
    sentence = re.sub(r'bedrms', 'bedroom', sentence)
    sentence = re.sub(r'blck', 'block', sentence)
    sentence = re.sub(r'univs', 'university', sentence)
    # Pad the replacement so "bed&breakfast" does not collapse into one token.
    sentence = re.sub(r'&', ' and ', sentence)
    # Separators become spaces. The hyphen is escaped so the class lists exactly
    # + - / | instead of forming an accidental "+ to /" character range.
    sentence = re.sub(r'[+\-/|]', ' ', sentence)
    sentence = re.sub(r'\'', '', sentence)
    sentence = re.sub(r'‚òö', '', sentence)
    sentence = re.sub(r'[!#\"~*)(,.:;?]', ' ', sentence)
    # Drop everything that is not an ASCII letter, digit or whitespace.
    sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

data['preprocessed_name'] = data['name'].apply(preprocess_name)

In [None]:
# Deriving basic count based features from "name"
# str.split() without an argument ignores leading, trailing and repeated whitespace,
# so padding left by the cleaning step is not counted as an extra (empty) word and
# an empty name has a word count of 0 instead of 1.
data['name_char_length'] = data['preprocessed_name'].str.strip().str.len()
data['name_word_length'] = data['preprocessed_name'].str.split().str.len()

In [None]:
# checking the presence of any relation between the above features with the price
# x / y are passed by keyword: recent seaborn releases reject positional vectors here.
fig = plt.figure(figsize = (16, 15))

fig.add_subplot(3,2,1)
sns.scatterplot(x = data['name_char_length'], y = data['price'])

fig.add_subplot(3,2,2)
sns.scatterplot(x = data['name_word_length'], y = data['price'])

fig.add_subplot(3,2,3)
sns.distplot(data['name_char_length'])
plt.title("name_char_length Distribution")

fig.add_subplot(3,2,4)
sns.distplot(data['name_word_length'])
plt.title('name_word_length Distribution')

fig.add_subplot(3,2,5)
sns.scatterplot(x = data['name_char_length'], y = data['log_price'])

fig.add_subplot(3,2,6)
sns.scatterplot(x = data['name_word_length'], y = data['log_price'])


plt.tight_layout()
plt.show()

In [None]:
# lets see the distribution of words in name feature
english_stopwords = stopwords.words('english')
# The empty string is treated as a stop word so that empty tokens produced by
# splitting on single spaces are discarded together with the real stop words.
english_stopwords.append('')
# Membership is tested once per token, so use a set instead of scanning the list.
stopword_set = set(english_stopwords)

name_words = [
    word
    for element in data['preprocessed_name'].values
    for word in element.split(' ')
    if word not in stopword_set
]

In [None]:
print("Total no of words : ", len(name_words))
print("Total unique words : ", len(set(name_words)))

In [None]:
# WordCloud
name_reqs = {i[0] : i[1] for i in Counter(name_words).most_common(100)}
plt.figure(figsize = (16, 8))
wordcloud = WordCloud(width = 1200, height = 800, background_color = 'white').generate_from_frequencies(name_reqs)
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()

In [None]:
data['log_name_char_length'] = np.log1p(data['name_char_length'])
data['log_name_word_length'] = np.log1p(data['name_word_length'])

In [None]:
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.distplot(data['log_name_char_length'])

fig.add_subplot(1,2,2)
sns.distplot(data['log_name_word_length'])

plt.show()

In [None]:
# Keep only the words that appear in at least 1000 listing names (min_df is a
# document-frequency threshold, not a total occurrence count) and count how many
# times each of them appears in every sample
count = CountVectorizer(stop_words = english_stopwords, min_df = 1000)
name_count_features = count.fit_transform(data['preprocessed_name'])

# Dense frame with one column per retained word. vocabulary_ maps word -> column
# index, so it is inverted here to label the columns with the words themselves.
name_count_features = pd.DataFrame(name_count_features.toarray())
count_vocabs = {v : k for k,v in count.vocabulary_.items()}
name_count_features.columns = name_count_features.columns.map(count_vocabs)

In [None]:
name_count_features.sum(axis = 'rows').sort_values(ascending = False).head(10)

In [None]:
data['important_name_word_count'] = name_count_features.sum(axis = 'columns').values
data['not_important_name_word_count'] = data['name_word_length'] - data['important_name_word_count']

In [None]:
# All non-stop-word tokens of the cleaned names, in order of appearance.
# A set makes the per-token stop-word check constant time instead of a list scan.
stopword_set = set(english_stopwords)
names = [
    word
    for sentence in data['preprocessed_name'].values
    for word in sentence.split(' ')
    if word not in stopword_set
]

In [None]:
imp_words = [word[0] for word in Counter(names).most_common(30)]

In [None]:
# Work on an explicit copy: indicator columns are added to this frame afterwards, and
# writing into a slice of `data` triggers pandas' chained-assignment warning.
temp_df = data[["preprocessed_name", "price", "log_price"]].copy()

In [None]:
# One 0/1 indicator column per frequent word. Names are tokenised once and matched on
# whole tokens; a plain substring test would flag "room" for every "bedroom".
name_tokens = temp_df['preprocessed_name'].str.split().apply(set)
for word in imp_words:
    temp_df[word] = name_tokens.apply(lambda tokens, target = word: 1 if target in tokens else 0)

In [None]:
# Mean and median price of the listings whose name contains each frequent word.
words = {
    'word': list(imp_words),
    'mean_price': [temp_df[temp_df[word] == 1]['price'].mean() for word in imp_words],
    'median_price': [temp_df[temp_df[word] == 1]['price'].median() for word in imp_words],
}

In [None]:
words = pd.DataFrame(words)

In [None]:
# Mean (top) and median (bottom) price per frequent word, each compared with the
# corresponding statistic over all listings. x / y are passed by keyword because
# recent seaborn releases reject positional vectors.
fig = plt.figure(figsize = (16, 10))

fig.add_subplot(2,1,1)
sns.barplot(x = words['word'], y = words['mean_price'], label = "average price of the listings with words")
plt.axhline(data['price'].mean(), linestyle = ":", label = "mean price of all the listings")
plt.xticks(rotation = 45)
plt.title("Plot showing average price of listings with most frequent words")
plt.legend()

fig.add_subplot(2,1,2)
sns.barplot(x = words['word'], y = words['median_price'], label = "median price of the listings with words")
plt.axhline(data['price'].median(), linestyle = ":", label = "median price of all the listings")
plt.xticks(rotation = 45)
plt.title("Plot showing Median price of listings with most frequent words")
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize = (16, 5))
sns.distplot(np.log1p(data['price']), label = "Price distribution of overall dataset")
sns.distplot(np.log1p(words['mean_price']), label = "Average Price distribution records with frequent words")
sns.distplot(np.log1p(words['median_price']), label = "Median Price distribution of records with frequent words")
plt.legend()
plt.grid(linestyle = ":")
plt.xlabel("Price Distribution")
plt.title("Price Distribution of Listings having top 30 most frequent words in name vs Overall data")
plt.show()

In [None]:
del temp_df
del words

## host_id column

In [None]:
# Number of listings per host, and the mean price of all listings whose host owns
# that same number of properties.
listings_per_host = data.groupby('host_id')['id'].count()
host = listings_per_host.to_dict()
data['Properties_owned'] = data['host_id'].map(host)

price_by_portfolio_size = data.groupby('Properties_owned')['price'].mean()
host_to_price = price_by_portfolio_size.to_dict()
data['Host_to_Price'] = data['Properties_owned'].map(host_to_price)

In [None]:
# Price against the number of properties owned by the host (left) and the mean price
# per portfolio size (right). x / y are passed by keyword for newer seaborn releases.
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.scatterplot(x = data['Properties_owned'], y = data['price'])
plt.xlabel("No of Properties Owned")
plt.grid(linestyle = ":")

fig.add_subplot(1,2,2)
sns.scatterplot(x = data['Properties_owned'], y = data['Host_to_Price'])
plt.xlabel('No of Properties Owned')
plt.ylabel('Average Price')
plt.title("Average Price set up by host over all the listings")
plt.grid(linestyle = ":")
plt.show()

## neighbourhood_group column

In [None]:
neighbors = data[['neighbourhood_group', 'price']]

In [None]:
# Build a new frame instead of assigning into `neighbors`, which is a slice of `data`
# (in-place assignment there raises pandas' chained-assignment warning).
neighbors = neighbors.assign(neighbourhood_group = neighbors['neighbourhood_group'].fillna('Unknown'))

In [None]:
# Mean price per neighbourhood group, in order of first appearance of each group.
group_names = neighbors['neighbourhood_group'].unique().tolist()
nbrs = {
    'neighbors': list(group_names),
    'mean_price': [
        neighbors[neighbors['neighbourhood_group'] == group]['price'].mean()
        for group in group_names
    ],
}

In [None]:
nbrs = pd.DataFrame(nbrs)
nbrs = nbrs.sort_values('mean_price', ascending = False).reset_index(drop = True)

In [None]:
# Mean price per neighbourhood group against the overall mean and median price.
# x / y are passed by keyword because recent seaborn releases reject positional vectors.
plt.figure(figsize = (16, 5))
sns.barplot(x = nbrs['neighbors'], y = nbrs['mean_price'], label = 'mean price as per neighbourhood')
plt.axhline(data['price'].mean(), label = "overall mean price", linestyle = ":")
plt.axhline(data['price'].median(), label = "overall median price", linestyle = ":", color = 'r')
plt.xticks(rotation = 90)
plt.title("Average price of listing as per the Neighbourhood")
plt.legend()
plt.show()

In [None]:
plt.figure(figsize = (16, 5))
# `nbrs` was built with missing groups filled as 'Unknown'; apply the same fill here so
# that group, if it is among the top 5, selects its rows instead of an empty frame.
group_labels = data['neighbourhood_group'].fillna('Unknown')
for nbr in nbrs['neighbors'].values.tolist()[:5]:
    # Label every curve so the legend identifies the neighbourhood groups.
    sns.distplot(np.log1p(data.loc[group_labels == nbr, 'price']), hist = False, label = nbr)
sns.distplot(np.log1p(data['price']), label = "Overall Price Distribution")
plt.legend()
plt.grid(linestyle = ":")
plt.title("Price Distribution of top 5 Neighbourhood groups vs Overall Price Distribution")
plt.show()

In [None]:
del neighbors
del nbrs

## neighbourhood column

In [None]:
# Collect the neighbourhood values that are purely numeric codes, as strings.
numbers = []
for v in data['neighbourhood'].values:
    try:
        numbers.append(int(v))
    except (TypeError, ValueError, OverflowError):
        # Not an integer code (a regular name or a missing value): skip it.
        # Only conversion failures are caught so unrelated errors still surface.
        continue

numbers = [str(e) for e in numbers]

In [None]:
# Every space-separated token of the neighbourhood names, minus the numeric codes.
# The codes are put in a set so each token is checked in constant time.
numeric_codes = set(numbers)
areas = [
    token
    for element in data['neighbourhood'].values
    for token in element.split(' ')
    if token not in numeric_codes
]

In [None]:
# Explicit copy: indicator columns are added to this frame afterwards, and writing
# into a slice of `data` triggers pandas' chained-assignment warning.
temp_df = data[['neighbourhood', 'price']].copy()

In [None]:
imp_areas = [word[0] for word in Counter(areas).most_common(30)]

In [None]:
# One 0/1 indicator column per frequent area token. Neighbourhoods are split on single
# spaces (the same tokenisation used to build `areas`) and matched on whole tokens, so
# a short token is not flagged merely because it occurs inside a longer word.
neighbourhood_tokens = temp_df['neighbourhood'].str.split(' ').apply(set)
for area in imp_areas:
    temp_df[area] = neighbourhood_tokens.apply(lambda tokens, target = area: 1 if target in tokens else 0)

In [None]:
# Mean and median price of the listings whose neighbourhood contains each frequent area.
areas = {
    'area': list(imp_areas),
    'mean_price': [temp_df[temp_df[area] == 1]['price'].mean() for area in imp_areas],
    'median_price': [temp_df[temp_df[area] == 1]['price'].median() for area in imp_areas],
}

In [None]:
areas = pd.DataFrame(areas)

In [None]:
# Mean (top) and median (bottom) price per frequent area, compared with the overall
# mean and median. x / y are passed by keyword for newer seaborn releases, and the
# tick rotation is set once per panel (the earlier 45 degree call was overridden).
fig = plt.figure(figsize = (16, 10))

fig.add_subplot(2,1,1)
sns.barplot(x = areas['area'], y = areas['mean_price'], label = "average price of the listings with areas")
plt.axhline(data['price'].mean(), linestyle = ":", label = "average price of all the listings", color = "r")
plt.axhline(data['price'].median(), linestyle = ":", label = "median price of all the listings")
plt.title("Plot showing average price of listings with most frequent areas")
plt.legend()
plt.xticks(rotation = 90)

fig.add_subplot(2,1,2)
sns.barplot(x = areas['area'], y = areas['median_price'], label = "median price of the listings with areas")
plt.axhline(data['price'].mean(), linestyle = ":", label = "average price of all the listings", color = "r")
plt.axhline(data['price'].median(), linestyle = ":", label = "median price of all the listings")
plt.title("Plot showing Median price of listings with most frequent areas")
plt.legend()
plt.xticks(rotation = 90)

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize = (16, 5))
sns.distplot(np.log1p(data['price']), label = "Price distribution of overall dataset")
sns.distplot(np.log1p(areas['mean_price']), label = "Average Price distribution records with frequent areas")
sns.distplot(np.log1p(areas['median_price']), label = "Median Price distribution of records with frequent areas")
plt.legend()
plt.grid(linestyle = ":")
plt.xlabel("Price Distribution")
plt.title("Price Distribution of Listings having top 30 most frequent areas in Neighbourhood vs Overall data")
plt.show()

In [None]:
del temp_df
del areas

## name and neighbourhood columns

In [None]:
temp_df = data[['name', 'neighbourhood', 'price']]

In [None]:
def return_words(rows, words = None):
    """Count how many of the frequent words occur in a listing name.

    Parameters
    ----------
    rows : object
        A single listing name; it is converted with ``str`` and lower-cased.
    words : iterable of str, optional
        Words to look for. Defaults to the notebook-level ``imp_words`` list.

    Returns
    -------
    int
        Number of entries of ``words`` that appear in the name as whole
        alphanumeric tokens.
    """
    if words is None:
        words = imp_words
    # Match on lower-cased whole tokens: the frequent words are lower case, and a
    # substring test would count "room" for every "bedroom".
    tokens = set(re.findall(r'[a-z0-9]+', str(rows).lower()))
    return sum(1 for word in words if word in tokens)

# Count the frequent words on the cleaned names: `imp_words` was derived from
# `preprocessed_name` (lower case, abbreviations expanded), so matching against the raw
# "name" column would miss capitalised words. `temp_df` shares its index with `data`,
# so the result aligns row by row.
temp_df['imp_words'] = data['preprocessed_name'].apply(return_words)


def return_areas(rows):
    """Return how many entries of ``imp_areas`` occur as substrings of ``rows``."""
    return sum(1 for area in imp_areas if area in rows)

temp_df['imp_areas'] = temp_df['neighbourhood'].apply(return_areas)

In [None]:
# Mean / median PRICE for every (frequent-word count, frequent-area count) pair.
# `values` is stated explicitly: without it every column of temp_df is aggregated, so
# the heatmap is not restricted to price and the text columns cannot be averaged.
mean_intersection = pd.pivot_table(data = temp_df,
                                   values = 'price',
                                   index = 'imp_words',
                                   columns = 'imp_areas',
                                   fill_value = 0,
                                   aggfunc = 'mean')

median_intersection = pd.pivot_table(data = temp_df,
                                     values = 'price',
                                     index = 'imp_words',
                                     columns = 'imp_areas',
                                     fill_value = 0,
                                     aggfunc = 'median')

fig = plt.figure(figsize = (16, 8))

# Tick labels come from the pivot's own index / columns, so they stay correct whatever
# range of counts is present in the data.
fig.add_subplot(1,2,1)
sns.heatmap(mean_intersection, annot = True, fmt = 'g')
plt.ylabel("Count of frequent words in name")
plt.xlabel("Count of frequent areas in neighbourhood")
plt.title("Mean price of listings which have most frequent words and areas")

fig.add_subplot(1,2,2)
sns.heatmap(median_intersection, annot = True, fmt = 'g')
plt.ylabel("Count of frequent words in name")
plt.xlabel("Count of frequent areas in neighbourhood")
plt.title("Median price of listings which have most frequent words and areas")

plt.show()

- From the above heatmaps we can assume that a better neighbourhood commands a better price than a flashy name does
- The prices are highest for listings whose neighbourhood contains 4 of the most frequently occurring area names

In [None]:
# price distribution of listings belonging to famous neighbourhoods
temp_df['log_price'] = np.log1p(temp_df['price'])

fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.boxplot(x = 'imp_areas', y = 'log_price', data = temp_df)

fig.add_subplot(1,2,2)
sns.violinplot(x = 'imp_areas', y = 'log_price', data = temp_df)

plt.show()

## host_name column

In [None]:
# Individual or group renters

In [None]:
temp_df = data[['host_name', 'price']]

In [None]:
temp_df['host_name'] = temp_df['host_name'].fillna('Unknown')

In [None]:
# Markers in a host name that suggest a business or a group rather than one person.
to_look_for = ['&', '+', 'inc', 'and', 'family', 'rentals', 'studio', '/', 'hostel', 'landing', '-', 'by']
# Alphabetic markers must match a whole word (optionally plural); as plain substrings
# 'and', 'by' and 'inc' would also match first names such as "Andrew", "Abby" or
# "Vincent". Symbols are still matched anywhere in the name.
marker_patterns = [
    re.compile(r'\b' + re.escape(elem) + r's?\b') if elem.isalpha() else re.compile(re.escape(elem))
    for elem in to_look_for
]
# NOTE: despite its name, this list holds the number of group markers found per host,
# so a positive value means the host looks like a group, not an individual.
individual_renters = []
for name in temp_df['host_name'].values:
    lowered_name = str(name).lower()
    value = sum(1 for pattern in marker_patterns if pattern.search(lowered_name))
    individual_renters.append(value)

In [None]:
temp_df['individual_renters'] = individual_renters
temp_df['individual_renters'] = temp_df['individual_renters'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# The 'individual_renters' flag is 1 when the host name contains a group marker, so
# flag 0 denotes individuals and flag 1 denotes groups / non-individuals.
print("Overall Average Price of a listing : ", temp_df['price'].mean())
print("Overall Average Price of a listing by an Individual : ", temp_df[temp_df['individual_renters'] == 0]['price'].mean())
print("Overall Average Price of a listing by an Non-Individual / Group : ", temp_df[temp_df['individual_renters'] == 1]['price'].mean())

In [None]:
# The 'individual_renters' flag is 1 when the host name contains a group marker, so
# flag 0 is plotted as individuals and flag 1 as non-individuals.
plt.figure(figsize = (16, 5))
sns.distplot(np.log1p(temp_df['price']), hist = False, label = "Overall Price Distribution")
sns.distplot(np.log1p(temp_df[temp_df['individual_renters'] == 0]['price']), hist = False, label = "Overall Individual listings Price Distribution")
sns.distplot(np.log1p(temp_df[temp_df['individual_renters'] == 1]['price']), hist = False, label = "Overall Non-Individual listings Price Distribution")
plt.legend()
plt.grid(linestyle = ":")
plt.xlabel("log_price")
plt.show()

- The price distribution of the listings whose host name contains no group marker (flag 0) is similar to the overall price distribution
- The price distribution of the listings whose host name contains a group marker (flag 1) is also similar, but the curve is smoother.

In [None]:
# How many Individual and Non-Individual listings are there?
# Sort by flag value so position 0 is flag 0 (individuals) and position 1 is flag 1
# (non-individuals), matching the tick labels below regardless of which is larger.
bar = temp_df['individual_renters'].value_counts().sort_index()
print(bar)
sns.barplot(x = bar.index, y = bar.values / temp_df.shape[0])
plt.xticks(ticks = [0, 1], labels = ['Individuals', 'Non-Individuals'])
plt.show()

- More than 80% of the listings belong to Individual listings, while less than 20% of the listings belong to Groups or Non-Individuals

In [None]:
del temp_df

## latitude and longitude columns

In [None]:
temp_df = data[['latitude', 'longitude', 'price']]

In [None]:
# Elbow curve for the coordinate clustering. Only latitude / longitude are used,
# matching the features the final model is fitted on; including the unscaled price
# column would let price dominate the inertia.
K = np.arange(1, 11)
errors = []
coordinates = temp_df[['latitude', 'longitude']]

for k in K:
    # Fixed seed so the curve is reproducible between runs.
    kmeans = MiniBatchKMeans(n_clusters = k, batch_size = 1000, random_state = 10)
    kmeans.fit(coordinates)
    errors.append(kmeans.inertia_)

In [None]:
plt.figure(figsize = (16, 5))
plt.grid(linestyle = ":")
plt.plot(K, errors)
plt.show()

In [None]:
kmeans = MiniBatchKMeans(n_clusters = 3, batch_size = 1000, max_iter = 1000).fit(temp_df[['latitude', 'longitude']])
cluster_labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

In [None]:
temp_df['cluster_coordinate_label'] = cluster_labels

In [None]:
def get_cluster_center(rows, centers = None):
    """Return the centre of the cluster a row was assigned to.

    Parameters
    ----------
    rows : mapping
        A row exposing a ``'cluster_coordinate_label'`` entry (for example the
        Series handed over by ``DataFrame.apply(..., axis = 'columns')``).
    centers : sequence of coordinate rows, optional
        Cluster centres indexed by label. Defaults to the notebook-level
        ``cluster_centers`` array.

    Returns
    -------
    tuple or int
        The centre coordinates as a tuple, or ``-1`` when the label does not refer
        to an existing cluster.
    """
    if centers is None:
        centers = cluster_centers
    label = rows['cluster_coordinate_label']
    try:
        # A row-wise apply upcasts the label to float, so convert before indexing.
        index = int(label)
    except (TypeError, ValueError):
        return -1
    # Works for any number of clusters instead of a hard-coded 0 / 1 / 2 chain.
    if index != label or not 0 <= index < len(centers):
        return -1
    return tuple(centers[index])
temp_df['nearest_cluster_center'] = temp_df.apply(get_cluster_center, axis = 'columns')

In [None]:
# Listings (default colour) and cluster centres (red), longitude on the x axis so the
# plot is oriented like a map. Column 0 of `cluster_centers` is latitude and column 1
# is longitude, the order the model was fitted with. x / y are passed by keyword
# because recent seaborn releases reject positional vectors.
plt.figure(figsize = (8, 5))
sns.scatterplot(x = temp_df['longitude'], y = temp_df['latitude'])
sns.scatterplot(x = cluster_centers[:, 1], y = cluster_centers[:, 0], color = 'red')
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.show()

In [None]:
# Interactive map centred on the first cluster centre. "CartoDB positron" is a built-in
# tile set; the former "Stamen Toner" tiles are no longer served to folium by default.
m = folium.Map(location=[cluster_centers[0][0], cluster_centers[0][1]], tiles="CartoDB positron", zoom_start = 3.5)
# Never ask for more rows than exist, otherwise sample() raises on small frames.
sample = temp_df.sample(min(1000, len(temp_df)), random_state = 10)

# Cluster centres: red outline with yellow fill.
for center_lat, center_long in cluster_centers:
    folium.CircleMarker(
        location = [center_lat, center_long],
        color = 'red',
        fill_color = 'yellow'
    ).add_to(m)

# Sampled listings as small circles. CircleMarker draws a vector circle and has no
# icon, so the former `icon=` argument was dropped.
lat, long = sample['latitude'].values, sample['longitude'].values
for point_lat, point_long in zip(lat, long):
    folium.CircleMarker(
        location = [point_lat, point_long],
        radius = 3
    ).add_to(m)

m

In [None]:
del sample
del temp_df

## room_type column

In [None]:
temp_df = data[['room_type', 'price']]

In [None]:
# Number of listings per room type (left) and their share of all listings (right).
# x / y are passed by keyword because recent seaborn releases reject positional vectors.
bar = temp_df['room_type'].value_counts()
fig = plt.figure(figsize = (15, 5))

fig.add_subplot(1,2,1)
sns.barplot(x = bar.index, y = bar.values)
plt.grid(linestyle = ":")

fig.add_subplot(1,2,2)
sns.barplot(x = bar.index, y = bar.values / temp_df.shape[0])
plt.grid(linestyle = ":")

plt.show()

In [None]:
# price distribution among different type of options
plt.figure(figsize = (16, 5))
# One density curve per room type, drawn in a loop instead of four copied calls.
for room_type in ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room']:
    sns.distplot(np.log1p(temp_df[temp_df['room_type'] == room_type]['price']), label = room_type, hist = False)
sns.distplot(np.log1p(temp_df['price']), label = 'Overall')
plt.grid(linestyle = ":")
# Without an explicit legend the labelled curves cannot be told apart.
plt.legend()
plt.show()

In [None]:
del temp_df

## neighbourhood and room_type columns

In [None]:
# How price influence with room type and good neighbourhood

In [None]:
temp_df = data[['neighbourhood', 'room_type', 'price']]

In [None]:
temp_df['imp_areas'] = temp_df['neighbourhood'].apply(return_areas)

In [None]:
# Mean / median PRICE per (frequent-area count, room type). `values` is stated
# explicitly: without it every column of temp_df is aggregated, and the resulting
# table contains more than the four room-type price columns.
mean_intersection = pd.pivot_table(data = temp_df, values = 'price', index = 'imp_areas',
                                   columns = 'room_type', fill_value = 0, aggfunc = 'mean')

median_intersection = pd.pivot_table(data = temp_df, values = 'price', index = 'imp_areas',
                                     columns = 'room_type', fill_value = 0, aggfunc = 'median')

In [None]:
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.heatmap(mean_intersection, annot = True, fmt = 'g')
plt.xticks(ticks = [0.5, 1.5, 2.5, 3.5], labels = ["Entire home/apt", 'Hotel room', 'Private room', 'Shared room'], rotation = 45)
plt.xlabel('Room Type')
plt.ylabel("Rating of the Neighbourhood")

fig.add_subplot(1,2,2)
sns.heatmap(median_intersection, annot = True, fmt = 'g')
plt.xticks(ticks = [0.5, 1.5, 2.5, 3.5], labels = ["Entire home/apt", 'Hotel room', 'Private room', 'Shared room'], rotation = 45)
plt.xlabel('Room Type')
plt.ylabel("Rating of the Neighbourhood")

plt.show()

- Ratings of the neighbourhoods go from 0 to 4 where 0 is the lowest and 4 being the highest.
- From the above heatmaps we can see that the most priced options are those which have a neighbourhood rating of 4 and the listings are of Entire home/apt

In [None]:
del temp_df

## minimum_nights column

In [None]:
# How the prices variate with respect to the minimum no of nights

In [None]:
data['minimum_nights'].isnull().sum()

In [None]:
print("Min no of days in minimum nights : ", data['minimum_nights'].min())
print("Max no of days in minimum nights : ", data['minimum_nights'].max())
print("Mean no of days in minimum nights : ", data['minimum_nights'].mean())
print("Std no of days in minimum nights : ", data['minimum_nights'].std())

In [None]:
# how many records have minimum nights = 100000000
data[data['minimum_nights'] == 100000000]

- There is only one record with a minimum_nights value of 100000000, therefore we can consider it an outlier.

In [None]:
temp_df = data[['minimum_nights', 'room_type', 'price', 'neighbourhood']]
temp_df = temp_df[temp_df['minimum_nights'] != 100000000]

In [None]:
print("Min no of days in minimum nights : ", temp_df['minimum_nights'].min())
print("Max no of days in minimum nights : ", temp_df['minimum_nights'].max())
print("Mean no of days in minimum nights : ",temp_df['minimum_nights'].mean())
print("Std no of days in minimum nights : ", temp_df['minimum_nights'].std())

- Now we get more reasonable numbers, but the maximum is still 1250 nights, and 1250 / 365 is approximately 3.4 years
- Not many people are going to rent a place for 3.4 years, so we will count it as an outlier as well.
- To avoid these outliers completely, we will only take the values within the 0 - 99 quantile range

In [None]:
# quantile from 0 to 100
np.quantile(temp_df['minimum_nights'], np.arange(0, 1.1, 0.1))

In [None]:
# quantile from 90 to 100
# linspace gives exactly 11 points ending at 1.0; a float-step arange can overshoot
# 1.0 through rounding, which np.quantile rejects.
np.quantile(temp_df['minimum_nights'], np.linspace(0.9, 1.0, 11))

In [None]:
# quantile from 95 to 100
# linspace includes the 100% endpoint, which arange(0.95, 1.0, 0.01) leaves out.
np.quantile(temp_df['minimum_nights'], np.linspace(0.95, 1.0, 6))

In [None]:
# quantile from 99 to 100
# linspace includes the 100% endpoint, which arange(0.99, 1.0, 0.001) leaves out.
np.quantile(temp_df['minimum_nights'], np.linspace(0.99, 1.0, 11))

- The 99.9th quantile is found to be 91, so stays longer than a year are extremely rare; we will ignore any value of more than 366 for minimum nights

In [None]:
temp_df = temp_df[temp_df['minimum_nights'] <= 366]

In [None]:
print("Min no of days in minimum nights : ", temp_df['minimum_nights'].min())
print("Max no of days in minimum nights : ", temp_df['minimum_nights'].max())
print("Mean no of days in minimum nights : ",temp_df['minimum_nights'].mean())
print("Std no of days in minimum nights : ", temp_df['minimum_nights'].std())

now we have some reasonable number to proceed with our analysis

In [None]:
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.distplot(temp_df['minimum_nights'])
plt.grid(linestyle = ":")

fig.add_subplot(1,2,2)
sns.distplot(np.log1p(temp_df['minimum_nights']))
plt.grid(linestyle = ":")

plt.show()

- we can see that most of the values are in between 1 to 100 and some little bumps at around 250 and 350

In [None]:
# Density of minimum_nights for every room type, with the overall distribution on top.
plt.figure(figsize = (16, 5))
for room_type in ("Entire home/apt", "Private room", "Shared room", "Hotel room"):
    nights_for_type = temp_df[temp_df['room_type'] == room_type]['minimum_nights']
    sns.distplot(nights_for_type, label = room_type, hist = False)
sns.distplot(temp_df['minimum_nights'], label = 'Overall')
plt.grid(linestyle = ":")
plt.legend()
plt.title("Minimum Nights distribution among different types of room type options")
plt.show()

In [None]:
# Same comparison as before, on log1p(minimum_nights) to compress the long tail.
plt.figure(figsize = (16, 5))
for room_type in ("Entire home/apt", "Private room", "Shared room", "Hotel room"):
    nights_for_type = temp_df[temp_df['room_type'] == room_type]['minimum_nights']
    sns.distplot(np.log1p(nights_for_type), label = room_type, hist = False)
sns.distplot(np.log1p(temp_df['minimum_nights']), label = 'Overall')
plt.grid(linestyle = ":")
plt.legend()
plt.title("Minimum Nights distribution among different types of room type options")
plt.show()

In [None]:
fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.boxplot(x = temp_df['room_type'], y = temp_df['minimum_nights'])

fig.add_subplot(1,2,2)
sns.violinplot(x = temp_df['room_type'], y = temp_df['minimum_nights'])

plt.show()

- A large number of listings require a minimum stay of less than 10 days, based on the above boxplot and violinplot.
- Private room and Entire home/apt are the most popular choices for short term as well as long term stays.
- There are fewer Hotel room listings for long stays.
- Many of the Shared room listings are aimed at people who want to stay somewhere around 50-60 days

In [None]:
# Price against minimum nights. x / y are passed by keyword because recent seaborn
# releases reject positional vectors.
plt.figure(figsize = (15, 6))
sns.scatterplot(x = temp_df['minimum_nights'], y = temp_df['price'])
plt.grid(linestyle = ":")
plt.show()

- Listings with short minimum stays tend to be costlier than those with long minimum stays.

In [None]:
temp_df['minimum_nights_bins'] = pd.cut(temp_df['minimum_nights'], bins = [0, 50, 100, 150, 200, 250, 300, 350, 400])

In [None]:
# Mean minimum_nights and mean price per (minimum-nights bin, room type). The pivot is
# computed once with the aggregated columns named explicitly, and each measure is then
# selected by name instead of by column position (the former .iloc[:, :4] / [:, 4:]
# split silently assumed exactly four room types and two numeric columns).
nights_and_price = pd.pivot_table(data = temp_df,
                                  values = ['minimum_nights', 'price'],
                                  index = 'minimum_nights_bins',
                                  columns = 'room_type',
                                  aggfunc = 'mean')

min_nights = nights_and_price['minimum_nights']
mean_price = nights_and_price['price']

In [None]:
plt.figure(figsize = (15, 5))
sns.heatmap(min_nights, annot = True, fmt = 'g')
plt.xticks(ticks = [0.5, 1.5, 2.5, 3.5], labels = ["Entire home/apt", 'Hotel room', 'Private room', 'Shared room'])
plt.xlabel('Room type')
plt.title('Average no of days for different type of room options found in listings')
plt.show()

In [None]:
plt.figure(figsize = (15, 5))
sns.heatmap(mean_price, annot = True, fmt = 'g')
plt.xticks(ticks = [0.5, 1.5, 2.5, 3.5], labels = ["Entire home/apt", 'Hotel room', 'Private room', 'Shared room'])
plt.xlabel('Room type')
plt.title('Average Price for different type of room options found in listings')
plt.show()

In [None]:
del temp_df

## last_review, number_of_reviews, reviews_per_month columns

In [None]:
temp_df = data[['number_of_reviews', 'last_review', 'reviews_per_month', 'price', 'room_type']]

In [None]:
# Keep only the rows without any missing value. dropna() replaces the positional
# any(1) call, which pandas 2.x rejects, and returns a new frame that can be extended
# without chained-assignment warnings.
temp_df = temp_df.dropna()

In [None]:
temp_df['last_review'] = pd.to_datetime(temp_df['last_review'])

In [None]:
# Number of Reviews

fig = plt.figure(figsize = (16, 5))

fig.add_subplot(1,2,1)
sns.distplot(temp_df['number_of_reviews'])

fig.add_subplot(1,2,2)
sns.distplot(np.log1p(temp_df['number_of_reviews']))

plt.show()

In [None]:
# Calendar features of the last review date, computed with the vectorised .dt accessor
# (requires 'last_review' to be a datetime column) instead of per-row lambdas.
review_dates = temp_df['last_review'].dt
temp_df['last_review_year'] = review_dates.year
temp_df['last_review_month'] = review_dates.month
temp_df['last_review_day'] = review_dates.day
temp_df['last_review_dayofWeek'] = review_dates.dayofweek
temp_df['last_review_dayofYear'] = review_dates.dayofyear
# Timestamp.week / Timestamp.weekofyear were removed in pandas 2.0; the ISO calendar
# week is the same value. Both column names are kept for compatibility.
iso_week = review_dates.isocalendar().week.astype('int64')
temp_df['last_review_weekofYear'] = iso_week
temp_df['last_review_week'] = iso_week
temp_df['last_review_daysofMonth'] = review_dates.days_in_month

In [None]:
# How number of reviews affect the price of listings
# x / y are passed by keyword because recent seaborn releases reject positional vectors.
plt.figure(figsize = (16, 5))
sns.scatterplot(x = temp_df['number_of_reviews'], y = temp_df['price'])

- We cannot see much relation of number of reviews with respect to the price.

In [None]:
# Lets see if we can find out some pattern in the last review

In [None]:
# Price summary statistics grouped by each calendar part of the last review date.
price_stats = ['count', 'min', 'max', 'mean', 'std', 'median']
year_mean, month_mean, day_mean, week_mean = (
    temp_df.groupby(date_part)['price'].agg(price_stats)
    for date_part in ('last_review_year', 'last_review_month', 'last_review_day', 'last_review_week')
)

In [None]:
fig = go.Figure(
    data = [
        go.Bar(x = year_mean.index, y = year_mean['mean'], name = 'Mean'),
        go.Bar(x = year_mean.index, y = year_mean['median'], name = 'Median'),
        go.Bar(x = year_mean.index, y = year_mean['std'], name = 'Std'),
    ]
)

fig.update_layout(barmode ='group', 
                  plot_bgcolor = 'white',
                  title = "Statistics of Price with respect to the Year of Last Review",
                  xaxis_title = "Year",
                  yaxis_title = 'Price',
                  legend_title = 'Statistics')
fig

In [None]:
fig = go.Figure(
    data = [
        go.Bar(x = week_mean.index, y = week_mean['mean'], name = 'Mean'),
        go.Bar(x = week_mean.index, y = week_mean['median'], name = 'Median'),
        go.Bar(x = week_mean.index, y = week_mean['std'], name = 'Std'),
    ]
)

fig.update_layout(barmode ='group', 
                  plot_bgcolor = 'white',
                  title = "Statistics of Price with respect to the week of the Year of Last Review",
                  xaxis_title = "Week",
                  yaxis_title = 'Price',
                  legend_title = 'Statistics')
fig

In [None]:
fig = go.Figure(
    data = [
        go.Bar(x = month_mean.index, y = month_mean['mean'], name = 'Mean'),
        go.Bar(x = month_mean.index, y = month_mean['median'], name = 'Median'),
        go.Bar(x = month_mean.index, y = month_mean['std'], name = 'Std'),
    ]
)

fig.update_layout(barmode ='group', 
                  plot_bgcolor = 'white',
                  title = "Statistics of Price with respect to the Month of the Year of Last Review",
                  xaxis_title = "Month",
                  yaxis_title = 'Price',
                  legend_title = 'Statistics')
fig

In [None]:
fig = go.Figure(
    data = [
        go.Bar(x = day_mean.index, y = day_mean['mean'], name = 'Mean'),
        go.Bar(x = day_mean.index, y = day_mean['median'], name = 'Median'),
        go.Bar(x = day_mean.index, y = day_mean['std'], name = 'Std'),
    ]
)

fig.update_layout(barmode ='group', 
                  plot_bgcolor = 'white',
                  title = "Statistics of Price with respect to the Day of the Month of Last Review",
                  xaxis_title = "Day",
                  yaxis_title = 'Price',
                  legend_title = 'Statistics')
fig

- After looking at the above figures, there is not much of a pattern; the reviews appear to be given at random.

In [None]:
rooms = temp_df.groupby('room_type')['number_of_reviews'].agg(['count', 'mean', 'median'])

In [None]:
from plotly.subplots import make_subplots

# Slice labels come from the index of `rooms`; colours are assigned by slice position.
labels = rooms.index.tolist()
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

# One row, two 'domain' cells so each can hold a pie trace.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

# Left pie uses the 'count' column of `rooms`, right pie uses the 'mean' column.
count_pie = go.Pie(labels=labels, values=rooms['count'].values.tolist(), name="Counts", hole = 0.3)
mean_pie = go.Pie(labels=labels, values=rooms['mean'].values.tolist(), name="Mean Values", hole = 0.3)
fig.add_trace(count_pie, row=1, col=1)
fig.add_trace(mean_pie, row=1, col=2)

# Text placed in the hole of each donut.
center_notes = [
    dict(text='Count', x=0.18, y=0.5, font_size=20, showarrow=False),
    dict(text='%age', x=0.82, y=0.5, font_size=20, showarrow=False),
]
fig.update_layout(
    title = "Percentage Share of Room Types",
    legend_title = "Room Type",
    annotations = center_notes,
)

# Kept as the last expression so the notebook renders the returned figure.
slice_style = dict(colors=colors, line=dict(color='#000000', width=2))
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=slice_style)

- The above plots show the distribution of room types and each room type's share of the average number of reviews.
- Although "Entire home/apt" has a 69% share in terms of record count, its share of the average number of reviews is only 45.56%.
- The share of reviews given to a room type does not depend on the number of records present for that particular room type.

## reviews_per_month column

In [None]:
fig = plt.figure(figsize = (16, 5))

# Left panel: raw reviews_per_month; right panel: the same column after log1p.
raw_ax = fig.add_subplot(1, 2, 1)
sns.distplot(temp_df['reviews_per_month'], ax = raw_ax)

log_ax = fig.add_subplot(1, 2, 2)
sns.distplot(np.log1p(temp_df['reviews_per_month']), ax = log_ax)

In [None]:
fig = plt.figure(figsize = (16, 5))

# Box plot (left) and violin plot (right) of reviews_per_month.
# The series is passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, so the keyword pins the values to the x-axis
# and works on older releases as well.
fig.add_subplot(1,2,1)
sns.boxplot(x = temp_df['reviews_per_month'])

fig.add_subplot(1,2,2)
sns.violinplot(x = temp_df['reviews_per_month'])

In [None]:
fig = plt.figure(figsize = (16, 5))

# reviews_per_month split by room_type: box plot on the left, violin plot on the right.
box_ax = fig.add_subplot(1, 2, 1)
sns.boxplot(x = 'room_type', y = 'reviews_per_month', data = temp_df, ax = box_ax)

violin_ax = fig.add_subplot(1, 2, 2)
sns.violinplot(x = 'room_type', y = 'reviews_per_month', data = temp_df, ax = violin_ax)

In [None]:
# Scatter of reviews_per_month (x) against price (y).
# x and y are passed by keyword: seaborn releases where they are keyword-only
# reject two positional vectors.
plt.figure(figsize = (16, 5))
sns.scatterplot(x = temp_df['reviews_per_month'], y = temp_df['price'])
plt.show()

## availability_365 column

In [None]:
# Working frame for the availability analysis.
# .copy() detaches it from `data`, so adding columns to `temp_df` later is an
# ordinary assignment rather than a write to a slice (SettingWithCopyWarning).
temp_df = data[['host_id', 'room_type', 'minimum_nights', 'availability_365', 'price']].copy()

In [None]:
# Complement of availability_365 within a 365-day year.
# `assign` returns a new frame instead of writing into a slice of `data`,
# so re-running the cell is safe and no chained-assignment warning is raised.
temp_df = temp_df.assign(not_availability_365 = 365 - temp_df['availability_365'])

In [None]:
# Assume the feature "availability_365" is the number of days out of 365 on which the listing is available.
# There are many listings that are not available at any time during the year.

In [None]:
# Split listings by availability:
#   not_available - availability_365 equals 0 and minimum_nights is positive
#   available     - availability_365 is greater than 0
zero_days_mask = (temp_df['availability_365'] == 0) & (temp_df['minimum_nights'] > 0)
some_days_mask = temp_df['availability_365'] > 0

not_available = temp_df.loc[zero_days_mask]
available = temp_df.loc[some_days_mask]

In [None]:
# Room-type frequencies among the `not_available` listings, computed once and
# reused for both the labels and the slice sizes.
room_type_counts = not_available['room_type'].value_counts()
labels = room_type_counts.index.tolist()
values = room_type_counts.values.tolist()

colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

donut = go.Pie(labels = labels, values = values, hole = 0.3)
fig = go.Figure(data = [donut])

fig.update_layout(
    title = "Percentage Share of Room Types which are not available throughout Year",
    legend_title = "Room Type",
)
fig.update_traces(
    hoverinfo = 'label+percent',
    textinfo = 'value',
    textfont_size = 20,
    marker = dict(colors = colors, line = dict(color = '#000000', width = 2)),
)
fig.show()

In [None]:
# Per room type: count, mean, median and standard deviation of availability_365
# over the `available` listings.
rooms = (
    available
    .groupby('room_type')['availability_365']
    .agg(['count', 'mean', 'median', 'std'])
)

In [None]:
# Grouped bar chart: one trace per availability statistic (columns of `rooms`),
# plotted against the room types in the index of `rooms`.
room_traces = [
    go.Bar(x = rooms.index, y = rooms[stat], name = stat.capitalize())
    for stat in ('mean', 'median', 'std')
]
fig = go.Figure(data = room_traces)

fig.update_layout(
    title = "Statistics of Availability with respect to the Room type options",
    xaxis_title = "Room Type",
    yaxis_title = 'Availability',
    legend_title = 'Statistics',
    barmode = 'group',
    plot_bgcolor = 'white',
)
fig

In [None]:
fig = plt.figure(figsize = (16, 5))

# availability_365 split by room_type: box plot on the left, violin plot on the right.
box_ax = fig.add_subplot(1, 2, 1)
sns.boxplot(x = 'room_type', y = 'availability_365', data = temp_df, ax = box_ax)

violin_ax = fig.add_subplot(1, 2, 2)
sns.violinplot(x = 'room_type', y = 'availability_365', data = temp_df, ax = violin_ax)

In [None]:
# Pairwise correlation between the numeric availability features and price,
# rendered as an annotated heatmap.
corr_columns = ['minimum_nights', 'availability_365', 'price']
corr = temp_df[corr_columns].corr()

plt.figure(figsize = (8,5))
sns.heatmap(corr, annot = True, fmt = 'g')
plt.yticks(rotation = 0)  # keep the row labels horizontal
plt.show()

- Neither minimum_nights nor availability_365 is of much use with respect to price.

## city column

In [None]:
# Let's see how city affects the price of listings

In [None]:
# Working frame for the city analysis.
# .copy() detaches it from `data`, so adding columns to `temp_df` later is an
# ordinary assignment rather than a write to a slice (SettingWithCopyWarning).
temp_df = data[['neighbourhood', 'room_type', 'city', 'price']].copy()

In [None]:
# Number of records per city, most frequent first (value_counts default order).
city = temp_df['city'].value_counts()

plt.figure(figsize = (16,5))
# x and y are passed by keyword: seaborn releases where they are keyword-only
# reject two positional vectors.
sns.barplot(x = city.index, y = city.values)
plt.xticks(rotation = 90)
plt.xlabel('City')
plt.ylabel("Frequency")
plt.title("Frequency distribution of City over all the records")
plt.show()

In [None]:
# Record counts for every (room_type, city) pair, shown as an annotated heatmap.
room_city_counts = pd.crosstab(temp_df['room_type'], temp_df['city'])

plt.figure(figsize = (22, 4))
sns.heatmap(room_city_counts, annot = True, fmt = 'd')

- Most of the records belong to New York City, Los Angeles, and Hawaii, and within them most records tend to be for Entire home/apt and Private room.

In [None]:
# Let's see the average price of the different room types available in each city

In [None]:
# Mean price for every (city, room_type) pair.
# values='price' restricts the aggregation to the price column; without it pandas
# attempts to average the remaining columns too (including text such as
# `neighbourhood`), which recent pandas releases reject with a TypeError.
# aggfunc='mean' is the string form pandas recommends over passing np.mean.
pivot = pd.pivot_table(data = temp_df, values = 'price', index = 'city',
                       columns = 'room_type', aggfunc = 'mean')

plt.figure(figsize = (12, 10))
sns.heatmap(pivot, annot = True, fmt = 'g')
# Tick labels are taken from the pivot itself so they always match the plotted
# columns, instead of relying on a hard-coded list of room types.
plt.xticks(ticks = np.arange(len(pivot.columns)) + 0.5, labels = pivot.columns)
plt.xlabel("Room type")
plt.ylabel("City")
plt.show()

In [None]:
# Mean, median and standard deviation of price for each city in the overall dataset.
city = (
    temp_df
    .groupby('city')['price']
    .agg(['mean', 'median', 'std'])
)

In [None]:
# Grouped bar chart: one trace per price statistic (columns of `city`),
# plotted against the cities in the index of `city`.
city_traces = [
    go.Bar(x = city.index, y = city[stat], name = stat.capitalize())
    for stat in ('mean', 'median', 'std')
]
fig = go.Figure(data = city_traces)

fig.update_layout(
    title = "Statistics of Price with respect to the City",
    xaxis_title = "City",
    yaxis_title = 'Price',
    legend_title = 'Statistics',
    barmode = 'group',
    plot_bgcolor = 'white',
)
fig

In [None]:
# Map each neighbourhood through `return_areas` (defined elsewhere in the notebook)
# and store the result as `imp_areas`.
# `assign` returns a new frame instead of writing into a slice of `data`,
# so re-running the cell is safe and no chained-assignment warning is raised.
temp_df = temp_df.assign(imp_areas = temp_df['neighbourhood'].apply(return_areas))

In [None]:
# Mean price for every (city, imp_areas) pair.
# values='price' restricts the aggregation to the price column; without it pandas
# attempts to average the remaining columns too (including text such as
# `neighbourhood`), which recent pandas releases reject with a TypeError.
# aggfunc='mean' is the string form pandas recommends over passing np.mean.
pivot = pd.pivot_table(data = temp_df, values = 'price', index = 'city',
                       columns = 'imp_areas', aggfunc = 'mean')

plt.figure(figsize = (12, 10))
sns.heatmap(pivot, annot = True, fmt = 'g')
# Tick labels are taken from the pivot itself so they always match the plotted
# columns, instead of relying on a hard-coded list of rating values.
plt.xticks(ticks = np.arange(len(pivot.columns)) + 0.5, labels = pivot.columns)
plt.xlabel("Neighbourhood rating")
plt.ylabel("City")
plt.show()

- Los Angeles with the best neighbourhood rating has the highest average pricing among all the listings