# Airbnb Price Prediction Challenge
- Mohamed Mokhtar

# **Data Downloading**

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: ggplot theme plus larger titles, labels, lines,
# markers and tick labels, and a default figure size.
plt.style.use('ggplot')
plt.rcParams.update({
    'axes.titlesize': 24,
    'axes.labelsize': 16,
    'lines.linewidth': 3,
    'lines.markersize': 10,
    'ytick.labelsize': 16,
    'xtick.labelsize': 16,
    "figure.figsize": ('12', '8'),
})
#plt.xticks(rotation=85)


# **Data Reading**

In [None]:
# Read train.csv into the global `data` frame and preview its first rows.
data = pd.read_csv("../input/airbnb-price-prediction/train.csv")
data.head()

# **Data Exploration and Visualization**

In [None]:
# Summarise the columns of `data` (dtypes and non-null counts).
data.info()

In [None]:
# Print every distinct value of each object-typed column, skipping the
# 'zipcode' and 'neighbourhood' columns.
excluded_columns = ('zipcode', 'neighbourhood')
for column in data.columns:
    if data[column].dtype != 'object':
        continue
    if column in excluded_columns:
        continue
    print(column, "\n--------------------------")
    print(data[column].unique())
    print("\n------------------------------")

In [None]:
def bar_plot_object(title,data):
    """Draw a horizontal bar chart of value frequencies.

    ``data`` must expose ``value_counts()`` (callers in this file pass a
    DataFrame column). The order returned by ``value_counts()`` is reversed
    before the bars are drawn. ``title`` becomes the plot title.
    """
    reversed_counts = data.value_counts().iloc[::-1]
    plt.barh(list(reversed_counts.keys()), list(reversed_counts))
    plt.title(title)

In [None]:
def explore_col(column):
    """Plot and summarise one column of the global ``data`` frame.

    Draws the frequency bar chart via ``bar_plot_object``, prints the
    column's ``describe()`` output, then prints its five most frequent values.
    """
    series = data[column]
    bar_plot_object(column+" histogram", series)
    print(column, 'description\n', series.describe())
    top_five = series.value_counts()[:5].index.tolist()
    print('Top 5 frequent:', top_five)

In [None]:
def pie_col(column):
    """Draw a pie chart of value frequencies for ``column`` of the global ``data``.

    Each wedge is annotated with its share as a whole-number percentage.
    """
    frequencies = data[column].value_counts()
    frequencies.plot.pie(autopct='%1.0f%%')

In [None]:
# Set tick-label and font sizes, then explore property_type.
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 12
import matplotlib as mpl
mpl.rcParams['font.size'] = 20
explore_col('property_type')

In [None]:
# Larger y tick labels before exploring room_type.
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 17
explore_col('room_type')

In [None]:
explore_col('bed_type')

In [None]:
explore_col('cancellation_policy')

In [None]:
pie_col('cancellation_policy')

In [None]:
explore_col('city')

In [None]:
pie_col('city')

In [None]:
plt.rcParams['xtick.labelsize'] = 16

#explore_col('neighbourhood')
#pie_col('neighbourhood')
# NOTE: despite its name, this variable holds the counts of every
# neighbourhood; the "> 1000" filter is applied only where it is plotted.
neighbourhood_morethan_1000 = data.neighbourhood.value_counts()
neighbourhood_morethan_1000 [neighbourhood_morethan_1000 >1000].plot.pie(autopct='%1.0f%%')

In [None]:
# Same filtered counts as a bar chart.
neighbourhood_morethan_1000 [neighbourhood_morethan_1000 >1000].plot.bar()

In [None]:
def plt_density(column,title=None):
    """Plot the distribution of ``data[column]`` on a new 15x8 figure.

    Args:
        column: Name of the column in the global ``data`` frame to plot.
        title: Plot title; the column name is used when it is omitted.
    """
    # Identity test: ``== None`` can be overridden by an object's __eq__.
    if title is None:
        title = column
    plt.figure(figsize = (15, 8))
    # NOTE(review): seaborn has deprecated ``distplot``; confirm the installed
    # seaborn version before switching to ``histplot``/``displot``.
    sns.distplot(data[column])
    plt.title(title)
    plt.show()

In [None]:
# Listings per year of host_since. Dates are parsed as %Y-%m-%d and
# errors='coerce' makes unparseable values missing instead of raising.
years =  pd.to_datetime(data['host_since'], format='%Y-%m-%d', errors='coerce').dt.year
years.value_counts().plot.bar()

In [None]:
# Listings per year of first_review.
years =  pd.to_datetime(data['first_review'], format='%Y-%m-%d', errors='coerce').dt.year
years.value_counts().plot.bar()

In [None]:
# Listings per year of last_review.
years =  pd.to_datetime(data['last_review'], format='%Y-%m-%d', errors='coerce').dt.year
years.value_counts().plot.bar()

In [None]:
# Same host_since years as a seaborn count plot. fillna(0) places rows with
# a missing year under 0. log_price is carried along in the frame but is
# not used by the plot.
years_of_host_created = pd.DataFrame({
    'year of host created':pd.to_datetime(data['host_since'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': data['log_price']
})
plt.figure(figsize=(12,4))

sns.countplot(x="year of host created", data=years_of_host_created)
plt.title('Host since')
plt.show()

In [None]:
# Count plot of first_review years (missing years shown as 0).
years_of_first_review = pd.DataFrame({
    'year of first review':pd.to_datetime(data['first_review'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': data['log_price']
})
plt.figure(figsize=(12,4))

sns.countplot(x="year of first review", data=years_of_first_review)
plt.title('Years of first review')
plt.show()

In [None]:
# Count plot of last_review years (missing years shown as 0).
years_of_last_review = pd.DataFrame({
    'year of last review':pd.to_datetime(data['last_review'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': data['log_price']
})
plt.figure(figsize=(12,4))

sns.countplot(x="year of last review", data=years_of_last_review)
plt.title('Years of last review')
plt.show()

In [None]:
# Frequency of each accommodates value.
data.accommodates.value_counts().plot.bar()

In [None]:
# Frequency of each bathrooms value.
data.bathrooms.value_counts().plot.bar()

In [None]:
#explore_col('host_response_rate')
# Pie chart of the response-rate values that occur more than 1000 times.
host_response_rate = data.host_response_rate.value_counts()
# NOTE: `explode` is defined here but never passed to the pie call below.
explode = (0.1, 0.2, 0.1)
host_response_rate [host_response_rate > 1000].plot.pie(shadow=True, startangle=90,autopct='%1.0f%%')

In [None]:
# Frequency of each bedrooms value.
data.bedrooms.value_counts().plot.bar()

In [None]:
# Frequency of each beds value.
data.beds.value_counts().plot.bar()

In [None]:
# Distribution of the target column.
plt_density('log_price')

In [None]:
data.log_price.describe()

In [None]:
# Box plot of review_scores_rating.
data.review_scores_rating.plot.box()

In [None]:
# Line plot of number_of_reviews against the row index.
data.number_of_reviews.plot()

In [None]:
pie_col('host_has_profile_pic')

In [None]:
pie_col('host_identity_verified')

In [None]:
pie_col('instant_bookable')

In [None]:
# Composite-type attribute: each `amenities` value is stripped of braces and
# double quotes and then split on commas.
#   amenities_col[i] -> number of amenities listed in row i
#   amenities_map    -> amenity name -> total number of occurrences
from collections import Counter

amenities_col = []
amenities_map = Counter()
_amenity_wrappers = str.maketrans('', '', '{}"')
for raw_amenities in data.amenities:
    # Discard empty fragments so an empty list yields zero items. The item
    # count itself is the amenity count; subtracting one would under-count
    # every non-empty list.
    items = [item for item in raw_amenities.translate(_amenity_wrappers).split(',') if item]
    amenities_col.append(len(items))
    amenities_map.update(items)

In [None]:
# Order amenities by ascending frequency. Sorting the (name, count) pairs
# directly keeps every amenity, including those whose counts tie.
sorted_dict = dict(sorted(amenities_map.items(), key=lambda pair: pair[1]))
sorted_values = list(sorted_dict.values())

keys = sorted_dict.keys()

values = sorted_dict.values()

plt.xticks(fontsize=12)
n = 20
# The last n entries are the n most frequent amenities.
plt.barh(list(keys)[-n:], list(values)[-n:], align='center')


# **Pre-processing and Features Extraction**

In [None]:
# Summary statistics of the raw frame.
data.describe()

In [None]:
data.info()

In [None]:
# Number of missing values per column, as a horizontal bar chart.
data.isnull().sum().plot.barh()

In [None]:
# Build the modelling frame by dropping the listed columns from `data`;
# `data` itself is left unchanged because drop() returns a new frame.
cleaned_data = data.drop(['id','description','first_review','last_review','host_since','host_has_profile_pic','name','thumbnail_url','zipcode','neighbourhood'],axis=1)
cleaned_data.columns

In [None]:
# These columns contain missing values and still need to be processed
"""
 5   bathrooms               73911 non-null  float64
 10  host_identity_verified  73923 non-null  object 
 11  host_response_rate      55812 non-null  object 
 15  neighbourhood           67239 non-null  object 
 17  review_scores_rating    57389 non-null  float64
 18  bedrooms                74020 non-null  float64
 19  beds                    73980 non-null  float64
"""
cleaned_data.info()

In [None]:
# Impute missing numeric values. bathrooms, bedrooms and beds are filled
# with their column mean truncated to an integer by int();
# review_scores_rating is filled with its untruncated mean.
cleaned_data.bathrooms = cleaned_data.bathrooms.fillna(int(cleaned_data.bathrooms.mean()))
cleaned_data.bedrooms = cleaned_data.bedrooms.fillna(int(cleaned_data.bedrooms.mean()))
cleaned_data.beds = cleaned_data.beds.fillna(int(cleaned_data.beds.mean()))
cleaned_data.review_scores_rating = cleaned_data.review_scores_rating.fillna(cleaned_data.review_scores_rating.mean())

In [None]:
# Convert host_response_rate strings to numeric fractions, then fill the
# remaining gaps with the column mean.
def _rate_to_fraction(value):
    """Drop a string's last character, parse the rest as an int and divide by 100.

    Non-string values are returned unchanged. Presumably the dropped
    character is a trailing '%' -- verify against the raw data.
    """
    if not isinstance(value, str):
        return value
    return int(value[:-1]) / 100


cleaned_data.host_response_rate = cleaned_data.host_response_rate.apply(_rate_to_fraction)
mean_response_rate = cleaned_data.host_response_rate.mean()
cleaned_data.host_response_rate = cleaned_data.host_response_rate.fillna(mean_response_rate)

In [None]:
# Changing all boolean objects to 0/1
cleaned_data.host_identity_verified = cleaned_data.host_identity_verified.apply(lambda x: True if x=='t' else False)

In [None]:
cleaned_data.instant_bookable = cleaned_data.instant_bookable.apply(lambda x: True if x=='t' else False)

In [None]:
cleaned_data.info()

In [None]:
# Factorization of categorical columns
"""
 1   property_type           74111 non-null  object # 35 <-----------------
 2   room_type               74111 non-null  object # 3 
 6   bed_type                74111 non-null  object # 5
 7   cancellation_policy     74111 non-null  object # 5
 9   city                    74111 non-null  object # 6
"""
pass

In [None]:
def col_price(column):
    """Show how ``cleaned_data[column]`` relates to ``log_price``.

    Draws a count plot of the column's values, then a 12x8 vertical box plot
    of log_price per value. Returns the object produced by ``sns.boxplot``.
    """
    chart_title = column+' with log price'
    plot_data = pd.DataFrame({
        'data': cleaned_data[column],
        'log_price': cleaned_data.log_price,
    })

    sns.countplot(x="data", data=plot_data)
    plt.title(chart_title)
    plt.show()

    plt.figure(figsize=(12,8))
    box_axes = sns.boxplot(data=plot_data, orient='v', x='data', y='log_price')
    plt.title(chart_title)
    plt.show()
    return box_axes

In [None]:
col_price('room_type')

In [None]:
cleaned_data.room_type.unique()

In [None]:
cleaned_data.room_type = cleaned_data.room_type.apply(lambda x: 3 if x=='Entire home/apt' else 2 if x=='Private room' else 1)

In [None]:
col_price('bed_type')

In [None]:
cleaned_data.bed_type.unique()

In [None]:
cleaned_data.bed_type = cleaned_data.bed_type.apply(lambda x: 2 if x=='Real Bed' else 1)

In [None]:
col_price('cancellation_policy')

In [None]:
cleaned_data.cancellation_policy.unique()

In [None]:
cleaned_data.cancellation_policy = cleaned_data.cancellation_policy.apply(lambda x: 3 if x=='super_strict_60' else 2 if x=='super_strict_30' else 1)

In [None]:
col_price('city')

In [None]:
cleaned_data.city.unique()

In [None]:
cleaned_data = pd.concat([cleaned_data, pd.get_dummies(cleaned_data['city'], prefix='city')],axis=1)

In [None]:
cleaned_data = cleaned_data.drop(['city'],axis=1)

In [None]:
cleaned_data = cleaned_data.drop(['latitude',	'longitude'],axis=1)

In [None]:
cleaned_data.info()

In [None]:
cleaned_data['amenities_count'] = pd.Series(amenities_col)

In [None]:
cleaned_data = cleaned_data.drop(['amenities'], axis=1)

In [None]:
cleaned_data.review_scores_rating = cleaned_data.review_scores_rating/100

In [None]:
# Target-encode property_type: replace each category with the mean
# log_price of the rows in that category.
# NOTE(review): the means are computed over all rows, before the
# train/validation split further down, so validation targets leak into
# this feature.
types = cleaned_data.property_type.unique()
pt_map = cleaned_data.groupby('property_type')['log_price'].mean().to_dict()
# Categories ordered by ascending mean log_price. Sorting the pairs keeps
# categories whose means tie.
sorted_dict = dict(sorted(pt_map.items(), key=lambda pair: pair[1]))
sorted_values = list(sorted_dict.values())
cleaned_data.property_type = cleaned_data.property_type.map(pt_map)

In [None]:
cleaned_data.info()

In [None]:
import plotly.figure_factory as ff
corrs = cleaned_data.corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.show()

In [None]:
cleaned_data.corr()

In [None]:
## Removing correlated features and unneeded ones
#cleaned_data = cleaned_data.drop(['beds','bathrooms','bedrooms','instant_bookable','number_of_reviews','host_response_rate'],axis=1)

In [None]:
#cleaned_data = cleaned_data.drop(['host_response_rate',],axis=1)

In [None]:
import plotly.figure_factory as ff
corrs = cleaned_data.corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.show()

# **Data associations and Insights**

In [None]:
import plotly.express as px
def create_map(city):
    """Plot a sample of one city's listings on a map, coloured by log_price.

    Args:
        city: Value of the ``city`` column in the global ``data`` frame.
            'NYC' and 'LA' are sampled at 20% of their rows, any other
            city at 60%.

    The sample uses a fixed ``random_state`` so repeated calls plot the same
    rows. Points are drawn over a USGS imagery tile layer. The figure is
    displayed with ``fig.show()`` and nothing is returned.
    """
    fraction = 0.2 if city in {'NYC', 'LA'} else 0.6

    df_temp = data[data['city'] == city].sample(frac=fraction,random_state=101)
    # Round so the hover text and colour scale show two decimals.
    df_temp['log_price'] = np.round(df_temp['log_price'], 2)

    fig = px.scatter_mapbox(df_temp,
                            lat="latitude",
                            lon="longitude",
                            hover_data=["log_price"],
                            color='log_price',
                            zoom=10)

    fig.update_layout(
            title = f'Airbnb prices in {city}',
            geo_scope='usa',
            width=1000,
            height=600,
            mapbox_style="white-bg",
            mapbox_layers=[{
                 "below": 'traces',
                 "sourcetype": "raster",
                 "sourceattribution": "United States Geological Survey",
                 "source": ["https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"]
              }]
    )
    #fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    #fig.update_geos(fitbounds="locations")
    fig.show()

#city = interact(lambda x: create_map(x), x=['NYC', 'LA', 'SF', 'DC', 'Chicago', 'Boston'])  #widget
create_map('NYC')

In [None]:
import pandas as pd
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Mine frequent itemsets over three listing flags with mlxtend's apriori,
# using a minimum support of 0.07 and column names as item labels.
frequent_itemsets = apriori(cleaned_data[['cleaning_fee','host_identity_verified','instant_bookable']], min_support=0.07, use_colnames=True)
frequent_itemsets


In [None]:
property_types = data.property_type.unique()
for pt in property_types:
  s = data.property_type == pt
  print('Support of',pt,np.sum(s)/len(s))

In [None]:
arr = data.city.unique()
print(arr)
for ai in arr:
  s = data.city == ai
  print('Support of',ai,np.sum(s)/len(s))

# **Model selection and tuning and Results/Evaluations**

In [None]:
pp_data = cleaned_data

In [None]:
# Spliting the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() 
train, val  = train_test_split(pp_data, test_size=0.2)
X_train = train.drop(['log_price'], axis=1)
#X_train = scaler.fit_transform(X_train)
y_train = train.log_price
X_val = val.drop(['log_price'], axis=1)
#X_val = scaler.transform(X_val) 
y_val = val.log_price


In [None]:
def analysis(model, X_train, X_test, y_train, y_test):
    """Report metrics and diagnostic plots for ``model``.

    Calls ``model.predict`` on both splits (the model is not fitted here),
    passes the predictions to ``show_metrics``, then draws a scatter of true
    against predicted test values and a 50-bin distribution of the test
    residuals.
    """
    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    show_metrics(test_pred, train_pred, y_test, y_train)

    sns.regplot(x=y_test, y=test_pred, fit_reg=False)
    plt.title('Prediction and real')
    plt.show()

    residuals = y_test - test_pred
    sns.distplot(residuals, bins=50)
    plt.title('Error variance')
    plt.show()

In [None]:
from sklearn import metrics
from scipy import stats

def show_metrics(prediction_test, prediction_train, y_test, y_train):
    """Display a one-row table of regression metrics.

    Args:
        prediction_test: Predictions for the test/validation targets.
        prediction_train: Predictions for the training targets.
        y_test: True test/validation targets.
        y_train: True training targets.

    MAE, MSE and RMSE are computed on the test split and rounded to two
    decimals. The RMSE ratios divide each split's RMSE by the mean of its
    targets (three decimals). R_2_test and R_2_train are the coefficient of
    determination, rounded to two decimals.
    """
    mse_test = metrics.mean_squared_error(y_test, prediction_test)
    rmse_test = np.sqrt(mse_test)
    rmse_train = np.sqrt(metrics.mean_squared_error(y_train, prediction_train))

    results = {
        'MAE': round(metrics.mean_absolute_error(y_test, prediction_test), 2),
        'MSE': round(mse_test, 2),
        'RMSE': round(rmse_test, 2),
        'RMSE_ratio_test': round(rmse_test / np.mean(y_test), 3),
        'RMSE_ratio_train': round(rmse_train / np.mean(y_train), 3),
        # r2_score matches the column labels; explained variance ignores a
        # constant offset in the predictions and is a different quantity.
        'R_2_test': round(metrics.r2_score(y_test, prediction_test), 2),
        'R_2_train': round(metrics.r2_score(y_train, prediction_train), 2),
    }

    metrics_data = pd.DataFrame([results])
    display(metrics_data)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline: 10 trees, depth capped at 10, n_jobs=-1.
rf_params = {'n_estimators': 10, 'max_depth': 10, 'n_jobs': -1}
rfm = RandomForestRegressor(**rf_params)
rfm.fit(X_train, y_train)
analysis(rfm, X_train, X_val, y_train, y_val)


In [None]:
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import  mean_squared_error, r2_score

# Running model a XGB Regressor
model = xgb.XGBRegressor()

# Fit the model on training data
model.fit(X_train, y_train)

# Predict
pred_train = model.predict(X_train)

# Validate
pred_val = model.predict(X_val)

print("\nTraining MSE:", round(mean_squared_error(y_train, pred_train),4))
print("Validation MSE:", round(mean_squared_error(y_val, pred_val),4))
print("\nTraining r2:", round(r2_score(y_train, pred_train),4))
print("Validation r2:", round(r2_score(y_val, pred_val),4))

In [None]:
plot_importance(model)

In [None]:

analysis(
    model = model, 
    X_train = X_train,
    X_test = X_val,
    y_train = y_train,
    y_test =  y_val
)


In [None]:
from yellowbrick.regressor import ResidualsPlot

# Residuals plot for `model` on the training and validation splits.
visualizer = ResidualsPlot(model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_val, y_val)  # Evaluate the model on the validation data
visualizer.show()  # Finalize and render the figure

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.naive_bayes import MultinomialNB

# Linear regression baseline evaluated on the same train/validation split.
model2 = LinearRegression().fit(X_train, y_train)
analysis(model2, X_train, X_val, y_train, y_val)
