In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the Airbnb listings CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/us-airbnb-open-data/AB_US_2020.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Column names, dtypes and non-null counts: a first look at where values are missing.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype (string) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include='object')

In [None]:
# Missing Data
import missingno as msno

# Visualise where values are missing across the frame; the trailing ';'
# suppresses the returned object's repr in the cell output.
msno.matrix(df);

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# How often each price occurs; useful for spotting implausible values.
df['price'].value_counts()

In [None]:
# Flag listings priced at exactly 0 or 1 and display them.
# `filt` is kept as a notebook-level name because the next cell reads it.
filt = df['price'].eq(0) | df['price'].eq(1)
df.loc[filt]

In [None]:
# Exclude listings priced at 0 or 1.
# The mask is rebuilt here instead of reusing `filt` from the preceding cell:
# `filt` is reassigned by later cells, so re-running this cell out of order
# would otherwise drop the wrong rows. Rebuilding it makes the cell idempotent.
df = df[~df.price.isin([0, 1])]

In [None]:
# Distribution of availability_365 values; check how many listings sit at 0.
df['availability_365'].value_counts()

In [None]:
# Drop listings that are not available at all, i.e. availability_365 == 0 (57334 rows).
filt = df['availability_365'].ne(0)
df = df.loc[filt]

In [None]:
# Number of listings per room type.
df['room_type'].value_counts()

In [None]:
# Number of listings per city.
df['city'].value_counts()

In [None]:
# Distribution of the minimum-stay requirement.
df['minimum_nights'].value_counts()

In [None]:
# Restrict the analysis to listings whose minimum stay is at most 30 nights:
# anything requiring 31 nights or more is dropped.
filt = df['minimum_nights'].ge(31)
df = df.loc[~filt]

In [None]:
# Highest remaining price; a quick check for extreme outliers.
df['price'].max()

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# Target variable: price per night.
label = df['price']

# Two stacked panels: histogram on top, horizontal boxplot below.
fig, ax = plt.subplots(2, 1, figsize=(9, 12))

# Histogram of prices.
ax[0].hist(label, bins=100)
ax[0].set_ylabel('Frequency')

# Mark the mean and the median; the labels feed the legend so the colours
# are self-explanatory.
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2, label='Mean')
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2, label='Median')
ax[0].legend()

# Horizontal boxplot of the same values.
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel('$')

# Title for the whole figure.
fig.suptitle('Price/night')

# Use plt.show() rather than fig.show(): Figure.show() is meant for GUI
# backends and can emit a "non-GUI backend" warning under %matplotlib inline.
plt.show()



In [None]:
# Count the listings priced above 1000.
# `filt` is kept as a notebook-level name because the next cell reads it.
filt = df['price'].gt(1000)
df.loc[filt, 'price'].count()

In [None]:
# Remove listings priced above 1000 (treated as outliers).
# The mask is computed in this cell rather than taken from `filt` in the
# preceding cell, so the cell is self-contained and can be re-run safely
# (the old mask's index no longer matches `df` once rows have been dropped).
df = df[~(df.price > 1000)]

In [None]:
# Re-inspect row count, dtypes and non-null counts after the filtering steps above.
df.info()

In [None]:
# Target variable: price per night, after listings above 1000 were removed.
label = df['price']

# Two stacked panels: histogram on top, horizontal boxplot below.
fig, ax = plt.subplots(2, 1, figsize=(9, 12))

# Histogram of prices.
ax[0].hist(label, bins=100)
ax[0].set_ylabel('Frequency')

# Mark the mean and the median; the labels feed the legend so the colours
# are self-explanatory.
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2, label='Mean')
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2, label='Median')
ax[0].legend()

# Horizontal boxplot of the same values.
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel('$')

# Title for the whole figure.
fig.suptitle('Price/night')

# Use plt.show() rather than fig.show(): Figure.show() is meant for GUI
# backends and can emit a "non-GUI backend" warning under %matplotlib inline.
plt.show()

In [None]:
# One histogram per numeric feature, with dashed vertical lines at the
# mean (magenta) and the median (cyan).
numeric_features = ['minimum_nights', 'number_of_reviews', 'availability_365']

for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    values = df[col]
    values.hist(bins=100, ax=ax)
    for stat, colour in ((values.mean(), 'magenta'), (values.median(), 'cyan')):
        ax.axvline(stat, color=colour, linestyle='dashed', linewidth=2)
    ax.set_title(col)
plt.show()

In [None]:
# List the available column names before choosing model features.
df.columns

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Predictors and target for the price model.
feature_cols = ['room_type', 'minimum_nights', 'number_of_reviews', 'availability_365', 'city']
X = df[feature_cols]
y = df['price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=38
)

In [None]:
# Sanity check: the training features and target must have the same row count.
(X_train.shape, y_train.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Columns grouped by the kind of preprocessing they receive.
numeric_features = [
    'minimum_nights',
    'number_of_reviews',
    'availability_365',
]
categorical_features = ['room_type', 'city']

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# categories unseen during fitting from raising at transform time.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its own transformer.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Preprocessing followed by an ordinary least-squares regressor.
# NOTE: the final step is named 'classifier' although the estimator is a regressor.
linreg_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
]
model = Pipeline(steps=linreg_steps)

In [None]:
# Fit the preprocessing + regression pipeline on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out test split.
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the predictions on the test split: mean squared error, its square
# root (same units as price) and the coefficient of determination.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

for metric_label, metric_value in (("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

In [None]:
# Predicted vs. actual price on the held-out test split.
plt.scatter(y_test, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Predicted vs. actual price')

# Overlay the least-squares trend line of predictions against actual values.
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)

# A straight line is fully determined by its two endpoints, so draw it from
# the smallest to the largest actual value instead of as a polyline through
# every (unsorted) test observation.
x_ends = np.array([y_test.min(), y_test.max()])
plt.plot(x_ends, p(x_ends), color='red')
plt.show()

### Comment:
* Five features were selected: 'minimum_nights', 'number_of_reviews', 'availability_365', 'room_type', 'city'
* 3 numerical features: 'minimum_nights', 'number_of_reviews', 'availability_365'
* 2 categorical features : 'room_type','city'
* The target is price. Listings with prices above 1000 were treated as outliers and removed. Listings with prices of 0 or 1 were removed as well.
* Minimum nights were limited to 30 (listings requiring a stay of more than 30 nights were removed).
* The Linear Regression model gives an RMSE of 143.48 and an R² of 0.1755.

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same preprocessing as before, followed by a random-forest regressor.
# random_state is fixed (same seed as the train/test split) because the forest
# is built with randomness; without a seed the metrics change on every run.
# NOTE: the final step is named 'classifier' although the estimator is a regressor.
model = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', RandomForestRegressor(random_state=38))])

In [None]:
# Fit the current `model` pipeline (the random-forest one defined in the preceding cell)
# on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out test split; overwrites the earlier `predictions`.
predictions = model.predict(X_test)

In [None]:
# Evaluate the current predictions on the test split: mean squared error, its
# square root (same units as price) and the coefficient of determination.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

for metric_label, metric_value in (("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

### Comment:
* The Random Forest Regression model gives an RMSE of 143.76 and an R² of 0.1722.
* Further hyperparameter tuning could be done.