# Data Modelling

## Importing the libraries

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [103]:
df=pd.read_csv('data_pre_modelling.csv', low_memory=False)

In [104]:
df.shape

(225907, 81)

In [105]:
df.reviews_per_month.isna().sum()

48532

In [106]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood', 'latitude',
       'longitude', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'city_Asheville', 'city_Austin', 'city_Boston',
       'city_Broward County', 'city_Cambridge', 'city_Chicago',
       'city_Clark County', 'city_Columbus', 'city_Denver', 'city_Hawaii',
       'city_Jersey City', 'city_Los Angeles', 'city_Nashville',
       'city_New Orleans', 'city_New York City', 'city_Oakland',
       'city_Pacific Grove', 'city_Portland', 'city_Rhode Island',
       'city_Salem', 'city_San Clara Country', 'city_San Diego',
       'city_San Francisco', 'city_San Mateo County', 'city_Santa Cruz County',
       'city_Seattle', 'city_Twin Cities MSA', 'city_Washington D.C.',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room',
       'neighbourhood_group_Ballard', 'neighb

In [107]:
df2 = df.drop(columns=['id', 'name','host_id','host_name','last_review','neighbourhood','longitude','latitude'])

In [108]:
dfd = df2.isna().sum()
pd.set_option('display.max_rows', None)
print(dfd)

minimum_nights                                  0
number_of_reviews                               0
reviews_per_month                           48532
calculated_host_listings_count                  0
availability_365                                0
city_Asheville                                  0
city_Austin                                     0
city_Boston                                     0
city_Broward County                             0
city_Cambridge                                  0
city_Chicago                                    0
city_Clark County                               0
city_Columbus                                   0
city_Denver                                     0
city_Hawaii                                     0
city_Jersey City                                0
city_Los Angeles                                0
city_Nashville                                  0
city_New Orleans                                0
city_New York City                              0


In [109]:
X=df2.iloc[:,:-1].values
y=df2.iloc[:,-1].values

### Imputation using Mean for 'reviews_per_month'

In [110]:
index_no = df2.columns.get_loc('reviews_per_month')
print(index_no)

2


In [111]:
xrpm = X[:,index_no].reshape(-1, 1)

In [112]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(xrpm)
xrpm=imputer.transform(xrpm)

In [113]:
X[:,index_no]=xrpm.reshape(1,-1)

In [114]:
X[:,index_no]

array([1.14      , 1.03      , 0.81      , ..., 1.43152958, 1.43152958,
       1.43152958])

## Splitting the dataset into the Training set and Test set

In [115]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [116]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [117]:
y_pred = regressor.predict(X_test)

In [118]:
y_pred

array([145.2795166 , 571.66962965, 293.35515863, ...,  93.20314196,
       552.75159202, 175.12595579])

In [119]:
y_test

array([  72, 1950,  183, ...,   85,   60,  131], dtype=int64)

In [123]:
y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred, squared=False)

505.6940061734732

In [126]:
df2.price.mean()

219.76152133400026

## Training the Decision Tree Regression model on the whole dataset

In [127]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X,y)

DecisionTreeRegressor(random_state=0)

In [128]:
y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred, squared=False)

196.94244045773362

## Training the Random Forest Regression model on the whole dataset

In [129]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X,y)

RandomForestRegressor(n_estimators=10, random_state=0)

In [130]:
y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred, squared=False)

263.32937330451705

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':range(10,300,10), 'criterion':('mse','mae'), 'max_features':('auto','sqrt','log2')}

rf = RandomForestRegressor()

gs = GridSearchCV(rf,parameters,scoring='neg_mean_absolute_error',cv=3)
gs.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
