In [34]:
import pandas as pd
import numpy as np
df = pd.read_csv("../data/raw_data.csv")


In [35]:
categorical_col = []
numerical_col = []
for column in df.columns:
    
    if df[column].dtypes != "float64" and df[column].dtypes != "int64":
        categorical_col.append(column)
    else:
        numerical_col.append(column)

In [36]:
columns_null = df.isnull().sum()
columns_null = columns_null[columns_null > 0]
columns_null

bathrooms                   200
first_review              15864
host_has_profile_pic        188
host_identity_verified      188
host_response_rate        18299
host_since                  188
last_review               15827
neighbourhood              6872
review_scores_rating      16722
thumbnail_url              8216
zipcode                     966
bedrooms                     91
beds                        131
dtype: int64

In [39]:
numcolumns = df[numerical_col].isnull().sum()
numcolumns_null = numcolumns[numcolumns > 0]
numcolumns_null

bathrooms                 200
review_scores_rating    16722
bedrooms                   91
beds                      131
dtype: int64

In [40]:
df[numcolumns_null.index] = df[numcolumns_null.index].fillna(df[numcolumns_null.index].median())

first_review              15864
host_has_profile_pic        188
host_identity_verified      188
host_response_rate        18299
host_since                  188
last_review               15827
neighbourhood              6872
thumbnail_url              8216
zipcode                     966
dtype: int64

In [41]:
for i in categorical_col:
    mostfrequent = df[i].value_counts().idxmax()
    df[i] = df[i].fillna(mostfrequent)


In [42]:
df.dtypes

id                          int64
log_price                 float64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
review_scores_rating      float64
thumbnail_url              object
zipcode                    object
bedrooms                  float64
beds                      float64
dtype: object

In [45]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in categorical_col:
    df[col] = le.fit_transform(df[col])

In [53]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_squared_error
models = ["Linear Regression","DTreeReg","RandomForest","GradientBoosting"]

x = df.drop(["id","name","log_price","description","first_review","host_since","last_review","neighbourhood",
            "thumbnail_url", "zipcode"],axis = 1)
y = df["log_price"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)


    

In [54]:
for i in models:
    if i == "Linear Regression":
        model = LinearRegression()
        model.fit(x_train,y_train)
        ypred = model.predict(x_test)

    elif i == "DTreeReg":
        param_grid = {
            'max_depth': [2, 4, 10, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        model = DecisionTreeRegressor(
            random_state=42
        )  
        grid = GridSearchCV(estimator=model,param_grid=param_grid,cv=5)
        grid.fit(x_train,y_train)
        ypred = grid.predict(x_test)
    
    elif i == "RandomForest":
        param_grid = {
            'n_estimators': [50, 100],
            'max_depth': [2, 4, 8],
            'min_samples_split': [2, 5]
        }
        model = RandomForestRegressor(random_state=42, n_jobs=-1)
        grid = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=5, cv=3, n_jobs=-1)
        grid.fit(x_train,y_train)
        ypred = grid.predict(x_test)
    
    else:
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0]
        }
        model = GradientBoostingRegressor(random_state=42)
        grid = GridSearchCV(model,param_grid=param_grid,n_jobs=1)
        grid.fit(x_train,y_train)
        ypred=grid.predict(x_test)
        
    mae_rf = mean_absolute_error(y_test, ypred)
    mse_rf = mean_squared_error(y_test, ypred)
    rmse_rf = np.sqrt(mean_squared_error(y_test, ypred))
    r2_rf = r2_score(y_test, ypred)

    print(i)
    print('\nMean Absolute Error of Random Forest Regressor     : ', mae_rf)
    print('\nMean Squarred Error of Random Forest Regressor     : ', mse_rf)
    print('\nRoot Mean Squarred Error of Random Forest Regressor: ', rmse_rf)
    print('\nR2 Score of Random Forest Regressor                : ', r2_rf)
    

Linear Regression

Mean Absolute Error of Random Forest Regressor     :  0.36798779467839776

Mean Squarred Error of Random Forest Regressor     :  0.23867647213115062

Root Mean Squarred Error of Random Forest Regressor:  0.4885452610875994

R2 Score of Random Forest Regressor                :  0.5354011296499173
DTreeReg

Mean Absolute Error of Random Forest Regressor     :  0.3133797951278926

Mean Squarred Error of Random Forest Regressor     :  0.1858034262506118

Root Mean Squarred Error of Random Forest Regressor:  0.4310492155782351

R2 Score of Random Forest Regressor                :  0.6383218623418614
RandomForest

Mean Absolute Error of Random Forest Regressor     :  0.31168641726640994

Mean Squarred Error of Random Forest Regressor     :  0.18070110784881935

Root Mean Squarred Error of Random Forest Regressor:  0.4250895292156928

R2 Score of Random Forest Regressor                :  0.6482538482827993
GradientBoosting

Mean Absolute Error of Random Forest Regressor    