
This Jupyter Notebook is environment where perform different EDA techniques and training models
The Notebook follows below procdeure
1. Get Dataset
2. Split the Dataset
3. Feature Engineering and Pipeline Creation 
4. Model and Inference Pipeline
5. Train

In [2]:
import json
import yaml
import wandb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline , make_pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer


1. Download Dataset

In [3]:
# Download the dataset from w&B
# So this Dataset needs to be in W&B
import traceback
try :
    run = wandb.init(project = "nyc_airbnb" , group = "modeling" , save_code = True)
    input_path = wandb.use_artifact("sample.csv:latest").file()
    df = pd.read_csv(input_path)
except:
    traceback.print_exc()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdsri238[0m ([33mdsri238-s-p-global[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20000 non-null  int64  
 1   name                            19993 non-null  object 
 2   host_id                         20000 non-null  int64  
 3   host_name                       19992 non-null  object 
 4   neighbourhood_group             20000 non-null  object 
 5   neighbourhood                   20000 non-null  object 
 6   latitude                        20000 non-null  float64
 7   longitude                       20000 non-null  float64
 8   room_type                       20000 non-null  object 
 9   price                           20000 non-null  int64  
 10  minimum_nights                  20000 non-null  int64  
 11  number_of_reviews               20000 non-null  int64  
 12  last_review                     

In [6]:
# Fdeatures and Targerts 
X = df
y = df.pop("price")
X_train , y_train, X_test , y_test = train_test_split(X , y , test_size= 0.3 , stratify=X["neighbourhood_group"]
                                                      ,random_state = 42)

Feature Engineering and Processing Pipeline

In [7]:
ord_cat_columns = ["room_type"]
non_ord_cat_columns = ["neighbourhood_group"]

In [8]:
non_ord_cat_col_preprocessing = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder()
)

In [9]:
ord_cat_columns_preprocessing = OrdinalEncoder()

In [10]:
# Let's handle Numerical columns to handle missing Values
zero_imputed = ['latitude' ,
                'longitude',
                'minimum_nights',
                'number_of_reviews', 
                'reviews_per_month', 
                'calculated_host_listings_count', 
                'availability_365' ]

zero_imputer_preproc = SimpleImputer(strategy="constant" , fill_value=0)

In [11]:
def calculate_date_deltas(dates):
    dates_sanitized = pd.DataFrame(dates).apply(pd.to_datetime)
    delta_days = dates_sanitized.apply(lambda d: (d.max() - d).dt.days , axis = 0)
    return delta_days.to_numpy()

In [12]:
date_imputer = make_pipeline(
    SimpleImputer(strategy='constant' , fill_value='2010-01-01'),
    FunctionTransformer(calculate_date_deltas , check_inverse= False , validate= False)
)

In [13]:
# Get the number manuly fro config.yaml
max_tfidf_features = 5
reshape_1d = FunctionTransformer(lambda x: np.reshape(x, -1))
tfidf_preprocessing = make_pipeline(
    SimpleImputer(strategy="constant" , fill_value=""),
    reshape_1d,
    TfidfVectorizer(binary= False,
                    max_features= max_tfidf_features,
                    stop_words='english')
    )

In [14]:
# create Pipeline
preprocessor = ColumnTransformer(
    transformers =[
        ("ordinal_cat" , ord_cat_columns_preprocessing , ord_cat_columns  ),
        ("non_ordinal_cat" , non_ord_cat_col_preprocessing , non_ord_cat_columns  ),
        ("impute_zero" , zero_imputer_preproc , zero_imputed  ),
        ("date_trnasform" , date_imputer , ["last_review"]  ),
         ("transform_name" , tfidf_preprocessing , ["name"]  )
    ],
    remainder = "drop"
)

In [15]:
features_processed = ord_cat_columns + non_ord_cat_columns + zero_imputed +  ["last_review", "name"] 

In [16]:
features_processed

['room_type',
 'neighbourhood_group',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365',
 'last_review',
 'name']

Model and Inference Pipeline


In [17]:
config = dict()
with open("../../config.yaml") as fp:
    config = yaml.safe_load(fp)

randomforest_config = dict(config["modeling"]["random_forest"].items())
randomforest_config["random_state"] = 42


In [18]:
randomforest_config

{'n_estimators': 100,
 'max_depth': 15,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'n_jobs': -1,
 'criterion': 'squared_error',
 'max_features': 0.5,
 'oob_score': True,
 'random_state': 42}

In [19]:
random_forest = RandomForestRegressor(**randomforest_config)

In [20]:
sklearn_pipe = Pipeline(
    steps = [
        ("preprocessor" , preprocessor),
        ("regressor" , random_forest)
    ]
)

Train

In [22]:
parameters_grid = {
    'regressor__n_estimators' : [100,150, 200],
    'regressor__max_features' : [0.1,0.33,0.5 , 0.75, 1.0],
    'regressor__max_depth' : [n for n in range(5, 20, 5)]
}

search = GridSearchCV(estimator= sklearn_pipe ,
                      param_grid = parameters_grid ,
                      cv = 3,
                      scoring = 'neg_mean_absolute_error')

try:
    search.fit(X, y)
except:
    traceback.print_exc()

random_forest_pipe = search.best_estimator_

print('Best score : MAE = \n' , search.best_score_)
print('\n Best Params:\n' , search.best_params_)

Best score : MAE = 
 -66.94805775804679

 Best Params:
 {'regressor__max_depth': 15, 'regressor__max_features': 0.33, 'regressor__n_estimators': 200}
