# Introduction
The purpose of this notebook is to create custom transformers so that these preprocessing steps can be included in a scikit-learn Pipeline.

## Set environment

In [240]:
# 
import pandas as pd
import numpy as np
import pingouin as pg
from statsmodels.stats.outliers_influence import variance_inflation_factor

# scikit-learn
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# warnings
import warnings
warnings.filterwarnings("ignore")

## Import data

In [234]:
listings = pd.read_csv('listings_detailed.csv')

In [235]:
# Columns that match the 'host' / 'nights' patterns below but are kept as features.
keep_cols = ['minimum_nights', 'maximum_nights', 'host_is_superhost']
host_cols = [col for col in listings.columns if 'host' in col and col not in keep_cols]
night_cols = [col for col in listings.columns if 'nights' in col and col not in keep_cols]
other_cols = ['listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
              'picture_url', 'availability_30', 'availability_60', 'availability_90',
              'first_review', 'last_review', 'license', 'number_of_reviews_ltm',
              'number_of_reviews_l30d', 'calendar_last_scraped', 'neighborhood_overview',
              'neighbourhood', 'has_availability', 'amenities', 'host_id', 'id', 'property_type']

# empty columns (every value is missing)
mask_empty_cols = listings.isnull().all()
empty_cols = listings.columns[mask_empty_cols].to_list()

# columns to drop: de-duplicated ('host_id' is matched by both host_cols and
# other_cols) and limited to columns still present, so the cell can be re-run
# without raising a KeyError.
# NOTE: a misspelled name in other_cols is therefore skipped silently.
cols_drop = list(dict.fromkeys(host_cols + other_cols + night_cols + empty_cols))
cols_drop = [col for col in cols_drop if col in listings.columns]

# drop columns (rebinds the name instead of mutating the frame in place)
listings = listings.drop(columns=cols_drop)

In [236]:
# Features: every remaining column except the target.
X = listings.drop(columns='price')

# Target: strip the currency sign and thousands separators from the price text,
# then convert to numbers. Removing '$' explicitly (instead of slicing off the
# first character) keeps values that have no leading '$' intact.
# Text that still cannot be parsed becomes NaN.
y = pd.to_numeric(
    listings['price'].str.replace(r'[$,]', '', regex=True),
    errors='coerce',
)

In [237]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.3, random_state=42)

shape_report = ('X train and test sets shapes: ' + str(train_X.shape), str(test_X.shape),
                '\ny train and test sets shapes: ' + str(train_y.shape), str(test_y.shape))
print(*shape_report)

# reset index for further preprocessing
for split in (train_X, train_y, test_X, test_y):
    split.reset_index(drop=True, inplace=True)

X train and test sets shapes: (3702, 22) (1587, 22) 
y train and test sets shapes: (3702,) (1587,)


Since scikit-learn transformers cannot modify the target variable inside a Pipeline, we perform outlier removal as a preprocessing step before fitting the Pipeline. We will compare how the model performs with and without outliers in the data. We will also standardize the features to reduce the impact of outliers on the model. In any case, outliers are removed only from the training set.

In [207]:
def iqr_outlier_removal(train_X, train_y):
    """
    Remove the rows whose target value is an outlier under Tukey's
    Interquartile Range (IQR) rule. Prints the computed IQR, the lower
    and upper limits and the number of outliers found.

    A target value is an outlier when it is below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR.

    Input:
        train_X: features as a pandas DataFrame, row-aligned with train_y.
        train_y: numeric target as a pandas Series.
    Output:
        train_X: new DataFrame without the outlier rows, index reset.
        train_y: new Series without the outlier rows, index reset.

    The inputs are not modified; use the returned objects.
    """

    # nanpercentile: a missing target would otherwise turn both limits into
    # NaN and no outlier would ever be detected.
    Q1, Q3 = np.nanpercentile(train_y, [25, 75])
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Positional boolean mask, so the removal does not depend on the index
    # labels being 0..n-1. Missing targets compare False and are kept.
    is_outlier = ((train_y > upper_limit) | (train_y < lower_limit)).to_numpy()

    print('IQR is: ' + str(IQR) + ' with a lower limit: ' + str(lower_limit) + ' and a upper limit: ' + str(upper_limit) + '\n' +
          'In general we have: ' + str(int(is_outlier.sum())) + ' outliers in the dataset')

    # remove outliers
    train_X = train_X.loc[~is_outlier].reset_index(drop=True)
    train_y = train_y.loc[~is_outlier].reset_index(drop=True)

    return train_X, train_y

# Analysis/Modeling
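This section defines one custom transformer per preprocessing step, then applies them in sequence to the training set and to the test set.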

In [208]:
class CustomTransformerFeatures(BaseEstimator, TransformerMixin):
    """
    Derive the 'bathroom_type', 'size' and numeric 'bathrooms' features.

    Input:
        X: pandas DataFrame with 'bathrooms_text' and 'accommodates' columns.
    Output:
        Copy of X with 'bathroom_type', 'size' and 'bathrooms' appended
        (in that order) and 'bathrooms_text' removed. The input frame is
        left untouched, so the transform can be applied to it again.
    """

    def __init__(self):
        super().__init__()
        print('Transforming data. In the CustomTransformerFeatures init method: ')

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        X = X.copy()
        bathrooms_text = X['bathrooms_text']
        accommodates = X['accommodates']

        # masks bathroom type
        # case=False also matches 'Shared ...' and 'Half-bath'; na=False keeps the
        # masks strictly boolean when the text is missing.
        shared_bathroom = bathrooms_text.str.contains('shared', case=False, na=False)
        half_bathroom = bathrooms_text.str.contains('half-bath', case=False, na=False)

        # bathroom type
        X['bathroom_type'] = 'no shared'
        X.loc[shared_bathroom, 'bathroom_type'] = 'shared'

        # sizes: small (<= 3 guests), medium (4-7), large (>= 8).
        # 'large' has no upper bound, so listings above 20 guests no longer get
        # an empty label. A missing 'accommodates' still yields ''.
        X['size'] = ''
        X.loc[accommodates <= 3, 'size'] = 'small'
        X.loc[(accommodates > 3) & (accommodates < 8), 'size'] = 'medium'
        X.loc[accommodates >= 8, 'size'] = 'large'

        # number of bathrooms (numeric): first number in the text, or 0.5 for half-baths
        bathrooms = pd.to_numeric(
            bathrooms_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False),
            errors='coerce',
        )
        bathrooms.loc[half_bathroom] = 0.5
        X['bathrooms'] = bathrooms

        # drop unnecessary columns
        return X.drop(columns='bathrooms_text')

In [209]:
class CustomImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values in two passes.

    1. 'bedrooms', 'beds' and 'bathrooms' are filled with the mean observed
       during fit for listings with the same 'accommodates' value.
    2. Every remaining gap in a float/int column is filled with that column's
       mean from fit (SimpleImputer).

    Non-numeric columns are passed through unchanged and the column order seen
    during fit is restored. The input frame is not modified.
    """

    def __init__(self):
        super().__init__()
        self.acc_bedr = {}
        self.acc_beds = {}
        self.acc_bathr = {}
        self.columns_order = None
        print('Transforming data. In the CustomImputer init method: ')

    def fit(self, X, y=None):
        # Per-'accommodates' means, stored as Series indexed by accommodates value.
        by_accommodates = X.groupby(['accommodates'])
        self.acc_bedr = by_accommodates['bedrooms'].mean()
        self.acc_beds = by_accommodates['beds'].mean()
        self.acc_bathr = by_accommodates['bathrooms'].mean()

        # Column means used as the fallback for every numeric column.
        self.imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        self.imp_mean.fit(X.select_dtypes(include=[float, int]))
        self.columns_order = X.columns
        return self

    def transform(self, X, y=None):
        # Work on a copy so the caller's frame keeps its missing values.
        X = X.copy()

        # Group-mean fill. 'accommodates' values unseen during fit map to NaN
        # and are handled by the column-mean fallback below.
        group_means = (('bedrooms', self.acc_bedr),
                       ('beds', self.acc_beds),
                       ('bathrooms', self.acc_bathr))
        for column, means in group_means:
            X[column] = X[column].fillna(X['accommodates'].map(means))

        # impute missing values only in numeric columns
        numeric = X.select_dtypes(include=[float, int])
        X_imp_mean = pd.DataFrame(data=self.imp_mean.transform(numeric),
                                  index=X.index,
                                  columns=numeric.columns)

        # concatenate both DataFrames
        X = pd.concat([X.select_dtypes(exclude=['float', 'int']), X_imp_mean], axis=1)

        # set columns original order
        return X.reindex(columns=self.columns_order)

In [210]:
class FeatureSelectionANOVA(BaseEstimator, TransformerMixin):
    """
    Drop the categorical features whose one-way ANOVA against the target is
    not significant.

    Parameters:
        alpha: significance level; an object-dtype feature is dropped when its
            uncorrected p-value is greater than alpha (default 0.05).

    After fit, `cat_features` lists the tested columns and `drop_features`
    the ones that transform removes.
    """

    def __init__(self, alpha=.05):
        super().__init__()
        self.alpha = alpha
        self.cat_features = None
        self.drop_features = []
        print('Transforming data. In the FeatureSelectionANOVA init method: ')

    def fit(self, X, y):
        # y must be a named pandas Series; its name is used as the ANOVA
        # dependent variable and it is joined to X on the index.
        self.cat_features = X.select_dtypes(include=object).columns.to_list()
        # Start from a clean list so fitting again does not carry over old results.
        self.drop_features = []
        df = pd.concat([X, y], axis=1)
        for feature in self.cat_features:
            anova_table = pg.anova(data=df, dv=y.name, between=feature, detailed=True, effsize='np2')
            # First row of the table is the between-groups effect.
            p_value = anova_table['p-unc'].iloc[0]
            if p_value > self.alpha:
                self.drop_features.append(feature)

        return self

    def transform(self, X, y=None):
        # Returns a new frame; the caller's frame keeps all of its columns.
        return X.drop(columns=self.drop_features)

In [211]:
class CustomOrdinal(BaseEstimator, TransformerMixin):
    """
    Encode the 'size' column as ordered numbers:
    'small' -> 0, 'medium' -> 1, 'large' -> 2.

    transform returns a copy of X with 'size' replaced by its code; the input
    frame keeps its text labels, so it can be transformed again.
    """

    def __init__(self):
        super().__init__()
        self.categories = ['small', 'medium', 'large']
        self.encoder = OrdinalEncoder(categories=[self.categories])
        print('Transforming data. In the CustomOrdinal init method: ')

    def fit(self, X, y=None):
        # The encoder expects a 2-D input, hence the reshape to one column.
        self.encoder.fit(X['size'].values.reshape(-1, 1))
        return self

    def transform(self, X, y=None):
        X = X.copy()
        encoded = self.encoder.transform(X['size'].values.reshape(-1, 1))
        # Flatten the (n, 1) encoder output back to a single column.
        X['size'] = encoded.ravel()

        return X

In [212]:
class CustomDummy(BaseEstimator, TransformerMixin):
    """
    One-hot encode the non-numeric columns with pandas.get_dummies.

    fit records the output columns obtained with drop_first=True. transform
    encodes every level and then reindexes to those recorded columns, so
    levels not recorded during fit are dropped and recorded levels absent from
    the data become all-zero columns.

    Dummy columns are always uint8, independent of the pandas default, because
    CustomStandardizer identifies dummy columns by that dtype.
    """

    def __init__(self):
        super().__init__()
        print('Transforming data. In the CustomDummy init method: ')

    def fit(self, X, y=None):
        X_dummies = pd.get_dummies(X, drop_first=True, dtype=np.uint8)
        self.dummies = X_dummies.columns

        return self

    def transform(self, X, y=None):
        X_dummies = pd.get_dummies(X, dtype=np.uint8)

        return X_dummies.reindex(columns=self.dummies, fill_value=0)

In [213]:
class MulticollinearityVIF(BaseEstimator, TransformerMixin):
    """
    Remove collinear features using the Variance Inflation Factor (VIF).

    During fit the feature with the highest VIF is removed and the VIFs are
    recomputed, until the highest VIF is below `threshold` or a single feature
    is left. transform drops the features removed during fit.

    Parameters:
        threshold: a feature is removed while the highest VIF is greater than
            or equal to this value (default 10).

    After fit, `removed_features` lists the dropped columns in removal order
    and `vif` holds the VIF table of the remaining columns.

    NOTE(review): the VIF is computed on X as given, without an added
    constant column -- confirm that this is intended.
    """

    def __init__(self, threshold=10):
        super().__init__()
        self.threshold = threshold
        self.vif = pd.DataFrame()
        self.removed_features = []
        print('Transforming data. In the MulticollinearityVIF init method: ')

    @staticmethod
    def _vif_table(X):
        # VIF of every column of X, sorted from highest to lowest.
        # Cast to float so a mix of numeric and dummy dtypes gives a numeric array.
        values = X.to_numpy(dtype=float)
        vif = pd.DataFrame({'variables': X.columns,
                            'VIF': [variance_inflation_factor(values, i) for i in range(X.shape[1])]})
        return vif.sort_values(by='VIF', ascending=False)

    def fit(self, X, y=None):
        # Start from a clean state so fitting again does not carry over old results.
        self.removed_features = []
        self.vif = self._vif_table(X)

        # iterate and remove VIF values greater or equal to the threshold
        while len(self.vif) > 1 and self.vif['VIF'].iloc[0] >= self.threshold:
            worst_feature = self.vif['variables'].iloc[0]
            # Rebind to a new frame: the caller's X keeps all of its columns.
            X = X.drop(columns=worst_feature)
            self.removed_features.append(worst_feature)
            self.vif = self._vif_table(X)

        return self

    def transform(self, X, y=None):
        X = X[X.columns[~X.columns.isin(self.removed_features)]]

        return X

In [214]:
class CustomStandardizer(BaseEstimator, TransformerMixin):
    """
    Standardize every non-dummy column with StandardScaler.

    Dummy columns are recognised by dtype (uint8 or bool) and passed through
    unscaled. The output has the scaled columns first, followed by the dummy
    columns, and keeps the row index of the input.
    """

    def __init__(self):
        super().__init__()
        self.not_dummy_cols = None
        self.scaler = StandardScaler()
        print('Transforming data. In the CustomStandardizer init method: ')

    def fit(self, X, y=None):
        # list of not dummy cols; 'bool' covers pandas versions whose
        # get_dummies produces boolean instead of uint8 columns
        self.not_dummy_cols = X.select_dtypes(exclude=['uint8', 'bool']).columns.to_list()
        self.scaler.fit(X[self.not_dummy_cols])

        return self

    def transform(self, X, y=None):
        # dummy cols df
        X_dummy = X.drop(columns=self.not_dummy_cols)
        # standardize not dummy cols, keeping the input's index so both parts
        # line up row by row
        X_scaled = pd.DataFrame(data=self.scaler.transform(X[self.not_dummy_cols]),
                                columns=self.not_dummy_cols,
                                index=X.index)

        return pd.concat([X_scaled, X_dummy], axis=1)

In [215]:
class CustomRandomForestSelector(BaseEstimator, TransformerMixin):
    """
    Keep the features a random forest ranks as most important.

    Parameters:
        n_features: number of top-ranked features to keep (default 10).
        max_depth: maximum depth of the forest's trees (default 10).
        random_state: seed for the forest, so the ranking is reproducible
            (default 42).

    fit trains the forest, displays all importances sorted from highest to
    lowest and stores the top `n_features` names in `selected_features`.
    transform keeps those columns in their original order.
    """

    def __init__(self, n_features=10, max_depth=10, random_state=42):
        super().__init__()
        self.n_features = n_features
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
        self.selected_features = None
        print('Transforming data. In the CustomRandomForestSelector init method: ')

    def fit(self, X, y):
        # Re-apply the parameters so changes made through set_params are honoured.
        self.model.set_params(max_depth=self.max_depth, random_state=self.random_state)
        self.model.fit(X, y)
        feat_importances = pd.Series(self.model.feature_importances_, index=X.columns)
        feat_importances = feat_importances.sort_values(ascending=False)
        # `display` is the notebook's rich-output helper.
        display(feat_importances)
        self.selected_features = feat_importances.nlargest(self.n_features).index.to_list()
        return self

    def transform(self, X, y=None):
        X = X[X.columns[X.columns.isin(self.selected_features)]]

        return X

### Transformers

In [216]:
# One instance per preprocessing step, listed in the order they are applied.
# Each constructor prints a message when it runs.
CustomTransformerFeatures_imp = CustomTransformerFeatures()  # derived features
CustomImputer_imp = CustomImputer()  # missing-value imputation
FSel_trans = FeatureSelectionANOVA()  # ANOVA-based categorical feature selection
Ordinal = CustomOrdinal()  # ordinal encoding of 'size'
Dummy = CustomDummy()  # dummy encoding
MultVIF = MulticollinearityVIF()  # VIF-based feature removal
Stand = CustomStandardizer()  # standardization of non-dummy columns
RFfeature = CustomRandomForestSelector()  # random-forest feature selection

Transforming data. In the CustomTransformerFeatures init method: 
Transforming data. In the CustomImputer init method: 
Transforming data. In the FeatureSelectionANOVA init method: 
Transforming data. In the CustomOrdinal init method: 
Transforming data. In the CustomDummy init method: 
Transforming data. In the MulticollinearityVIF init method: 
Transforming data. In the CustomStandardizer init method: 
Transforming data. In the CustomRandomForestSelector init method: 


### Train set

In [217]:
X_feat = CustomTransformerFeatures_imp.fit_transform(train_X)

In [218]:
X_feat.head()

Unnamed: 0,host_is_superhost,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,minimum_nights,maximum_nights,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,bathroom_type,size,bathrooms
0,f,Centro,36.72602,-4.42028,Entire home/apt,8,4.0,6.0,1,1125,...,,,,,,t,,no shared,large,2.0
1,f,Este,36.71792,-4.3488,Entire home/apt,6,3.0,5.0,3,1124,...,10.0,10.0,10.0,10.0,9.0,t,0.31,no shared,medium,1.0
2,f,Centro,36.71084,-4.42777,Entire home/apt,6,3.0,6.0,1,1125,...,8.0,8.0,8.0,8.0,6.0,t,0.13,no shared,medium,2.0
3,f,Centro,36.72371,-4.42494,Entire home/apt,3,1.0,2.0,1,1125,...,9.0,10.0,10.0,10.0,8.0,t,0.12,no shared,small,1.0
4,f,Centro,36.72428,-4.41632,Entire home/apt,16,7.0,16.0,1,1125,...,,,,,,f,,no shared,large,2.0


In [219]:
X_feat_imp = CustomImputer_imp.fit_transform(X_feat)

In [220]:
X_feat_imp.head()

Unnamed: 0,host_is_superhost,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bedrooms,beds,minimum_nights,maximum_nights,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,bathroom_type,size,bathrooms
0,f,Centro,36.72602,-4.42028,Entire home/apt,8,4.0,6.0,1,1125,...,9.409355,9.664391,9.656197,9.573916,9.231478,t,1.212589,no shared,large,2.0
1,f,Este,36.71792,-4.3488,Entire home/apt,6,3.0,5.0,3,1124,...,10.0,10.0,10.0,10.0,9.0,t,0.31,no shared,medium,1.0
2,f,Centro,36.71084,-4.42777,Entire home/apt,6,3.0,6.0,1,1125,...,8.0,8.0,8.0,8.0,6.0,t,0.13,no shared,medium,2.0
3,f,Centro,36.72371,-4.42494,Entire home/apt,3,1.0,2.0,1,1125,...,9.0,10.0,10.0,10.0,8.0,t,0.12,no shared,small,1.0
4,f,Centro,36.72428,-4.41632,Entire home/apt,16,7.0,16.0,1,1125,...,9.409355,9.664391,9.656197,9.573916,9.231478,f,1.212589,no shared,large,2.0


In [221]:
X_feat_imp_sel = FSel_trans.fit_transform(X_feat_imp, train_y)

In [222]:
X_feat_imp_sel.head()

Unnamed: 0,host_is_superhost,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,bathroom_type,size,bathrooms
0,f,36.72602,-4.42028,8,4.0,6.0,1,1125,63,0,...,9.540984,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,no shared,large,2.0
1,f,36.71792,-4.3488,6,3.0,5.0,3,1124,177,17,...,10.0,10.0,10.0,10.0,10.0,9.0,0.31,no shared,medium,1.0
2,f,36.71084,-4.42777,6,3.0,6.0,1,1125,365,1,...,6.0,8.0,8.0,8.0,8.0,6.0,0.13,no shared,medium,2.0
3,f,36.72371,-4.42494,3,1.0,2.0,1,1125,365,2,...,10.0,9.0,10.0,10.0,10.0,8.0,0.12,no shared,small,1.0
4,f,36.72428,-4.41632,16,7.0,16.0,1,1125,365,0,...,9.540984,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,no shared,large,2.0


In [223]:
X_feat_imp_sel_ord = Ordinal.fit_transform(X_feat_imp_sel)

In [224]:
X_feat_imp_sel_ord.head()

Unnamed: 0,host_is_superhost,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,bathroom_type,size,bathrooms
0,f,36.72602,-4.42028,8,4.0,6.0,1,1125,63,0,...,9.540984,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,no shared,2.0,2.0
1,f,36.71792,-4.3488,6,3.0,5.0,3,1124,177,17,...,10.0,10.0,10.0,10.0,10.0,9.0,0.31,no shared,1.0,1.0
2,f,36.71084,-4.42777,6,3.0,6.0,1,1125,365,1,...,6.0,8.0,8.0,8.0,8.0,6.0,0.13,no shared,1.0,2.0
3,f,36.72371,-4.42494,3,1.0,2.0,1,1125,365,2,...,10.0,9.0,10.0,10.0,10.0,8.0,0.12,no shared,0.0,1.0
4,f,36.72428,-4.41632,16,7.0,16.0,1,1125,365,0,...,9.540984,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,no shared,2.0,2.0


In [225]:
X_feat_imp_sel_ord_dummy = Dummy.fit_transform(X_feat_imp_sel_ord)

In [226]:
X_feat_imp_sel_ord_dummy.head()

Unnamed: 0,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,review_scores_rating,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,size,bathrooms,host_is_superhost_t,bathroom_type_shared
0,36.72602,-4.42028,8,4.0,6.0,1,1125,63,0,92.503926,...,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,2.0,2.0,0,0
1,36.71792,-4.3488,6,3.0,5.0,3,1124,177,17,92.0,...,10.0,10.0,10.0,10.0,9.0,0.31,1.0,1.0,0,0
2,36.71084,-4.42777,6,3.0,6.0,1,1125,365,1,60.0,...,8.0,8.0,8.0,8.0,6.0,0.13,1.0,2.0,0,0
3,36.72371,-4.42494,3,1.0,2.0,1,1125,365,2,100.0,...,9.0,10.0,10.0,10.0,8.0,0.12,0.0,1.0,0,0
4,36.72428,-4.41632,16,7.0,16.0,1,1125,365,0,92.503926,...,9.409355,9.664391,9.656197,9.573916,9.231478,1.212589,2.0,2.0,0,0


In [227]:
X_feat_imp_sel_ord_dummy_vif = MultVIF.fit_transform(X_feat_imp_sel_ord_dummy)

In [228]:
X_feat_imp_sel_ord_dummy_vif

Unnamed: 0,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,reviews_per_month,size,bathrooms,host_is_superhost_t,bathroom_type_shared
0,6.0,1,1125,63,0,1.212589,2.0,2.0,0,0
1,5.0,3,1124,177,17,0.310000,1.0,1.0,0,0
2,6.0,1,1125,365,1,0.130000,1.0,2.0,0,0
3,2.0,1,1125,365,2,0.120000,0.0,1.0,0,0
4,16.0,1,1125,365,0,1.212589,2.0,2.0,0,0
...,...,...,...,...,...,...,...,...,...,...
3697,5.0,5,21,308,1,0.080000,2.0,2.0,0,0
3698,0.0,1,1125,365,3,0.210000,2.0,2.0,0,0
3699,2.0,2,1125,90,0,1.212589,0.0,1.0,0,0
3700,1.0,1,15,35,0,1.212589,0.0,2.0,0,1


In [229]:
X_feat_imp_sel_ord_dummy_vif_stand = Stand.fit_transform(X_feat_imp_sel_ord_dummy_vif)

In [230]:
X_feat_imp_sel_ord_dummy_vif_stand

Unnamed: 0,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,reviews_per_month,size,bathrooms,host_is_superhost_t,bathroom_type_shared
0,1.648879,-0.195292,0.693785,-1.242382,-0.571344,1.884532e-16,2.162709,1.115171,0,0
1,1.158408,-0.053935,0.691712,-0.361216,-0.259850,-7.660434e-01,0.464290,-0.509600,0,0
2,1.648879,-0.195292,0.693785,1.091934,-0.553021,-9.188126e-01,0.464290,1.115171,0,0
3,-0.313006,-0.195292,0.693785,1.091934,-0.534698,-9.272998e-01,-1.234130,-0.509600,0,0
4,6.553592,-0.195292,0.693785,1.091934,-0.571344,1.884532e-16,2.162709,1.115171,0,0
...,...,...,...,...,...,...,...,...,...,...
3697,1.158408,0.087422,-1.594706,0.651351,-0.553021,-9.612485e-01,2.162709,1.115171,0,0
3698,-1.293948,-0.195292,0.693785,1.091934,-0.516374,-8.509152e-01,2.162709,1.115171,0,0
3699,-0.313006,-0.124614,0.693785,-1.033685,-0.571344,1.884532e-16,-1.234130,-0.509600,0,0
3700,-0.803477,-0.195292,-1.607143,-1.458808,-0.571344,1.884532e-16,-1.234130,1.115171,0,1


In [231]:
X_feat_imp_sel_ord_dummy_vif_stand_RFsel = RFfeature.fit_transform(X_feat_imp_sel_ord_dummy_vif_stand, train_y)

reviews_per_month       0.381879
number_of_reviews       0.171651
availability_365        0.164334
minimum_nights          0.092600
bathrooms               0.065330
maximum_nights          0.053594
size                    0.036946
beds                    0.020668
bathroom_type_shared    0.011276
host_is_superhost_t     0.001723
dtype: float64

In [232]:
X_feat_imp_sel_ord_dummy_vif_stand_RFsel

Unnamed: 0,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,reviews_per_month,size,bathrooms,host_is_superhost_t,bathroom_type_shared
0,1.648879,-0.195292,0.693785,-1.242382,-0.571344,1.884532e-16,2.162709,1.115171,0,0
1,1.158408,-0.053935,0.691712,-0.361216,-0.259850,-7.660434e-01,0.464290,-0.509600,0,0
2,1.648879,-0.195292,0.693785,1.091934,-0.553021,-9.188126e-01,0.464290,1.115171,0,0
3,-0.313006,-0.195292,0.693785,1.091934,-0.534698,-9.272998e-01,-1.234130,-0.509600,0,0
4,6.553592,-0.195292,0.693785,1.091934,-0.571344,1.884532e-16,2.162709,1.115171,0,0
...,...,...,...,...,...,...,...,...,...,...
3697,1.158408,0.087422,-1.594706,0.651351,-0.553021,-9.612485e-01,2.162709,1.115171,0,0
3698,-1.293948,-0.195292,0.693785,1.091934,-0.516374,-8.509152e-01,2.162709,1.115171,0,0
3699,-0.313006,-0.124614,0.693785,-1.033685,-0.571344,1.884532e-16,-1.234130,-0.509600,0,0
3700,-0.803477,-0.195292,-1.607143,-1.458808,-0.571344,1.884532e-16,-1.234130,1.115171,0,1


### Test set

In [238]:
# Apply the transformers fitted on the training set to the test set.
# Only transform() is called here, never fit(), so the state stored during
# the training-set fits is reused.
# test_y is passed to two transform() calls; both accept y and ignore it.
X_t_feat = CustomTransformerFeatures_imp.transform(test_X)
X_t_feat_imp = CustomImputer_imp.transform(X_t_feat)
X_t_feat_imp_sel = FSel_trans.transform(X_t_feat_imp, test_y)
X_t_feat_imp_sel_ord = Ordinal.transform(X_t_feat_imp_sel)
X_t_feat_imp_sel_ord_dummy = Dummy.transform(X_t_feat_imp_sel_ord)
X_t_feat_imp_sel_ord_dummy_vif = MultVIF.transform(X_t_feat_imp_sel_ord_dummy)
X_t_feat_imp_sel_ord_dummy_vif_stand = Stand.transform(X_t_feat_imp_sel_ord_dummy_vif)
X_t_feat_imp_sel_ord_dummy_vif_stand_RFsel = RFfeature.transform(X_t_feat_imp_sel_ord_dummy_vif_stand, test_y)

In [239]:
X_t_feat_imp_sel_ord_dummy_vif_stand_RFsel

Unnamed: 0,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,reviews_per_month,size,bathrooms,host_is_superhost_t,bathroom_type_shared
0,0.667937,-0.195292,0.693785,0.434924,1.114386,2.014946e-01,0.46429,1.115171,1,0
1,-0.313006,-0.124614,0.693785,0.821401,0.564692,2.099818e-01,0.46429,-0.509600,1,0
2,0.667937,-0.053935,-0.601783,-1.033685,-0.094942,-2.058898e-01,0.46429,1.115171,0,0
3,-0.803477,0.087422,0.693785,-1.404702,-0.516374,-3.501718e-01,-1.23413,-0.509600,0,0
4,1.648879,0.228780,-1.327301,0.775023,-0.571344,1.884532e-16,0.46429,1.927556,0,1
...,...,...,...,...,...,...,...,...,...,...
1582,-0.803477,-0.195292,0.693785,1.084204,-0.534698,-5.029410e-01,-1.23413,-0.509600,0,0
1583,-0.803477,-0.124614,-1.265113,-1.481997,-0.516374,1.517007e+00,-1.23413,-0.509600,1,0
1584,-0.803477,-0.124614,0.693785,-1.729342,0.949478,6.289578e-03,0.46429,-0.509600,1,0
1585,-0.803477,-0.195292,0.693785,1.091934,-0.479728,-6.726845e-01,-1.23413,0.302785,0,1


# Models

Work in progress: the model definitions and hyper-parameter searches below are not finished yet.

In [None]:
# Candidate regressors, all with default hyper-parameters.
lasso = Lasso()
ridge = Ridge()
elasticnet = ElasticNet()
# Fixed the misspelled class name, which raised a NameError.
rfr = RandomForestRegressor()

In [None]:
# Regularisation strengths to try. alpha=0 is left out: it turns Lasso into
# plain least squares, and scikit-learn advises against using Lasso that way.
param_grid = {'alpha': np.array([1, 0.1, 0.01, 0.001, 0.0001])}
# Shuffled 5-fold CV with a fixed seed so every search sees the same folds.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Exhaustive search over the alpha grid for the Lasso model, scored by
# negative mean squared error on the shared CV folds.
lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=folds,
    scoring='neg_mean_squared_error',
)

In [None]:
# random forest
# grid search params
# (code dedented to cell level; the original indentation was a syntax error)
folds = KFold(n_splits=5, shuffle=True, random_state=42)

n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]
max_depth = [int(x) for x in np.linspace(10, 110, num=6)]
max_depth.append(None)

# param_grid
# NOTE: this rebinds `param_grid`, which the Lasso search above also uses;
# re-run the Lasso grid cell before re-running the Lasso search.
# max_features: 1.0 (all features) stands in for 'auto', which recent
# scikit-learn releases no longer accept.
param_grid = {'n_estimators': n_estimators,
              'max_features': [1.0, 'sqrt', 'log2'],
              'max_depth': max_depth,
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'bootstrap': [True, False]}

# The estimator is created here (the original referred to an undefined `rf`).
# Both the forest and the sampling of candidates are seeded for reproducibility.
rf_cv = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_distributions=param_grid,
                           n_iter=200,
                           cv=folds,
                           scoring='neg_mean_squared_error',
                           random_state=42)