<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/hackerearth/get_a_room_hackathon/notebooks/02_linear_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import gc
import os
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn import linear_model as lm
from sklearn.metrics import mean_squared_error, r2_score

# Single seed reused by every stochastic component in this notebook.
SEED = 23
# Seed NumPy's legacy global generator.
np.random.seed(SEED)
# Exported as a string because environment variables only hold text.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

### Data 

In [2]:
# Base URL of the competition's data folder (raw GitHub content).
data_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/hackerearth/get_a_room_hackathon/data/'

# Locations of the three CSV files read below.
train_csv_url = data_url + 'processed/train.csv'
test_csv_url = data_url + 'processed/test.csv'
target_csv_url = data_url + 'raw/train.csv'

# Feature tables from the `processed/` folder.
train = pd.read_csv(train_csv_url)
test = pd.read_csv(test_csv_url)

# Only the target column is read from the raw training file.
target = pd.read_csv(target_csv_url, usecols=['Habitability_score'])

In [3]:
train.head()

Unnamed: 0,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,aqi_range
0,0,106,3,1,1,0,0,2,5.89,1,1,90,3.86,1
1,0,733,2,2,0,1,0,1,4.37,0,1,96,3.55,1
2,0,737,4,2,2,0,0,2,7.45,1,1,121,3.81,2
3,0,900,3,2,0,2,1,2,6.16,3,1,100,1.34,1
4,3,2238,14,6,2,0,0,3,5.46,0,1,116,4.77,2


In [4]:
test.head()

Unnamed: 0,Property_ID,Property_Type,Property_Area,Number_of_Windows,Number_of_Doors,Furnishing,Frequency_of_Powercuts,Power_Backup,Water_Supply,Traffic_Density_Score,Crime_Rate,Dust_and_Noise,Air_Quality_Index,Neighborhood_Review,aqi_range
0,0x6e93,0,293,3,1,0,0,0,2,7.28,3,1,152,2.52,3
1,0x8787,0,586,4,1,1,0,0,1,7.63,0,1,92,4.16,1
2,0x6c17,4,305,1,2,1,1,0,3,5.39,2,1,90,2.92,1
3,0x9dbd,0,258,2,1,1,1,0,3,7.53,1,1,158,3.45,3
4,0xbfde,3,3031,12,4,2,0,0,3,8.79,3,2,186,2.72,3


In [5]:
target.head()

Unnamed: 0,Habitability_score
0,71.98
1,71.2
2,71.39
3,31.46
4,93.7


### Feature sets

In [6]:
# Every column of the training frame is a candidate predictor.
features = train.columns.tolist()

# Columns treated as numerical.
num_features = [
    'Property_Area',
    'Number_of_Windows',
    'Number_of_Doors',
    'Traffic_Density_Score',
    'Air_Quality_Index',
    'Neighborhood_Review',
]

# Everything that is not numerical is treated as categorical.
cat_features = [col for col in features if col not in num_features]

# Categorical columns split by encoding: one-hot vs. ordinal.
onehot_cat_features = ['Property_Type']
ordinal_cat_features = [col for col in cat_features
                        if col not in onehot_cat_features]

In [7]:
# Apply the same dtype convention to both frames: ordinal categoricals become
# int8 and the one-hot column becomes a pandas 'category'. The preprocessing
# column selectors defined later pick columns by these dtypes.
for frame in (train, test):
    frame[ordinal_cat_features] = frame[ordinal_cat_features].astype('int8')
    frame[onehot_cat_features] = frame[onehot_cat_features].astype('category')

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Property_Type           39499 non-null  category
 1   Property_Area           39499 non-null  int64   
 2   Number_of_Windows       39499 non-null  int64   
 3   Number_of_Doors         39499 non-null  int64   
 4   Furnishing              39499 non-null  int8    
 5   Frequency_of_Powercuts  39499 non-null  int8    
 6   Power_Backup            39499 non-null  int8    
 7   Water_Supply            39499 non-null  int8    
 8   Traffic_Density_Score   39499 non-null  float64 
 9   Crime_Rate              39499 non-null  int8    
 10  Dust_and_Noise          39499 non-null  int8    
 11  Air_Quality_Index       39499 non-null  int64   
 12  Neighborhood_Review     39499 non-null  float64 
 13  aqi_range               39499 non-null  int8    
dtypes: category(1), float6

[EDA notebook](https://github.com/stiwari-ds/data-science-competitions/blob/main/hackerearth/get_a_room_hackathon/notebooks/01_eda.ipynb)

In [9]:
# Smallest feature set evaluated in this notebook.
top_features = [
    'Neighborhood_Review',
    'Furnishing',
    'Crime_Rate',
    'Power_Backup',
]

# Columns left out of the reduced feature set.
rejected_features = [
    'Number_of_Doors',
    'Traffic_Density_Score',
    'aqi_range',
]
# All features minus the rejected ones, in their original column order.
reduced_features = [col for col in features if col not in rejected_features]

# Modeling pipelines

**Preprocessing**

In [10]:
# One-hot encoder for categorical columns; handle_unknown='ignore' means
# categories not seen during fit do not raise at transform time.
onehot_encoder = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop=None, handle_unknown='ignore')),
])

# Candidate treatments for the numerical columns, in the order that defines
# their 1-based key in `numerical_pipelines`.
_numerical_step_sets = [
    [('standard_scaler', StandardScaler())],
    [('robust_scaler', RobustScaler())],
    [('standard_scaler', StandardScaler()),
     ('minmax_scaler', MinMaxScaler())],
    [('robust_scaler', RobustScaler()),
     ('minmax_scaler', MinMaxScaler())],
    [('quantile_uniform', QuantileTransformer(output_distribution='uniform',
                                              random_state=SEED))],
    [('quantile_normal', QuantileTransformer(output_distribution='normal',
                                             random_state=SEED))],
    [('power_transform', PowerTransformer())],
    [('kbins', KBinsDiscretizer(encode='ordinal', strategy='quantile'))],
]

# Keys 1..8 map to one Pipeline per treatment above.
numerical_pipelines = {
    number: Pipeline(steps=steps)
    for number, steps in enumerate(_numerical_step_sets, start=1)
}

In [11]:
def _build_preprocessor(num_pipe, onehot_dtypes):
    """Return a ColumnTransformer that one-hot encodes columns whose dtype is
    in `onehot_dtypes`, applies `num_pipe` to int64/float64 columns and passes
    every remaining column through untouched."""
    return ColumnTransformer(
        transformers=[
            ('cat', onehot_encoder, make_column_selector(dtype_include=onehot_dtypes)),
            ('num', num_pipe, make_column_selector(dtype_include=['int64', 'float64'])),
        ],
        remainder='passthrough'
    )


# Each numerical pipeline yields two consecutive preprocessors:
#   odd key  -> one-hot encode only 'Property_Type'; the remaining (int8)
#               categorical features are kept as ordinal values
#   even key -> one-hot encode all the categorical features
preprocessors_dict = {}
for position, num_pipe in enumerate(numerical_pipelines.values()):
    odd_key = 2 * position + 1
    preprocessors_dict[odd_key] = _build_preprocessor(num_pipe, ['category'])
    preprocessors_dict[odd_key + 1] = _build_preprocessor(num_pipe, ['category', 'int8'])

In [12]:
preprocessors_dict[16]

ColumnTransformer(remainder='passthrough',
                  transformers=[('cat',
                                 Pipeline(steps=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f6fdfc86050>),
                                ('num',
                                 Pipeline(steps=[('kbins',
                                                  KBinsDiscretizer(encode='ordinal'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f6fdfc86090>)])

**Regressors**

In [13]:
def _without_normalization(estimator_cls, **params):
    """Instantiate `estimator_cls` with its built-in feature normalization off.

    Older scikit-learn releases expose a `normalize` constructor argument on
    the LARS-family estimators; newer releases removed it, and passing it
    raises TypeError. The argument is therefore only forwarded when the
    installed estimator still lists it among its parameters. Scaling of the
    inputs is handled by the preprocessing pipelines in either case.
    """
    if 'normalize' in estimator_cls().get_params():
        params['normalize'] = False
    return estimator_cls(**params)


# Linear regressors to evaluate, keyed by the number used in model names.
regressors_dict = {
    1: lm.LinearRegression(),

    2: lm.Ridge(random_state=SEED),

    3: lm.ElasticNet(random_state=SEED),

    4: _without_normalization(lm.Lars, random_state=SEED),

    5: lm.Lasso(random_state=SEED),

    6: lm.LassoLars(random_state=SEED),

    7: _without_normalization(lm.LassoLarsIC, criterion='aic'), #AkaikeIC

    8: _without_normalization(lm.LassoLarsIC, criterion='bic'), #BayesIC

    9: _without_normalization(lm.OrthogonalMatchingPursuit),

    10: lm.ARDRegression(),

    11: lm.BayesianRidge(),

    12: lm.HuberRegressor(),

    13: lm.RANSACRegressor(random_state=SEED),

    14: lm.TheilSenRegressor(random_state=SEED),

    15: lm.PassiveAggressiveRegressor(early_stopping=True, random_state=SEED)
}

**Model pipeline = Preprocessor + Regressor**

In [14]:
# Cross product of preprocessors and regressors. Each pipeline is stored under
# the key '<preprocessor number>_<regressor number>'.
models_dict = {
    f'{prep_num}_{reg_num}': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])
    for prep_num, preprocessor in preprocessors_dict.items()
    for reg_num, regressor in regressors_dict.items()
}

In [15]:
models_dict['1_13']

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6fdfc913d0>),
                                                 ('num',
                                                  Pipeline(steps=[('standard_scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f6fdfc91510>)])),
                ('regressor', RANSACRegressor(random_state=23))])

**Model evaluation**

In [21]:
def evaluate_model(model, features, verbose=False):
    """Cross-validate `model` on the training data and predict the test set.

    Uses the notebook-level `train`, `test` and `target` frames restricted to
    `features`. A 5-fold shuffled KFold (seeded with `SEED`) is run; in each
    fold the model is re-fit on the training split, scored on the validation
    split and used to predict the whole test frame.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. It is re-fit in place
        on every fold.
    features : list of str
        Columns of `train` / `test` used as predictors.
    verbose : bool, default=False
        If True, display a per-fold table of RMSE and R2.

    Returns
    -------
    preds_test : np.ndarray
        Test-set predictions averaged across the folds.
    avg_r2 : float
        Mean validation R2 across the folds, floored at 0.
    """
    preds_test = []
    scores_r2 = [] #validation set R2 scores
    scores_rmse = [] #validation set RMSE

    X, X_test, y = train[features], test[features], target

    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # KFold yields positional indices, so both X and y are sliced with
        # .iloc; label-based .loc only works by accident on a RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds_val = model.predict(X_val)
        preds_test.append(model.predict(X_test))

        # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error
        # is not accepted by newer scikit-learn releases.
        scores_rmse.append(np.sqrt(mean_squared_error(y_val, preds_val)))
        scores_r2.append(r2_score(y_val, preds_val))

    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'RMSE': scores_rmse,
            'R2-score': scores_r2
        })
        scores_df.index.name = 'Fold'
        display(scores_df)

    # A negative average R2 is reported (and returned) as 0.
    avg_r2 = max(0, np.mean(scores_r2))
    print(f'Avg. RMSE = {np.mean(scores_rmse):.4f}, ' \
          f'Avg. R2-score = {avg_r2:.4f}')

    # One column per fold -> average across the folds for each test row.
    preds_test = np.mean(np.column_stack(preds_test), axis=1)
    return preds_test, avg_r2

In [22]:
# Result stores filled by the evaluation loops below, both keyed by
# '(model_num)-(feature_set)-(target_configuration)':
#   predictions_dict -> fold-averaged test predictions
#   scores_dict      -> average validation R2
predictions_dict, scores_dict = {}, {}

### Configuration 1: Target column not transformed

**Naming convention for predictions:  (model_num)-(feature_set)-(target_configuration)**

In [None]:
# (banner, feature-set code, columns); codes: top - 0, reduced - 1, all - 2
feature_set_runs = (
    ('Feature set - TOP:', 0, top_features),
    ('Feature set - REDUCED', 1, reduced_features),
    ('Feature set - ALL', 2, features),
)

for model_num, model in models_dict.items():
    print(f'\n--- Model number: {model_num} ---')

    for banner, set_code, set_columns in feature_set_runs:
        print(banner)
        # Trailing '-1' marks target configuration 1 (target not transformed).
        result_key = f'{model_num}-{set_code}-1'
        predictions_dict[result_key], scores_dict[result_key] = evaluate_model(model, set_columns)

    _ = gc.collect()

### Configuration 2: Manually transformed target

In [None]:
# (banner, feature-set code, columns); codes: top - 0, reduced - 1, all - 2
feature_set_runs = (
    ('Feature set - TOP', 0, top_features),
    ('Feature set - REDUCED', 1, reduced_features),
    ('Feature set - ALL', 2, features),
)

for model_num, model in models_dict.items():
    # Fit on target / 20 and multiply predictions by 20 to return to the
    # original scale.
    transformed_model = TransformedTargetRegressor(
        regressor=model,
        func=lambda x: x / 20,
        inverse_func=lambda x: x * 20,
        check_inverse=True
    )
    print(f'\n--- Model number: {model_num} ---')

    for banner, set_code, set_columns in feature_set_runs:
        print(banner)
        # Trailing '-2' marks target configuration 2 (manually scaled target).
        result_key = f'{model_num}-{set_code}-2'
        predictions_dict[result_key], scores_dict[result_key] = evaluate_model(transformed_model, set_columns)

    _ = gc.collect()

### Configuration 3: Quantile-transformed target

In [None]:
# (banner, feature-set code, columns); codes: top - 0, reduced - 1, all - 2
feature_set_runs = (
    ('Feature set - TOP', 0, top_features),
    ('Feature set - REDUCED', 1, reduced_features),
    ('Feature set - ALL', 2, features),
)

for model_num, model in models_dict.items():
    # The target is passed through a normal-output quantile transform before
    # the regressor is fit.
    transformed_model = TransformedTargetRegressor(
        regressor=model,
        transformer=QuantileTransformer(output_distribution='normal', random_state=SEED)
    )
    print(f'\n--- Model number: {model_num} ---')

    for banner, set_code, set_columns in feature_set_runs:
        print(banner)
        # Trailing '-3' marks target configuration 3 (quantile-transformed target).
        result_key = f'{model_num}-{set_code}-3'
        predictions_dict[result_key], scores_dict[result_key] = evaluate_model(transformed_model, set_columns)

    _ = gc.collect()

In [27]:
max(scores_dict, key=scores_dict.get)

'6_12-2-3'

In [28]:
scores_dict['6_12-2-3']

0.6956465499807586

In [32]:
predictions_dict['6_12-2-3'][:10]

array([32.02256803, 75.02372683, 67.77698567, 72.72623188, 72.23317077,
       68.23009626, 80.50028303, 71.75880382, 79.74746496, 78.68462865])

# Generating submission file

In [36]:
# Submission for model 16_12 (all features, target configuration 3):
# property IDs from the test frame next to the fold-averaged predictions.
sub_16_12 = test[['Property_ID']].assign(
    Habitability_score=predictions_dict['16_12-2-3']
)
sub_16_12.to_csv('02-sub-16_12-2-3.csv', index=False)

In [37]:
# Submission for model 6_12 (all features, target configuration 3):
# property IDs from the test frame next to the fold-averaged predictions.
sub_6_12 = test[['Property_ID']].assign(
    Habitability_score=predictions_dict['6_12-2-3']
)
sub_6_12.to_csv('02-sub-6_12-2-3.csv', index=False)

In [38]:
!head 02-sub-6_12-2-3.csv

Property_ID,Habitability_score
0x6e93,32.02256802546572
0x8787,75.02372683131316
0x6c17,67.77698567251244
0x9dbd,72.72623187636742
0xbfde,72.23317076824536
0x6a39,68.230096260093
0x47a6,80.50028302569049
0x7687,71.75880381779663
0x963a,79.74746495790177
