In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            #print(f'Feature {feature} was removed: \nNaNs {(percent * 100):.2f}%')
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def fillNaN(train_ds, test_ds, method='ffill'):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    new_train = new_train.fillna(method=method)
    new_test = new_test.fillna(method=method)
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('Absolute Error:', metrics.mean_absolute_error(preds, gt))
    print('Squared Error:', metrics.mean_squared_error(preds, gt))
    print('Squared Log Error:', metrics.mean_squared_log_error(preds, gt))

def deleteCorrelation(train_ds, test_ds, threshold):
    corr_matrix = train_ds.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in to_drop:
        new_train = new_train.drop(columns=feature)
        new_test = new_test.drop(columns=feature)

    return new_train, new_test

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train) 
train, test = deleteNaN(train, test ,critval=0.4)   
train, test = fillNaN(train, test) 
train, test = convertToNumeric(train, test) 
train, test = deleteCorrelation(train, test, threshold=0.8) 

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

In [5]:
parameters = {
    'max_depth': (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, None),
    'max_features':('auto', 'sqrt', 'log2')
}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

ValueError: Parameter grid for parameter (criterion) needs to be a list or numpy array, but got (<class 'set'>). Single values need to be wrapped in a list with one element.