In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)
# Decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
# Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# Model Eval
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
# Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,VotingRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoost,CatBoostRegressor
from xgboost import XGBRegressor,XGBRFRegressor
# Other
import pickle
import wandb

# Project identifier; presumably meant for Weights & Biases runs (wandb is
# imported above) — it is not referenced by any visible cell.
PROJECT_NAME = 'House-Prices-Advanced-Regression-Techniques'
# Compute device label; not referenced by any visible cell.
device = 'cpu'

In [None]:
# Seed NumPy's global RNG and Python's `random` module with a fixed value so a
# fresh "Restart & Run All" repeats the same random draws.
# NOTE(review): the estimators constructed below are given no random_state;
# presumably they draw from NumPy's global RNG — confirm.
np.random.seed(99)
random.seed(99)

In [None]:
# Load the competition training set; the SalePrice column is split off as the
# regression target further down.
data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
def valid(model,X,y,valid=False):
    """Score ``model`` on ``X``/``y`` with MAE and MSE.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features passed to ``model.predict``.
    y : ground-truth targets compared against the predictions.
    valid : bool, default False
        When true, the metric keys are prefixed with ``'val '``.

    Returns
    -------
    dict
        Two entries, mean absolute error first and mean squared error second.
    """
    predictions = model.predict(X)
    mae = mean_absolute_error(y_true=y,y_pred=predictions)
    mse = mean_squared_error(y_true=y,y_pred=predictions)
    if not valid:
        return {
            'mean_absolute_error':mae,
            'mean_squared_error':mse,
        }
    return {
        'val mean_absolute_error':mae,
        'val mean_squared_error':mse,
    }

In [None]:
def fe(data,col,lower=0.05,upper=0.95):
    """Return the rows of ``data`` whose ``col`` value is not an outlier.

    A row is kept when its value lies between the ``lower`` and ``upper``
    quantiles of the column (both bounds inclusive). The previous version
    filtered for values above the upper quantile and then for values below
    the lower one, which can never both hold, so it always returned an
    empty frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : column label
        Numeric column used to decide which rows to keep.
    lower, upper : float, default 0.05 and 0.95
        Quantile levels that bound the retained range.

    Returns
    -------
    pandas.DataFrame
        The retained rows. Rows where ``col`` is missing are dropped because
        a missing value never compares as inside the range.
    """
    min_num = data[col].quantile(lower)
    max_num = data[col].quantile(upper)
    # Inclusive bounds so a constant or single-row column is kept intact.
    return data[data[col].between(min_num,max_num)]

In [None]:
def object_to_int(data,col):
    """Replace the values of ``data[col]`` with integer codes, in place.

    Codes are handed out in order of first appearance, starting at 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is overwritten with the codes.
    col : column label
        Column to encode.

    Returns
    -------
    tuple
        ``(data, highest_code, mapping, codes)``: the same frame object, the
        largest code assigned (-1 when the column is empty), the
        value-to-code dict, and the list of codes written to the column.
    """
    labels_and_int_index = {}
    encoded = []
    for value in data[col].to_dict().values():
        if value not in labels_and_int_index:
            # The next free code equals the number of distinct values seen so far.
            labels_and_int_index[value] = len(labels_and_int_index)
        encoded.append(labels_and_int_index[value])
    data[col] = encoded
    highest_code = len(labels_and_int_index) - 1
    return data,highest_code,labels_and_int_index,encoded

In [None]:
def make_submission(model,name):
    """Predict on the competition test set and write ``./{name}.csv``.

    The test CSV is read, every column is imputed/encoded, and the whole
    frame (including ``Id``) is passed to ``model.predict``. The output file
    has the columns ``Id`` and ``SalePrice``.

    Column handling:
    * integer and float columns: missing values are filled with the median;
    * all other columns: missing values are filled with 0 and the column is
      label-encoded with ``object_to_int``.

    The numeric check uses pandas' dtype helpers rather than ``dtype == int``,
    which only matches the platform's default integer width and would send
    other integer columns down the label-encoding branch. A ``replace`` call
    whose result was discarded has been removed; it had no effect.

    NOTE(review): the label codes are derived from the test file alone, so a
    category may receive a different code here than it did in the training
    frame — confirm whether the training mapping should be reused.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : str
        Base name (without extension) of the CSV written to the working directory.
    """
    data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
    # Capture the ids before the loop rewrites the frame's columns.
    ids = data['Id']
    for col in list(data.columns):
        dtype = data[col].dtype
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            data[col] = data[col].fillna(data[col].median())
        else:
            data[col] = data[col].fillna(0)
            data,_,_,_ = object_to_int(data,col)
    preds = model.predict(data)
    df = pd.DataFrame({'Id':ids,'SalePrice':preds})
    df.to_csv(f'./{name}.csv',index=False)

In [None]:
def train(model,X_train,X_test,y_train,y_test,name):
    """Fit ``model``, print hold-out and training metrics, write a submission.

    The validation metrics are computed on ``X_test``/``y_test``. The previous
    version scored the training data twice, so the ``'val ...'`` numbers were
    training-set scores and the hold-out split was never used.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : data used for fitting and for the training metrics.
    X_test, y_test : hold-out data used for the ``'val ...'`` metrics.
    name : str
        Passed to ``make_submission`` as the output file's base name.

    Returns
    -------
    The fitted ``model``.
    """
    model.fit(X_train,y_train)
    # Hold-out scores first (keys prefixed with 'val '), then training scores.
    print(valid(model,X_test,y_test,True))
    print(valid(model,X_train,y_train,False))
    make_submission(model,name)
    return model

In [None]:
# Impute and encode every column of the training frame in place:
#   * integer/float columns -> missing values filled with the column median;
#   * everything else       -> missing values filled with 0, then label-encoded
#                              by object_to_int (codes in order of first appearance).
# pandas' dtype helpers replace `dtype == int`, which only matches the
# platform's default integer width. A `replace` call whose result was discarded
# (a no-op) has been removed.
# NOTE(review): make_submission derives its label codes from the test file
# independently, so the same category can get different codes there — confirm.
for col in list(data.columns):
    dtype = data[col].dtype
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        data[col] = data[col].fillna(data[col].median())
    else:
        data[col] = data[col].fillna(0)
        data,idx,labels_and_int_index,new_data = object_to_int(data,col)

In [None]:
# Display the frame after imputation and encoding as a visual check.
data

In [None]:
# Print one line per column: name, dtype and number of missing values.
missing_per_column = data.isna().sum()
for column_name, column_dtype in data.dtypes.items():
    print(column_name,column_dtype,missing_per_column[column_name])

In [None]:
# Feature matrix: every column except the target.
# NOTE(review): the Id column stays in X as a feature, and make_submission
# also passes Id to the model — confirm this is intended.
X = data.drop(columns='SalePrice')

In [None]:
# Regression target.
y = data['SalePrice']

In [None]:
# Hold out 6.25% of the rows for validation. random_state pins the split so
# that re-running this cell on its own reproduces the same partition instead of
# depending on how much of the global RNG stream has been consumed so far.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.0625,random_state=99)

In [None]:
# Baseline: gradient boosting with default hyperparameters on the unscaled
# features; train() also writes ./baseline-0.csv via make_submission.
train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,name='baseline-0')

In [None]:
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)

In [None]:
# Transformer classes compared by the loop below; each is instantiated with
# its default arguments. The commented-out entries are currently skipped.
preprocessings = [
#     StandardScaler,
#     RobustScaler,
#     MinMaxScaler,
#     MaxAbsScaler,
#     OneHotEncoder,
    Normalizer,
    Binarizer
]

In [None]:
# Keep untouched copies of the split so every experiment can start from the
# same untransformed features.
X_train_old = X_train.copy()
X_test_old = X_test.copy()

In [None]:
# Imported here so the cell is self-contained; scikit-learn is already a
# dependency of this notebook.
from sklearn.pipeline import make_pipeline

# Compare preprocessing steps by training the same regressor behind each one.
# The transformer and the regressor are wrapped in a single pipeline so that:
#   * the transform fitted on the training split is also applied whenever the
#     model predicts (hold-out metrics and the submission written by train()),
#     instead of predicting on raw features with a model fitted on transformed ones;
#   * a failing transformer raises immediately rather than being swallowed by a
#     bare `except` that left X_test untransformed;
#   * the global X_train / X_test are no longer overwritten with the last
#     transformer's output, so later cells keep training on the original split.
for preprocessing in preprocessings:
    preprocessing = preprocessing()
    pipeline = make_pipeline(preprocessing,GradientBoostingRegressor())
    train(pipeline,X_train_old.copy(),X_test_old.copy(),y_train,y_test,name=f'{preprocessing}-preprocessing')

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

In [None]:
# Default-configured decomposition objects; not used by any visible cell.
pca = PCA()
kernelpca = KernelPCA()

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel

In [None]:
# fs = VarianceThreshold()
# X_train = X_train_old.copy()
# X_test = X_test_old.copy()
# try:
#     X_train = fs.fit_transform(X_train)
#     X_test = fs.transform(X_test)
# except:
#     X_train = fs.fit_transform(X_train)
# train(GradientBoostingRegressor(),X_train,X_test,y_train,y_test,name=f'{fs}-fs')

In [None]:
# Candidate regressors as [label, estimator class] pairs; the label is used to
# name the submission file when a comparison loop iterates over this list.
# LogisticRegression and GaussianNB were removed because they are classifiers
# and SalePrice is a continuous target; a duplicated ExtraTreesRegressor entry
# (which would train twice and overwrite the same output file) was dropped.
models = [
    ['KNeighborsRegressor',KNeighborsRegressor],
    ['DecisionTreeRegressor',DecisionTreeRegressor],
    ['GradientBoostingRegressor',GradientBoostingRegressor],
    ['AdaBoostRegressor',AdaBoostRegressor],
    ['RandomForestRegressor',RandomForestRegressor],
    ['BaggingRegressor',BaggingRegressor],
    ['ExtraTreesRegressor',ExtraTreesRegressor],
    ['CatBoost',CatBoost],
    ['CatBoostRegressor',CatBoostRegressor],
    ['XGBRegressor',XGBRegressor],
    ['XGBRFRegressor',XGBRFRegressor],
]

In [None]:
# for model in models:
#     print(model)
#     train(model[1](),X_train,X_test,y_train,y_test,f'{model[0]}-model')

In [None]:
# DecisionTreeRegressor
# ExtraTreesRegressor

In [None]:
# param_grid = {
#     'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
#     "splitter":['best','random'],
#     'max_depth':[None,1,2,3,4,5],
#     'max_features':['auto','sqrt','log2',None]
    
# }
# model = GridSearchCV(DecisionTreeRegressor(),cv=2,param_grid=param_grid).fit(X_train,y_train)

In [None]:
# model.best_params_

In [None]:
# First final run: a depth-limited decision tree.
# NOTE(review): presumably these hyperparameters came from the commented-out
# grid search above — confirm.
train(
    DecisionTreeRegressor(
        criterion='absolute_error',
        max_depth=5,
        max_features=None,
        splitter='best',
    ),
    X_train,
    X_test,
    y_train,
    y_test,
    'Final1',
)

In [None]:
# param_grid = {
#     'n_estimators':[100,125,250,500,1000],
#     'criterion':['squared_error','absolute_error'],    
#     'max_depth':[None,1,2,3,4,5],
#     'max_features':['auto','sqrt','log2',None],
#     'bootstrap':[False,True],
#     'oob_score':[False,True],
#     'warm_start':[False,True]
# }
# model = GridSearchCV(ExtraTreesRegressor(),cv=2,param_grid=param_grid).fit(X_train,y_train)

In [None]:
# model.best_params_

In [None]:
# Second final run: extra-trees ensemble with default hyperparameters.
train(
    ExtraTreesRegressor(),
    X_train,
    X_test,
    y_train,
    y_test,
    'Final2',
)