In [1]:
import pandas as pd
from assets.data_loader import DataLoader
from assets.MissingData import MissingData
from assets.graphs import CreateGraph
from assets.Data_analysis import DataAnalysis
from assets.make_models import NumericScaler, CatScaler, MakeModel
from sklearn.impute import KNNImputer,SimpleImputer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import dill as pickle
import warnings
# Silence every warning category for the session; the second filter names
# FutureWarning explicitly on top of the blanket rule.
warnings.filterwarnings(action='ignore')
warnings.simplefilter('ignore', FutureWarning)
# Raise pandas' display limits so wide / long frames are shown with less truncation.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 110

In [2]:
# Point the project loader at the train / test CSV files and unpack the pair
# it returns (presumably two pandas DataFrames — later cells index them by column).
data_loader = DataLoader(
    train_file="data/train.csv",
    test_file="data/test.csv",
)
df_train, df_test = data_loader.load_data()

In [3]:
# Columns that are stored as numbers but are cast to strings here so they can
# be handled as categories rather than quantities.
num_to_cat = ['YrSold', 'MoSold', 'MSSubClass', 'GarageYrBlt']
# Apply the identical cast to both frames so train and test stay consistent.
for frame in (df_train, df_test):
    frame[num_to_cat] = frame[num_to_cat].astype('str')

In [4]:
# Columns whose missing entries are replaced with the literal string "None".
columns_to_fill = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'GarageFinish', 'GarageType', 'Electrical',
    'Functional', 'Exterior2nd', 'Exterior1st',
    'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2',
]
# Same fill on train and test so both frames share one encoding of "missing".
df_train[columns_to_fill] = df_train[columns_to_fill].fillna(value="None")
df_test[columns_to_fill] = df_test[columns_to_fill].fillna(value="None")

In [5]:
# Columns discarded from both frames before modelling.
to_del = ['Utilities', 'Electrical', 'Street', 'PoolQC', 'GarageYrBlt', "Id"]
# Keep an independent copy of the target before it is removed from the features.
Y_train = df_train['SalePrice'].copy()
# One drop per frame: the target (train only) plus the discarded columns.
df_train = df_train.drop(columns=['SalePrice'] + to_del)
df_test = df_test.drop(columns=to_del)

In [6]:
# Numeric columns routed to the numeric branch of the preprocessor.
numerical_features = [
    'GarageCars',
    'OverallCond',
    'OverallQual',
    'YearBuilt',
    'GrLivArea',
    'TotalBsmtSF',
    'LotArea',
]
# Categorical columns routed to the categorical branch of the preprocessor.
categorical_features = [
    'Neighborhood',
]

In [7]:
# Row positions (not index labels) of the training rows to discard.
outliers = [
    898, 1423, 1169, 278, 1046, 409, 1181, 1182, 1065, 1324,
    45, 559, 688, 691, 822, 313, 581, 462, 1359, 473,
    218, 1244, 608, 864, 238, 496, 885, 632, 523, 1298,
    30, 88, 631, 1322,
]
# `.index[outliers]` translates the positions into labels for `drop`.
# NOTE(review): this cell is not idempotent — running it a second time
# re-evaluates the positions against the already-filtered objects.
df_train = df_train.drop(index=df_train.index[outliers])
Y_train = Y_train.drop(index=Y_train.index[outliers])

In [8]:
# Numeric branch: fill gaps from the 10 nearest neighbours, then apply the
# project's NumericScaler.
numerical_transformer = Pipeline([
    ('KNNImputer', KNNImputer(n_neighbors=10)),
    ('scaler', NumericScaler()),
])

# Categorical branch: replace NaN with the constant "NULL", then apply the
# project's CatScaler.
categorical_transformer = Pipeline([
    (
        "SimpleImputer",
        SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="NULL"),
    ),
    ('scaler', CatScaler()),
])

# Route each feature list to its branch. No `remainder` is passed, so the
# ColumnTransformer default applies to every column not listed here.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [9]:
# Create the project's model wrapper and hand it the ColumnTransformer named `preprocessor`.
MLModels = MakeModel()
# Left as the trailing expression on purpose: the cell echoes the returned
# object's repr (a MakeModel instance, per the captured output).
MLModels.add_preprocessor(preprocessor)

<assets.make_models.MakeModel at 0x16bffbd90>

In [10]:
# Key under which the XGBoost regressor is registered in MLModels.
xgb = 'XGBoost'
# "reg:squarederror" is the current name of the squared-error objective;
# the previously used "reg:linear" is a deprecated alias for the same loss.
MLModels.add_model(
    xgb,
    XGBRegressor(
        booster='gbtree',
        objective="reg:squarederror",
        random_state=42,
        reg_alpha=0.00006,
    ),
)
# Every list holds a single value, so the search evaluates one candidate and
# effectively cross-validates a fixed configuration.
# The "regressor__regressor__" prefix presumably matches the nested step names
# that MakeModel builds around the estimator — confirm in assets/make_models.
parameters = {
    "regressor__regressor__eta": [0.09645082865558717],
    "regressor__regressor__n_estimators": [425],
    "regressor__regressor__max_depth": [4],
    "regressor__regressor__subsample": [0.75],
    "regressor__regressor__colsample_bytree": [0.6998820355976964],
    "regressor__regressor__min_child_weight": [6],
    "regressor__regressor__gamma": [0.05],
}
# Trailing expression: the cell echoes the returned object's repr.
MLModels.gridsearch(xgb, df_train, Y_train, parameters)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex


Najlepszy estymator: 
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('KNNImputer',
                                                                   KNNImputer(n_neighbors=10)),
                                                                  ('scaler',
                                                                   NumericScaler())]),
                                                  ['GarageCars', 'OverallCond',
                                                   'OverallQual', 'YearBuilt',
                                                   'GrLivArea', 'TotalBsmtSF',
                                                   'LotArea']),
                                                 ('cat',
                                                  Pipeline(steps=[('SimpleImputer',
                                                                   SimpleImputer(fill_value='NULL',
    

<assets.make_models.MakeModel at 0x16bffbd90>

In [12]:
# Persist the first entry of MLModels.grid (presumably the fitted search for
# the XGBoost model — confirm against MakeModel). `pickle` is dill, imported
# under that alias in the first cell.
# The `with` block guarantees the file handle is flushed and closed even if
# serialization raises; the original left the handle open.
with open('assets/xgb.sav', 'wb') as model_file:
    pickle.dump(MLModels.grid[0], model_file)