In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the data
housing = pd.read_excel('housing.xlsx')

In [8]:
num_cols = housing.select_dtypes(exclude='O').columns
cat_cols = housing.select_dtypes(include='O').columns

In [9]:
num_pipeline = Pipeline([('impute', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

In [11]:
cat_pipeline = Pipeline([('oe', OneHotEncoder(handle_unknown='ignore'))])

In [12]:
preprocessing = ColumnTransformer([('num', num_pipeline, num_cols), ('cat', cat_pipeline, cat_cols)])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_pipeline, numerical_features),
#         ('cat', categorical_pipeline, categorical_features)
#     ])

In [15]:
# final_pipeline = Pipeline([('preproc', preprocessing), ('mdl', xgb.XGBRegressor())])
data_cleaned = preprocessing.fit_transform(housing)

In [16]:
data_cleaned

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])

In [29]:
final_pipeline = Pipeline([('preprocess', preprocessing), ('xgb', xgb.XGBRegressor())])

In [30]:
final_pipeline.fit(housing, housing.median_house_value)

In [31]:
gbm_param = {
    'xgb__n_estimators': [40,50,60],
    'xgb__max_depth':[4,6,8],
    'xgb__learning_rate':[.4,.2,.5,.8],
    'xgb__colsample_bytree':[.01,.02,.03,.4,.5,.8]
}

In [32]:
xgb_rscv = RandomizedSearchCV(estimator=final_pipeline, scoring='neg_mean_squared_error', param_distributions=gbm_param, n_iter=20, n_jobs=-1)

In [33]:
xgb_rscv.fit(housing, housing.median_house_value)

In [None]:
xgb_rscv.get_params()

In [35]:
xgb_rscv.predict(housing)

array([452665.4 , 359415.06, 352353.6 , ...,  92256.25,  85365.79,
        89027.45], dtype=float32)

In [36]:
from joblib import load, dump

In [37]:
dump(xgb_rscv, 'file_xgb_rscv.joblib')

['file_xgb_rscv.joblib']

In [38]:
saved_model = load('file_xgb_rscv.joblib')

In [39]:
saved_model.predict(housing)

array([452665.4 , 359415.06, 352353.6 , ...,  92256.25,  85365.79,
        89027.45], dtype=float32)

In [40]:
import pickle

In [41]:
with open('pipeline_model', 'wb') as file:
    pickle.dump(xgb_rscv, file)

In [42]:
pwd()

'E:\\Personal\\IITKAIML\\IITKAIML\\ML'