# Complete Notebook
## Module Loader

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()


## Analysis (General)
### Dataloader

In [2]:
path = r"data\train.csv"
X_raw = pd.read_csv(path, index_col="Id")
X_test = pd.read_csv(path, index_col ="Id")


### General analysis

In [3]:
X = X_raw.copy()
X.dropna(subset=["SalePrice"], axis=0, inplace=True)
#X.dropna(axis=1, inplace = True)
y = X.SalePrice
X.drop("SalePrice", axis=1, inplace=True)

print(X.head())
print(X.info())
print(X.describe())

col_obj = X.select_dtypes(include=["object", "bool"]).columns
col_obj_above_10 = list([col for col in col_obj if X[col].nunique() > 15])
col_obj_below_10 = list(set(col_obj)-set(col_obj_above_10))
col_num = X.select_dtypes(exclude=["object", "bool"]).columns.values
col_with_nan = [col for col in X.columns if X[col].isna().any()]


print("Summary:\n")
print("Number of columns: {}".format(len(X.columns)))
print("Number of object columns: {}".format(len(col_obj)))
print("Number of columns with NaNs: {}".format(len(col_with_nan)))
print("Number of object columns with NaNs: {}".format(len(set(col_with_nan)-set(col_num))))
print("Number of numeric columns with NaNs: {}".format(len(set(col_with_nan)-set(col_obj))))


MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave   NaN      Reg   
2           20       RL         80.0     9600   Pave   NaN      Reg   
3           60       RL         68.0    11250   Pave   NaN      IR1   
4           70       RL         60.0     9550   Pave   NaN      IR1   
5           60       RL         84.0    14260   Pave   NaN      IR1   

   LandContour Utilities LotConfig  ... ScreenPorch PoolArea PoolQC Fence  \
Id                                  ...                                     
1          Lvl    AllPub    Inside  ...           0        0    NaN   NaN   
2          Lvl    AllPub       FR2  ...           0        0    NaN   NaN   
3          Lvl    AllPub    Inside  ...           0        0    NaN   NaN   
4          Lvl    AllPub    Corner  ...           0        0    NaN   NaN   
5          Lvl    AllPub       FR2  ...     

### Further Analysis

## Pipeline and predictor

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lars, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error



### Building the pipeline

In [8]:
res= {}
np.random.seed(1)

In [55]:

#reg = ElasticNet()

X_train, X_valid, y_train, y_valid = train_test_split( X, y, train_size = 0.8, test_size=0.2, random_state=0)
reg = XGBRegressor(evals=([X_valid, y_valid]), eval_metric="mae", cv=True, seed=0 )


#numeric_transformer = Pipeline(["imputer",SimpleImputer(strategy="median")])



#raising value error due to lack in synchronization caused by OneHotEncoder and OrdinalEncoder
preprocessing = ColumnTransformer(transformers=[("numeric", SimpleImputer(strategy = "median"), col_num) ,
                                            ("obj_below_10", Pipeline([("obj_imputer1", SimpleImputer(strategy = "most_frequent")),
                                                                        ("OHE"            , OneHotEncoder(handle_unknown='ignore'
                                                                                                            ))
                                                                        ]), col_obj_below_10) ,
                                            ("Neighborhood_", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Neighborhood"].unique())])
                                                                                                            )
                                                                        ]), ["Neighborhood"]),
                                            ("Exterior_2nd", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Exterior2nd"].unique())]))
                                                                        ]), ["Exterior2nd"])], remainder='drop')


my_pipe = Pipeline([("preprocessing",preprocessing),
                     #("pca", PCA(n_components = 35)),
                     ("std_scaler", StandardScaler()),
                     ("reg",reg)])


param_grid = {  #"preprocessing__numeric__strategy":["median", "mean"],
                #"pca__n_components": np.linspace(25, 65, 5, dtype="int"),
                "reg__early_stopping_counts": [50],
                "reg__n_estimators": [342],
                "reg__learning_rate":[0.04002371183530562],
                "reg__max_depth": [5],
                "reg__min_child_weight": [1.02],
                "reg__gamma":[0],
                "reg__subsample":[0.8264488486588846],
                "reg__colsample_bytree": [0.551988011711525],
                "reg__alpha":[0.107],
                
                }
cv = 5

search = GridSearchCV(my_pipe, param_grid=param_grid, scoring="neg_mean_absolute_error", verbose=10, n_jobs=-2, cv = cv, return_train_score=True)
search.fit(X_train,y_train)
print(abs(search.best_score_))
search_pred = search.predict(X_valid)
search_score = mean_absolute_error(y_valid, search_pred)
print(search.best_params_)

res[search.best_score_] = search.best_params_

test_preds = search.predict(X_test)
output = pd.DataFrame({"Id": X_test.index,
                     "SalePrice": test_preds})
output.to_csv("submission_other.csv", index=False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-2)]: Done  47 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-2)]: Done  58 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-2)]: Done  71 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-2)]: Done  84 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-2)]: Done  98 out of 100 | elapsed:  1.1min remaining:    1.3s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:  1.2min finished
15075.204376601989
{'reg__alpha': 0.107, 'reg__colsample_bytree': 0.551988011711525, 'reg__early_stopping_counts': 50, 'reg__gamma': 0, 'reg__learning_r

In [38]:
subsample= 0.8265340078198108
subsample=0.8264608951160635
subsample_best = 0.8264488486588846
colsample_0 = 0.5519746590351778
colsample_best= 0.551988011711525
n_est = 349
n_est = 343
print(np.random.normal(0.8265340078198108, 0.001, 20))

[0.82613313 0.82735801 0.8259717  0.82848889 0.82520206 0.82477332
 0.82488329 0.82564345 0.82541489 0.82849009 0.82620751 0.82519133
 0.82764839 0.82594748 0.82529715 0.82740985 0.82715737 0.82609905
 0.82794155 0.82666311]


In [41]:
np.random.normal(0.5510204081632653, 0.1,20)

array([0.56164268, 0.39845238, 0.63052302, 0.51357658, 0.56442523,
       0.67122589, 0.57949522, 0.57726715, 0.57867034, 0.47769325,
       0.63462088, 0.70535632, 0.62690097, 0.63951129, 0.46329226,
       0.46424169, 0.40693281, 0.67424572, 0.52560242, 0.6910048 ])


Choose a relatively high learning rate. Generally a learning rate of 0.1 works but somewhere between 0.05 to 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called as “cv” which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.
Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.
Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
Lower the learning rate and decide the optimal parameters .


from sklearn.compose import Tar

#reg = XGBRegressor(early_stopping_counts=5, eval_stop=([X_valid, y_valid]) )
reg = ElasticNet()

X_train, X_valid, y_train, y_valid = train_test_split( X, y, train_size = 0.8, test_size=0.2, random_state=0)



#numeric_transformer = Pipeline(["imputer",SimpleImputer(strategy="median")])



#raising value error due to lack in synchronization caused by OneHotEncoder and OrdinalEncoder
preprocessing = ColumnTransformer(transformers=[("numeric", SimpleImputer(strategy = "median"), col_num) ,
                                            ("obj_below_10", Pipeline([("obj_imputer1", SimpleImputer(strategy = "most_frequent")),
                                                                        ("OHE"            , OneHotEncoder(handle_unknown='ignore'
                                                                                                            ))
                                                                        ]), col_obj_below_10) ,
                                            ("Neighborhood_", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Neighborhood"].unique())])
                                                                                                            )
                                                                        ]), ["Neighborhood"]),
                                            ("Exterior_2nd", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Exterior2nd"].unique())]))
                                                                        ]), ["Exterior2nd"])], remainder='drop')


my_pipe = Pipeline([("preprocessing",preprocessing),
                     ("pca", PCA(n_components = 42)),
                     ("reg",reg)])


param_grid = {  #"preprocessing__numeric__strategy":["median", "mean"],
                "pca__n_components": np.linspace(25, 65, 10, dtype="int"),
                #"reg__n_estimators": np.linspace(100, 1000, 20, dtype="int"),
                "reg__alpha":np.linspace(.01,1.5,50),
                "reg__l1_ratio": np.linspace(.2, .8, 10)
                }
cv = 5

search = GridSearchCV(my_pipe, param_grid=param_grid, scoring="neg_mean_absolute_error", verbose=5, n_jobs=-2, cv = cv)
search.fit(X_train,y_train)
print(abs(search.best_score_))
search_pred = search.predict(X_valid)
search_score = mean_absolute_error(y_valid, search_pred)
print(search.best_params_)

reg = ElasticNet(alpha=.253265, l1_ratio=0.66666666)
my_pipe = Pipeline([("preprocessing",preprocessing),
                     ("pca", PCA(n_components = 42)),
                     ("reg",reg)])
my_pipe.fit(X_train, y_train)
y_pred = my_pipe.predict(X_valid)
score = mean_absolute_error(y_valid, y_pred)
print(score)

rfr, lars, elnet, svr, ada, mlpr, xgbr = RandomForestRegressor(), LassoLars(), ElasticNet(), SVR(), AdaBoostRegressor(), MLPRegressor(), XGBRegressor()

model_instances = [rfr, lars, elnet, svr, ada, mlpr, xgbr]
rfr_list =["n_estimators"]
rfr_list.append(np.linspace(10, 200, 10, dtype="int"))
lars_list = ["alpha"]
lars_list.append(np.linspace(.01, .2, 10))
elnet = 

models = {}
grids = []
print(rfr_list)