# Complete Notebook
## Module Loader

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()


## Analysis (General)
### Dataloader

In [2]:
path = r"data\train.csv"
X_raw = pd.read_csv(path, index_col="Id")
X_test = pd.read_csv(path, index_col ="Id")


### General analysis

In [174]:
X = X_raw.copy()
X.dropna(subset=["SalePrice"], axis=0, inplace=True)
X.dropna(axis=1, inplace = True)
y = X.SalePrice
X.drop("SalePrice", axis=1, inplace=True)

print(X.head())
print(X.info())
print(X.describe())

col_obj = X.select_dtypes(include=["object", "bool"]).columns
col_obj_above_10 = list([col for col in col_obj if X[col].nunique() > 15])
col_obj_below_10 = list(set(col_obj)-set(col_obj_above_10))
col_num = X.select_dtypes(exclude=["object", "bool"]).columns.values
col_with_nan = [col for col in X.columns if X[col].isna().any()]


print("Summary:\n")
print("Number of columns: {}".format(len(X.columns)))
print("Number of object columns: {}".format(len(col_obj)))
print("Number of columns with NaNs: {}".format(len(col_with_nan)))
print("Number of object columns with NaNs: {}".format(len(set(col_with_nan)-set(col_num))))
print("Number of numeric columns with NaNs: {}".format(len(set(col_with_nan)-set(col_obj))))


MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
Id                                                                       
1           60       RL     8450   Pave      Reg         Lvl    AllPub   
2           20       RL     9600   Pave      Reg         Lvl    AllPub   
3           60       RL    11250   Pave      IR1         Lvl    AllPub   
4           70       RL     9550   Pave      IR1         Lvl    AllPub   
5           60       RL    14260   Pave      IR1         Lvl    AllPub   

   LotConfig LandSlope Neighborhood  ... OpenPorchSF EnclosedPorch 3SsnPorch  \
Id                                   ...                                       
1     Inside       Gtl      CollgCr  ...          61             0         0   
2        FR2       Gtl      Veenker  ...           0             0         0   
3     Inside       Gtl      CollgCr  ...          42             0         0   
4     Corner       Gtl      Crawfor  ...          35           272         0   
5    

### Further Analysis

## Pipeline and predictor

In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lars, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error



### Building the pipeline

In [145]:
res= {}

In [175]:
reg = XGBRegressor(early_stopping_counts=50, evals=([X_valid, y_valid]), eval_metric="mae", cv=True, seed=0 )
#reg = ElasticNet()

X_train, X_valid, y_train, y_valid = train_test_split( X, y, train_size = 0.8, test_size=0.2, random_state=0)



#numeric_transformer = Pipeline(["imputer",SimpleImputer(strategy="median")])



#raising value error due to lack in synchronization caused by OneHotEncoder and OrdinalEncoder
preprocessing = ColumnTransformer(transformers=[("numeric", SimpleImputer(strategy = "median"), col_num) ,
                                            ("obj_below_10", Pipeline([("obj_imputer1", SimpleImputer(strategy = "most_frequent")),
                                                                        ("OHE"            , OneHotEncoder(handle_unknown='ignore'
                                                                                                            ))
                                                                        ]), col_obj_below_10) ,
                                            ("Neighborhood_", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Neighborhood"].unique())])
                                                                                                            )
                                                                        ]), ["Neighborhood"]),
                                            ("Exterior_2nd", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Exterior2nd"].unique())]))
                                                                        ]), ["Exterior2nd"])], remainder='drop')


my_pipe = Pipeline([("preprocessing",preprocessing),
                     #("pca", PCA(n_components = 35)),
                     ("std_scaler", StandardScaler()),
                     ("reg",reg)])


param_grid = {  #"preprocessing__numeric__strategy":["median", "mean"],
                #"pca__n_components": np.linspace(25, 65, 5, dtype="int"),
                "reg__n_estimators": [100],
                "reg__learning_rate": [0.1],
                "reg__max_depth": [4],
                "reg__min_child_weight":[1.1],
                "reg__gamma":[0],
                "reg__subsample":[0.8367346938775511],
                "reg__colsample_bytree": [0.5510204081632653],
                "reg__alpha":[0.11],
                
                }
cv = 3

search = GridSearchCV(my_pipe, param_grid=param_grid, scoring="neg_mean_absolute_error", verbose=10, n_jobs=-2, cv = cv, return_train_score=True)
search.fit(X_train,y_train)
print(abs(search.best_score_))
search_pred = search.predict(X_valid)
search_score = mean_absolute_error(y_valid, search_pred)
print(search.best_params_)

res[search.best_score_] = search.best_params_

test_preds = search.predict(X_test)
output = pd.DataFrame({"Id": X_test.index,
                      "SalePrice": test_preds})
output.to_csv("submission_other.csv", index=False)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   3 out of   3 | elapsed:    0.6s finished
15657.596538549084
{'reg__alpha': 0.11, 'reg__colsample_bytree': 0.5510204081632653, 'reg__gamma': 0, 'reg__learning_rate': 0.1, 'reg__max_depth': 4, 'reg__min_child_weight': 1.1, 'reg__n_estimators': 100, 'reg__subsample': 0.8367346938775511}



Choose a relatively high learning rate. Generally a learning rate of 0.1 works but somewhere between 0.05 to 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called as “cv” which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.
Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.
Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
Lower the learning rate and decide the optimal parameters .


In [128]:
print(np.linspace(0.1, 2, 10))

[0.1        0.31111111 0.52222222 0.73333333 0.94444444 1.15555556
 1.36666667 1.57777778 1.78888889 2.        ]


In [None]:
from sklearn.compose import Tar

#reg = XGBRegressor(early_stopping_counts=5, eval_stop=([X_valid, y_valid]) )
reg = ElasticNet()

X_train, X_valid, y_train, y_valid = train_test_split( X, y, train_size = 0.8, test_size=0.2, random_state=0)



#numeric_transformer = Pipeline(["imputer",SimpleImputer(strategy="median")])



#raising value error due to lack in synchronization caused by OneHotEncoder and OrdinalEncoder
preprocessing = ColumnTransformer(transformers=[("numeric", SimpleImputer(strategy = "median"), col_num) ,
                                            ("obj_below_10", Pipeline([("obj_imputer1", SimpleImputer(strategy = "most_frequent")),
                                                                        ("OHE"            , OneHotEncoder(handle_unknown='ignore'
                                                                                                            ))
                                                                        ]), col_obj_below_10) ,
                                            ("Neighborhood_", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Neighborhood"].unique())])
                                                                                                            )
                                                                        ]), ["Neighborhood"]),
                                            ("Exterior_2nd", Pipeline([("obj_imputer2", SimpleImputer(strategy = "most_frequent")),
                                                                        ("label_encoder"  , OrdinalEncoder(categories=[list(X["Exterior2nd"].unique())]))
                                                                        ]), ["Exterior2nd"])], remainder='drop')


my_pipe = Pipeline([("preprocessing",preprocessing),
                     ("pca", PCA(n_components = 42)),
                     ("reg",reg)])


param_grid = {  #"preprocessing__numeric__strategy":["median", "mean"],
                "pca__n_components": np.linspace(25, 65, 10, dtype="int"),
                #"reg__n_estimators": np.linspace(100, 1000, 20, dtype="int"),
                "reg__alpha":np.linspace(.01,1.5,50),
                "reg__l1_ratio": np.linspace(.2, .8, 10)
                }
cv = 5

search = GridSearchCV(my_pipe, param_grid=param_grid, scoring="neg_mean_absolute_error", verbose=5, n_jobs=-2, cv = cv)
search.fit(X_train,y_train)
print(abs(search.best_score_))
search_pred = search.predict(X_valid)
search_score = mean_absolute_error(y_valid, search_pred)
print(search.best_params_)

reg = ElasticNet(alpha=.253265, l1_ratio=0.66666666)
my_pipe = Pipeline([("preprocessing",preprocessing),
                     ("pca", PCA(n_components = 42)),
                     ("reg",reg)])
my_pipe.fit(X_train, y_train)
y_pred = my_pipe.predict(X_valid)
score = mean_absolute_error(y_valid, y_pred)
print(score)

In [55]:
rfr, lars, elnet, svr, ada, mlpr, xgbr = RandomForestRegressor(), LassoLars(), ElasticNet(), SVR(), AdaBoostRegressor(), MLPRegressor(), XGBRegressor()

model_instances = [rfr, lars, elnet, svr, ada, mlpr, xgbr]
rfr_list =["n_estimators"]
rfr_list.append(np.linspace(10, 200, 10, dtype="int"))
lars_list = ["alpha"]
lars_list.append(np.linspace(.01, .2, 10))
elnet = 

models = {}
grids = []
print(rfr_list)

SyntaxError: invalid syntax (<ipython-input-55-a0b1e13b0fb7>, line 8)