In [206]:
#gradient boosting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import RandomizedSearchCV

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [11]:
# Load the mushroom dataset; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv("mushrooms.csv")

In [13]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [80]:
# Separate the predictors (every column except 'class') from the 'class' target.
X = df.drop(columns='class')
y = df['class']

In [98]:
# One-hot encode the predictors (dropping the first level of every column)
# and cast the resulting indicator columns to integers in one chained step.
X = pd.get_dummies(X, drop_first=True).astype(int)
X.head()

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [100]:
# Encode the target the same way as the predictors. With the first level
# dropped, what remains is a one-column integer frame named 'p'.
y = pd.get_dummies(y, drop_first=True).astype(int)
y.head()

Unnamed: 0,p
0,1
1,0
2,0
3,1
4,0


In [102]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [54]:
# Disabled draft: ColumnTransformer with 2 transformers (one-hot encoding and
# standard scaling). Not used; the features are encoded with pd.get_dummies instead.
#num_features = X.select_dtypes(exclude="object").columns
#onehot_columns = ['','','']

#numeric_transformer = StandardScaler()
#oh_transformer = OneHotEncoder(drop='first')

#preprocessor = ColumnTransformer(
#    [
#        ("OneHotEncoder", oh_transformer, onehot_columns),
#        ("StandardScaler", numeric_transformer, num_features)
#    ], remainder='passthrough'
#)

In [106]:
# Shared seed for the stochastic estimators so repeated runs give the same
# scores; 101 matches the random_state used for the train/test split.
RANDOM_STATE = 101

# Candidate models to compare, keyed by the display name used in the results table.
# Estimators with internal randomness (trees, ensembles, boosting) are seeded;
# the linear and nearest-neighbour models are deterministic with default settings.
models = {
    'Linear Regression': LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Gradient BoostRegressor": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "Adaboost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Xgboost Regressor": XGBRegressor(random_state=RANDOM_STATE)
}

In [153]:
def model_eval(y_actual, y_pred):
    """Score predictions against the true targets with three regression metrics.

    Parameters
    ----------
    y_actual : array-like
        True target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R² score.
    """
    squared_error = mean_squared_error(y_actual, y_pred)
    return (
        mean_absolute_error(y_actual, y_pred),
        np.sqrt(squared_error),
        r2_score(y_actual, y_pred),
    )

In [185]:
def _flatten_single_column(y):
    """Return ``y`` as a 1-D array when it is a single-column 2-D target, else unchanged."""
    shape = np.shape(y)
    if len(shape) == 2 and shape[1] == 1:
        return np.ravel(y)
    return y


def model_run(models, X_train, X_test, y_train, y_test):
    """Fit every model, score it on the train and test sets, and tabulate the metrics.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, X_test : array-like
        Feature matrices for fitting and for held-out evaluation.
    y_train, y_test : array-like
        Matching targets. A single-column 2-D target (for example a one-column
        DataFrame) is flattened to 1-D first, so estimators receive the 1-D
        target they expect instead of a column vector.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Train MAE', 'Train RMSE',
        'Train R²', 'Test MAE', 'Test RMSE' and 'Test R²'. The table is also
        printed before it is returned.
    """
    y_train = _flatten_single_column(y_train)
    y_test = _flatten_single_column(y_test)

    results = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)

        # Predict on both splits so train and test scores can be compared
        # side by side.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_mae, train_rmse, train_r2 = model_eval(y_train, y_train_pred)
        test_mae, test_rmse, test_r2 = model_eval(y_test, y_test_pred)

        # Collect one record per model; the frame is built once after the loop.
        results.append({
            'Model': model_name,
            'Train MAE': train_mae,
            'Train RMSE': train_rmse,
            'Train R²': train_r2,
            'Test MAE': test_mae,
            'Test RMSE': test_rmse,
            'Test R²': test_r2
        })

    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df

In [187]:
# Fit and score every candidate model; model_run prints the metrics table
# and the returned frame is kept for later inspection.
results_df = model_run(models, X_train, X_test, y_train, y_test)

                     Model     Train MAE    Train RMSE  Train R²  \
0        Linear Regression  2.787048e-15  3.709045e-15  1.000000   
1                    Lasso  4.994877e-01  4.997438e-01  0.000000   
2                    Ridge  6.415499e-03  1.799907e-02  0.998703   
3    K-Neighbors Regressor  3.517411e-05  2.652324e-03  0.999972   
4            Decision Tree  0.000000e+00  0.000000e+00  1.000000   
5  Gradient BoostRegressor  9.716098e-03  3.348299e-02  0.995511   
6  Random Forest Regressor  7.562434e-05  2.765930e-03  0.999969   
7       Adaboost Regressor  1.478256e-02  2.675549e-02  0.997134   
8        Xgboost Regressor  3.129391e-05  9.208779e-05  1.000000   

       Test MAE     Test RMSE   Test R²  
0  2.764310e-15  3.696627e-15  1.000000  
1  4.992779e-01  4.995338e-01 -0.000172  
2  7.067483e-03  2.205690e-02  0.998050  
3  8.203445e-05  4.050542e-03  0.999934  
4  0.000000e+00  0.000000e+00  1.000000  
5  1.047151e-02  3.882850e-02  0.993957  
6  1.476620e-04  4.324795

In [208]:
#pick best performing models and select tuning hyperparameters
randomcv_models = [
                   ("RF", RandomForestRegressor(), rf_params),
                   ("XGboost",XGBRegressor(),xgboost_params)
                   
                   ]

In [210]:
#select tuning hyperparameters
rf_params = {"max_depth": [5, 10, 20, None],
             "max_features": ['log2', 10, 20, "auto"],
             "min_samples_split": [2, 5, 10, 20],
             "n_estimators": [100, 250, 500, 1000]}

xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20, 30],
                  "n_estimators": [100, 200, 300],
                  "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4]}

In [218]:
# Flatten the targets to 1-D once, before the loop: the conversion does not
# depend on the model being tuned.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Run a randomized hyperparameter search for every (name, estimator, space)
# triple and remember the best parameter set found for each model.
model_param = {}
for name, model, params in randomcv_models:
    search = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=1,
                                n_jobs=-1,
                                # Seed the candidate sampling so the same
                                # parameter sets are tried on every run.
                                random_state=101)
    search.fit(X_train, y_train)
    model_param[name] = search.best_params_

for model_name, best_params in model_param.items():
    print(f"Best Parameters for {model_name} :")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Parameters for RF :
{'n_estimators': 500, 'min_samples_split': 5, 'max_features': 20, 'max_depth': None}
Best Parameters for XGboost :
{'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.1, 'colsample_bytree': 0.3}


In [None]:
#retrain models with best parameters