In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
file_url =r'https://raw.githubusercontent.com/sedeba19/Chapter-4/main/data/Dataset_openml_phpZNNasq.csv'

In [3]:
df = pd.read_csv(file_url)

In [4]:
y = df.pop('class')

X_train, X_test, y_train, y_test = train_test_split(df,
                                                    y,
                                                    test_size= 0.3,
                                                    random_state=123)

In [7]:
# Create a model function
def train_rf (X_train, y_train, random_state = 123,
              n_estimators = 10, max_dept = None,
              min_samples_leaf = 1, max_features = 'sqrt'):
    rf_model = RandomForestClassifier(random_state=random_state,
                                      n_estimators=n_estimators,
                                      max_depth=max_dept,
                                      max_features=max_features)
    rf_model.fit(X_train, y_train)
    return rf_model

In [9]:
rf_1 = train_rf(X_train, y_train)
rf_1.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}

In [10]:
# Create a function for prediction
def get_preds(rf_model, X_train, X_test):
    train_preds = rf_model.predict(X_train)
    test_preds = rf_model.predict(X_test)
    return train_preds, test_preds

In [11]:
trn_preds, tst_preds = get_preds(rf_1, X_train, X_test)

In [13]:
# Create a function for accuracy
def print_acc(y_train, y_test, trn_preds, tst_preds):
    train_acc = accuracy_score(y_train, trn_preds)
    test_acc = accuracy_score(y_test, tst_preds)
    print(train_acc)
    print(test_acc)
    return train_acc, test_acc

In [14]:
trn_acc, tst_preds = print_acc(y_train, y_test, trn_preds, tst_preds)

0.998533993036467
0.9012820512820513


In [18]:
# Create a function that will return model trained, predictions, and accuracy scores for training and testing sets
def fit_predict_rf(X_train, X_test, y_train, y_test,
                   random_state = 123, n_estimators = 10,
                   max_depth = None, min_sample_leaf = 1,
                   max_features = 'sqrt'):
    rf_model = train_rf(X_train, y_train,
                        random_state=123,
                        n_estimators=n_estimators,
                        max_dept=max_depth,
                        min_samples_leaf=min_sample_leaf,
                        max_features=max_features)
    train1_preds, test1_preds = get_preds(rf_model, X_train, X_test)
    train_acc, test_acc = print_acc(y_train, y_test, train1_preds, test1_preds)
    return rf_model, train1_preds, test1_preds, train_acc, test_acc    


In [19]:
rf_model_1, trn_preds_1, tst_preds_1, trn_acc_1, tst_acc_1 = fit_predict_rf(X_train, 
                                                                            X_test, 
                                                                            y_train, 
                                                                            y_test, 
                                                                            random_state=123, 
                                                                            n_estimators=20, 
                                                                            max_depth=None, 
                                                                            min_sample_leaf=1, 
                                                                            max_features='sqrt')

1.0
0.9217948717948717


In [21]:
rf_model_2, trn_preds_2, tst_preds_2, \
trn_acc_2, tst_acc_2 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50,
                                      max_depth=None, 
                                      min_sample_leaf=1, 
                                      max_features='sqrt')

1.0
0.9320512820512821


In [22]:
rf_model_3, trn_preds_3, tst_preds_3, \
trn_acc_3, tst_acc_3 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=5, 
                                      min_sample_leaf=1, 
                                      max_features='sqrt')

0.8715411398204141
0.8491452991452991


In [23]:
rf_model_4, trn_preds_4, tst_preds_4, \
trn_acc_4, tst_acc_4 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=10, 
                                      min_sample_leaf=1, 
                                      max_features='sqrt')

0.9844236760124611
0.9290598290598291


In [25]:
rf_model_5, trn_preds_5, tst_preds_5, \
trn_acc_5, tst_acc_5 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=10, 
                                      min_sample_leaf=10, 
                                      max_features='sqrt')

0.9844236760124611
0.9290598290598291


In [26]:
rf_model_6, trn_preds_6, tst_preds_6, \
trn_acc_6, tst_acc_6 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=10, 
                                      min_sample_leaf=50, 
                                      max_features='sqrt')

0.9844236760124611
0.9290598290598291


In [27]:
rf_model_7, trn_preds_7, tst_preds_7, \
trn_acc_7, tst_acc_7 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=10, 
                                      min_sample_leaf=50,
                                      max_features=0.5)

0.9756276342312626
0.9166666666666666


In [28]:
rf_model_8, trn_preds_8, tst_preds_8, \
trn_acc_8, tst_acc_8 = fit_predict_rf(X_train, X_test, y_train,
                                      y_test, random_state=888,
                                      n_estimators=50, 
                                      max_depth=10, 
                                      min_sample_leaf=50, 
                                      max_features=0.3)

0.98094190947407
0.9205128205128205
