In [3]:
import csv
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier


def evaluate_model(dt_classifier, x, y, l, d):
    """Print accuracy, confusion matrix and per-class F1 for a classifier.

    Args:
        dt_classifier: estimator whose ``predict`` method is called on ``x``.
        x: feature matrix handed to ``predict``.
        y: true labels the predictions are scored against.
        l: classifier label echoed in the printed messages.
        d: dataset label echoed in the printed messages.

    Returns:
        None. All results are written to stdout.
    """
    predictions = dt_classifier.predict(x)

    accuracy = accuracy_score(y, predictions)
    print("Test Accuracy for Dataset:", d, "and", l, "is", accuracy)

    print("Test Confusion Matrix:")
    matrix = confusion_matrix(y, predictions)
    print(matrix)

    # average=None asks for one F1 value per class rather than a single aggregate.
    per_class_f1 = f1_score(y, predictions, average=None)
    print("F1 Score for Dataset:", d, "and", l, "is", per_class_f1)


# Folder holding the train/valid/test CSVs -- add path to the datasets here.
DATA_DIR = r'/content/drive/MyDrive/hw3_part1_data/all_data/'
# The CSVs are read without a header row; column 500 is used as the class label.
LABEL_COLUMN = 500
# Single seed for every stochastic step so Restart & Run All reproduces the output.
SEED = 42

# For every dataset (file-name suffixes i and j) and every classifier family:
#   1. grid-search hyper-parameters with cross-validation on the validation split,
#   2. refit the best estimator on train + validation,
#   3. report accuracy / confusion matrix / F1 on the test split.
for i in ['_c300', '_c500', '_c1000', '_c1500', '_c1800']:
    for j in ['_d100', '_d1000', '_d5000']:
        # The three splits depend only on (i, j), so load them once per dataset
        # instead of once per classifier.
        read_train = pd.read_csv(DATA_DIR + 'train' + i + j + '.csv', header=None)
        read_valid = pd.read_csv(DATA_DIR + 'valid' + i + j + '.csv', header=None)
        read_test = pd.read_csv(DATA_DIR + 'test' + i + j + '.csv', header=None)

        # Seeded shuffle: the integer `cv` folds below are cut from the row order,
        # so an unseeded shuffle would change the selected model between runs.
        read_valid = shuffle(read_valid, random_state=SEED)
        X_valid = read_valid.drop(LABEL_COLUMN, axis=1)
        Y_valid = read_valid[LABEL_COLUMN]

        X_train = read_train.drop(LABEL_COLUMN, axis=1)
        Y_train = read_train[LABEL_COLUMN]
        # Final training set is train + validation stacked row-wise.
        X = pd.concat([X_train, X_valid])
        Y = pd.concat([Y_train, Y_valid])

        X_test = read_test.drop(LABEL_COLUMN, axis=1)
        Y_test = read_test[LABEL_COLUMN]

        for l in ['tree', 'bagging', 'randomForest', 'gradientBoosting']:
            grid_search = None
            params = None
            dt_best = None

            if l == 'tree':
                params = {
                    'max_depth': [2, 3, 5, 10, 20, 40, 80, 200],
                    'min_samples_leaf': [5, 10, 20, 50, 100],
                    'criterion': ["gini", "entropy"],
                    'max_features': [2, 3, 200, 300, 400, 500],
                    'splitter': ["best", "random"]
                }
                dt = DecisionTreeClassifier(random_state=SEED)
                grid_search = GridSearchCV(estimator=dt, param_grid=params, cv=4, n_jobs=-1, scoring="accuracy")

            elif l == 'bagging':
                params = {
                    'bootstrap': [True, False],
                    'n_estimators': [10, 35, 80, 280],
                    'base_estimator__max_depth': [2, 3, 20],
                    'base_estimator__min_samples_leaf': [5, 20, 50],
                    'base_estimator__max_features': [2, 3, 50, 300],
                }
                # Build the base tree here rather than relying on the `dt` left
                # behind by the 'tree' iteration.
                dt = DecisionTreeClassifier(random_state=SEED)
                # NOTE(review): scikit-learn >= 1.2 renames `base_estimator` to
                # `estimator` (grid keys become `estimator__...`); kept as-is to
                # match the version this notebook was executed with.
                grid_search = GridSearchCV(BaggingClassifier(base_estimator=dt, random_state=SEED), param_grid=params, cv=3, scoring="accuracy")

            elif l == 'randomForest':
                params = {
                    'bootstrap': [True],
                    'max_depth': [5, 10, 30, 90, 110],
                    'max_features': [2, 3, 200, 300],
                    'n_estimators': [100, 200]
                }
                rf = RandomForestClassifier(random_state=SEED)
                grid_search = GridSearchCV(estimator=rf, param_grid=params, cv=5, scoring="accuracy")

            elif l == 'gradientBoosting':
                # Seeded like the other estimators so results are repeatable.
                gbc = GradientBoostingClassifier(random_state=SEED)
                params = {
                    "n_estimators": [5, 20, 50, 250],
                    "max_depth": [1, 3, 5, 7, 9],
                    "learning_rate": [0.01, 0.1, 1, 10, 20]
                }
                grid_search = GridSearchCV(estimator=gbc, param_grid=params, cv=5, scoring="accuracy")

            # Hyper-parameters are selected by cross-validation on the validation split only.
            grid_search.fit(X_valid, Y_valid)

            dt_best = grid_search.best_estimator_
            print("Best Estimator", dt_best)

            # Refit the winning configuration on train + validation before testing.
            dt_best.fit(X, Y)

            evaluate_model(dt_best, X_test, Y_test, l, 'test' + i + j)
            print("\n")
        print("\n")

Best Estimator RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Test Accuracy for Dataset: test_c1800_d5000 and randomForest is 1.0
Test Confusion Matrix:
[[5000    0]
 [   0 5000]]
F1 Score for Dataset: test_c1800_d5000 and randomForest is [1. 1.]


Best Estimator GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                        

In [None]:
from sklearn.ensemble import BaggingClassifier
# Hyper-parameter grid for a bagging ensemble. The `base_estimator__bootstrap`
# and `base_estimator__n_estimators` keys only exist on a forest-style base
# learner, so the base estimator below is a RandomForestClassifier.
param_grid = {
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
    'n_estimators': [5, 10, 15, 20],
    'max_samples': [0.6, 0.8, 1.0],
    'base_estimator__bootstrap': [True, False],
    'base_estimator__n_estimators': [100, 200, 300],
    'base_estimator__max_features': [0.6, 0.8, 1.0]
}

# Both estimators are seeded so repeated runs select the same model.
# NOTE(review): scikit-learn >= 1.2 renames `base_estimator` to `estimator`
# (grid keys become `estimator__...`); kept as-is to match the first cell.
# NOTE(review): this grid has 864 combinations x 5 folds of bagged forests --
# expect a very long run time.
grid_search_bagging = GridSearchCV(
    BaggingClassifier(base_estimator=RandomForestClassifier(random_state=42), random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)

# Folder holding the train/valid/test CSVs.
data_dir = r'/content/drive/MyDrive/hw3_part1_data/all_data/'

for i in ['300', '500', '1000', '1500', '1800']:
    for j in ['_d100', '_d1000', '_d5000']:
        # --- 1. tune on the validation split -------------------------------
        print(i, j, 'valid_c')
        valid_frame = pd.read_csv(data_dir + 'valid_c' + i + j + '.csv', header=None)
        # Seeded shuffle keeps the cross-validation folds identical across runs.
        valid_frame = shuffle(valid_frame, random_state=42)
        # Column 500 is used as the label; the remaining columns are the features.
        X_valid = valid_frame.drop(500, axis=1)
        Y_valid = valid_frame[500]

        # Fit the bagging search defined above (not the `grid_search` object
        # left in the kernel by the previous cell).
        grid_search_bagging.fit(X_valid, Y_valid)
        # Full cross-validation table of the latest search, kept for inspection.
        score_df = pd.DataFrame(grid_search_bagging.cv_results_)
        dt_best = grid_search_bagging.best_estimator_
        print("Best Estimator", dt_best)

        # --- 2. refit the best configuration on train + validation ---------
        train_frame = pd.read_csv(data_dir + 'train_c' + i + j + '.csv', header=None)
        X_train = train_frame.drop(500, axis=1)
        Y_train = train_frame[500]
        X = pd.concat([X_train, X_valid])
        Y = pd.concat([Y_train, Y_valid])
        dt_best.fit(X, Y)

        # --- 3. report on the test split ------------------------------------
        test_frame = pd.read_csv(data_dir + 'test_c' + i + j + '.csv', header=None)
        X_test = test_frame.drop(500, axis=1)
        Y_test = test_frame[500]
        # evaluate_model requires the classifier and dataset labels for its messages.
        evaluate_model(dt_best, X_test, Y_test, 'bagging', 'test_c' + i + j)
