# Importing required libraries.

In [101]:
from os import listdir
import pandas as pd
import math
from time import time
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Preprocessing CSV dataset to dictionaries of dfs.

In [8]:
filedir = "/content/drive/MyDrive/all_data/"
datasets = defaultdict(lambda: defaultdict(lambda: defaultdict()))
for filename in listdir(filedir):
  typ, clauses, size = filename.split('_')
  datasets[int(clauses[1:])][int(size[1:-4])][typ] = pd.read_csv(filedir+"/"+filename, sep = ',', header = None)

# Decision Tree Classifier

In [123]:
for num_clauses in [300, 500, 1000, 1500, 1800]:
  for data_size in [100, 1000, 5000]:
    st = time()
    trainX, trainy = datasets[num_clauses][data_size]['train'].iloc[:,:-1], datasets[num_clauses][data_size]['train'].iloc[:,-1]
    validX, validy = datasets[num_clauses][data_size]['valid'].iloc[:,:-1], datasets[num_clauses][data_size]['valid'].iloc[:,-1]
    
    print('===========================================================================')
    print('c',num_clauses,'d',data_size)
    print('===========================================================================')

    print("Finding optimal parameters:")
    #Optimizing on depth
    store_accs2 = []
    with tqdm(total=3, position=0, leave=True) as pbar:
      for attr in ['sqrt', 'log2', None]:
        for try_reduce_depth in range(data_size//50, int(data_size//2),data_size//50):
          clf = DecisionTreeClassifier(random_state = 0, max_features=attr, min_samples_leaf=try_reduce_depth).fit(trainX, trainy)
          tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
          store_accs2.append((attr, try_reduce_depth, val_acc))
        pbar.update()
    optimal_num_feat, optimal_depth_red, val_acc_best= max(store_accs2, key = lambda x: x[2])
    print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_depth_red, "min_samples_leaf")

    #Combined train and validation dataset
    fin_trainX, fin_trainy = pd.concat([trainX, validX], axis = 0), pd.concat([trainy, validy], axis = 0)
    clf = DecisionTreeClassifier(random_state = 0, max_features=optimal_num_feat, min_samples_leaf=optimal_depth_red).fit(fin_trainX, fin_trainy)

    testX, testy = datasets[num_clauses][data_size]['test'].iloc[:,:-1], datasets[num_clauses][data_size]['test'].iloc[:,-1]
    ypred = clf.predict(testX)
    test_acc = clf.score(testX, testy) 
    precision, recall, f1, _ = precision_recall_fscore_support(testy, ypred, average = 'macro')
    print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
    print("Total Time Taken:", time()-st,"\n")

c 300 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:01<00:00,  2.37it/s]



Validation Accuracy: 0.65 For None max_features and 12 min_samples_leaf

On test dataset:
 Accuracy: 0.63 
 Precision: 0.6302083333333333 
 Recall: 0.63 
 F1: 0.6298519407763106 

Total Time Taken: 1.3187751770019531 

c 300 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.25it/s]



Validation Accuracy: 0.6485 For None max_features and 80 min_samples_leaf

On test dataset:
 Accuracy: 0.647 
 Precision: 0.6480446027167694 
 Recall: 0.647 
 F1: 0.6463762076302597 

Total Time Taken: 2.601274251937866 

c 300 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:08<00:00,  2.97s/it]



Validation Accuracy: 0.7179 For None max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.7404 
 Precision: 0.7481932276368817 
 Recall: 0.7404 
 F1: 0.7383460267763534 

Total Time Taken: 10.283243894577026 

c 500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]



Validation Accuracy: 0.66 For log2 max_features and 6 min_samples_leaf

On test dataset:
 Accuracy: 0.575 
 Precision: 0.575007500750075 
 Recall: 0.575 
 F1: 0.5749893747343684 

Total Time Taken: 1.2714288234710693 

c 500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.25it/s]



Validation Accuracy: 0.7055 For None max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.699 
 Precision: 0.7004141220451505 
 Recall: 0.6990000000000001 
 F1: 0.6984680977243858 

Total Time Taken: 2.664541482925415 

c 500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:09<00:00,  3.05s/it]



Validation Accuracy: 0.7284 For None max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.7454 
 Precision: 0.7455559378427676 
 Recall: 0.7454000000000001 
 F1: 0.7453595732858549 

Total Time Taken: 10.5201416015625 

c 1000 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:01<00:00,  2.49it/s]



Validation Accuracy: 0.765 For sqrt max_features and 2 min_samples_leaf

On test dataset:
 Accuracy: 0.725 
 Precision: 0.72886786695148 
 Recall: 0.7250000000000001 
 F1: 0.7238331952499311 

Total Time Taken: 1.2478928565979004 

c 1000 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.22it/s]



Validation Accuracy: 0.7975 For None max_features and 40 min_samples_leaf

On test dataset:
 Accuracy: 0.7905 
 Precision: 0.7954160179547856 
 Recall: 0.7905 
 F1: 0.7896247865181121 

Total Time Taken: 2.684218406677246 

c 1000 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:09<00:00,  3.17s/it]



Validation Accuracy: 0.8064 For None max_features and 200 min_samples_leaf

On test dataset:
 Accuracy: 0.8162 
 Precision: 0.816548912873726 
 Recall: 0.8162 
 F1: 0.8161493381116101 

Total Time Taken: 10.63574767112732 

c 1500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:01<00:00,  2.48it/s]



Validation Accuracy: 0.865 For None max_features and 12 min_samples_leaf

On test dataset:
 Accuracy: 0.86 
 Precision: 0.8613006824568447 
 Recall: 0.86 
 F1: 0.8598738864978481 

Total Time Taken: 1.2580153942108154 

c 1500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.20it/s]



Validation Accuracy: 0.907 For None max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.929 
 Precision: 0.9290617848970252 
 Recall: 0.929 
 F1: 0.9289974439079808 

Total Time Taken: 2.7387776374816895 

c 1500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:09<00:00,  3.23s/it]



Validation Accuracy: 0.9171 For None max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.9394 
 Precision: 0.9394034449230082 
 Recall: 0.9394 
 F1: 0.9393998812237672 

Total Time Taken: 10.856064319610596 

c 1800 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:01<00:00,  2.50it/s]



Validation Accuracy: 0.98 For None max_features and 6 min_samples_leaf

On test dataset:
 Accuracy: 0.94 
 Precision: 0.9401760704281713 
 Recall: 0.94 
 F1: 0.93999399939994 

Total Time Taken: 1.248870849609375 

c 1800 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.24it/s]



Validation Accuracy: 0.962 For None max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.9595 
 Precision: 0.961105106877039 
 Recall: 0.9595 
 F1: 0.9594647241762143 

Total Time Taken: 2.6088056564331055 

c 1800 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:09<00:00,  3.15s/it]



Validation Accuracy: 0.9694 For None max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.9681 
 Precision: 0.9685502768160201 
 Recall: 0.9681 
 F1: 0.9680923341832876 

Total Time Taken: 10.467654943466187 



# Bagging Classifier

In [122]:
for num_clauses in [300, 500, 1000, 1500, 1800]:
  for data_size in [100, 1000, 5000]:
    st = time()
    trainX, trainy = datasets[num_clauses][data_size]['train'].iloc[:,:-1], datasets[num_clauses][data_size]['train'].iloc[:,-1]
    validX, validy = datasets[num_clauses][data_size]['valid'].iloc[:,:-1], datasets[num_clauses][data_size]['valid'].iloc[:,-1] 
    
    print('===========================================================================')
    print('c',num_clauses,'d',data_size)
    print('===========================================================================')

    print("Finding optimal parameters:")
    #Optimizing on depth
    store_accs2 = []
    with tqdm(total=3, position=0, leave=True) as pbar:
      for name_try, attr in [('None', trainX.shape[1]),('sqrt', int(math.sqrt(trainX.shape[1]))), ('log2', int(math.log2(trainX.shape[1])))]:
        for try_estimator in range(15, 30  ,5):
          clf = BaggingClassifier(random_state = 0, estimator=DecisionTreeClassifier(), max_features=attr, n_estimators=try_estimator).fit(trainX, trainy)
          tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
          store_accs2.append((attr, try_estimator, val_acc))
        pbar.update()
    optimal_num_feat, optimal_est_n, val_acc_best= max(store_accs2, key = lambda x: x[2])
    print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_est_n, "n_estimators")

    #Combined train and validation dataset
    fin_trainX, fin_trainy = pd.concat([trainX, validX], axis = 0), pd.concat([trainy, validy], axis = 0)
    clf = BaggingClassifier(random_state = 0, estimator=DecisionTreeClassifier(), max_features=optimal_num_feat, n_estimators=optimal_est_n).fit(fin_trainX, fin_trainy)

    testX, testy = datasets[num_clauses][data_size]['test'].iloc[:,:-1], datasets[num_clauses][data_size]['test'].iloc[:,-1]
    ypred = clf.predict(testX)
    test_acc = clf.score(testX, testy) 
    precision, recall, f1, _ = precision_recall_fscore_support(testy, ypred, average = 'macro')
    print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
    print("Total Time Taken:", time()-st,"\n")

c 300 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:02<00:00,  1.37it/s]



Validation Accuracy: 0.63 For 500 max_features and 15 n_estimators

On test dataset:
 Accuracy: 0.675 
 Precision: 0.675017501750175 
 Recall: 0.675 
 F1: 0.67499187479687 

Total Time Taken: 2.5724146366119385 

c 300 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:06<00:00,  2.08s/it]



Validation Accuracy: 0.804 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.8225 
 Precision: 0.8226422852477943 
 Recall: 0.8225 
 F1: 0.8224804284672386 

Total Time Taken: 10.36277461051941 

c 300 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:38<00:00, 12.77s/it]



Validation Accuracy: 0.8726 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.8972 
 Precision: 0.9005313958531247 
 Recall: 0.8972 
 F1: 0.8969857963855723 

Total Time Taken: 69.6938943862915 

c 500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:00<00:00,  3.53it/s]



Validation Accuracy: 0.765 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.795 
 Precision: 0.7957393483709273 
 Recall: 0.7949999999999999 
 F1: 0.7948717948717949 

Total Time Taken: 1.2179772853851318 

c 500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:06<00:00,  2.24s/it]



Validation Accuracy: 0.839 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.8615 
 Precision: 0.8618477356739827 
 Recall: 0.8614999999999999 
 F1: 0.8614667173788503 

Total Time Taken: 10.801147937774658 

c 500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:36<00:00, 12.31s/it]



Validation Accuracy: 0.8846 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.9147 
 Precision: 0.9151521006375943 
 Recall: 0.9147000000000001 
 F1: 0.914676770750837 

Total Time Taken: 68.90495777130127 

c 1000 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:00<00:00,  3.64it/s]



Validation Accuracy: 0.84 For 500 max_features and 20 n_estimators

On test dataset:
 Accuracy: 0.875 
 Precision: 0.8759398496240601 
 Recall: 0.875 
 F1: 0.8749218261413383 

Total Time Taken: 1.1150665283203125 

c 1000 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:04<00:00,  1.62s/it]



Validation Accuracy: 0.917 For 500 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.9245 
 Precision: 0.925214786055359 
 Recall: 0.9245 
 F1: 0.9244682577853343 

Total Time Taken: 9.853615760803223 

c 1000 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:32<00:00, 10.75s/it]



Validation Accuracy: 0.9452 For 8 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.955 
 Precision: 0.9550612330395178 
 Recall: 0.9550000000000001 
 F1: 0.954998486149074 

Total Time Taken: 32.729610443115234 

c 1500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:00<00:00,  3.77it/s]



Validation Accuracy: 0.975 For 22 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.985 
 Precision: 0.9850485048504851 
 Recall: 0.985 
 F1: 0.9849996249906248 

Total Time Taken: 0.9037103652954102 

c 1500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:04<00:00,  1.64s/it]



Validation Accuracy: 0.9875 For 8 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.9925 
 Precision: 0.9925123128078202 
 Recall: 0.9924999999999999 
 F1: 0.9924999531247072 

Total Time Taken: 5.218010663986206 

c 1500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:27<00:00,  9.25s/it]



Validation Accuracy: 0.9963 For 8 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.9977 
 Precision: 0.997703364474744 
 Recall: 0.9977 
 F1: 0.9976999961129934 

Total Time Taken: 28.225281715393066 

c 1800 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:00<00:00,  3.82it/s]



Validation Accuracy: 0.995 For 500 max_features and 20 n_estimators

On test dataset:
 Accuracy: 0.97 
 Precision: 0.970188075230092 
 Recall: 0.97 
 F1: 0.96999699969997 

Total Time Taken: 1.0107195377349854 

c 1800 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [00:05<00:00,  1.69s/it]



Validation Accuracy: 1.0 For 22 max_features and 25 n_estimators

On test dataset:
 Accuracy: 1.0 
 Precision: 1.0 
 Recall: 1.0 
 F1: 1.0 

Total Time Taken: 5.349790096282959 

c 1800 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [00:21<00:00,  7.27s/it]



Validation Accuracy: 0.9998 For 8 max_features and 25 n_estimators

On test dataset:
 Accuracy: 0.9997 
 Precision: 0.9997000199880008 
 Recall: 0.9997 
 F1: 0.999699999997 

Total Time Taken: 22.264930248260498 



# Random Forest Classifier

In [94]:
for num_clauses in [300, 500, 1000, 1500, 1800]:
  for data_size in [100, 1000, 5000]:
    st = time()
    trainX, trainy = datasets[num_clauses][data_size]['train'].iloc[:,:-1], datasets[num_clauses][data_size]['train'].iloc[:,-1]
    validX, validy = datasets[num_clauses][data_size]['valid'].iloc[:,:-1], datasets[num_clauses][data_size]['valid'].iloc[:,-1]
    clf = RandomForestClassifier(max_features=trainX.shape[1], min_samples_leaf=1).fit(trainX, trainy)
    tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
    
    print('===========================================================================')
    print('c',num_clauses,'d',data_size)
    print('===========================================================================')

    print("Finding optimal parameters:")
    store_accs2 = []
    with tqdm(total=3, position=0, leave=True) as pbar:
      for attr in ['sqrt', 'log2', None]:
        for try_reduce_depth in range(data_size//50, int(data_size*val_acc_best),data_size//50):
          clf = RandomForestClassifier(random_state = 0, max_features=attr, min_samples_leaf=try_reduce_depth).fit(trainX, trainy)
          tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
          store_accs2.append((attr, try_reduce_depth, val_acc))
        pbar.update()
    optimal_num_feat, optimal_depth_red, val_acc_best= max(store_accs2, key = lambda x: x[2])
    print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_depth_red, "min_samples_leaf")

    #Combined train and validation dataset
    fin_trainX, fin_trainy = pd.concat([trainX, validX], axis = 0), pd.concat([trainy, validy], axis = 0)
    clf = RandomForestClassifier(random_state = 0, max_features=optimal_num_feat, min_samples_leaf=optimal_depth_red).fit(fin_trainX, fin_trainy)

    testX, testy = datasets[num_clauses][data_size]['test'].iloc[:,:-1], datasets[num_clauses][data_size]['test'].iloc[:,-1]
    ypred = clf.predict(testX)
    test_acc = clf.score(testX, testy) 
    precision, recall, f1, _ = precision_recall_fscore_support(testy, ypred, average = 'macro')
    print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
    print("Total Time Taken:", time()-st,"\n")

c 300 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:36<00:00, 12.20s/it]



Validation Accuracy: 0.73 For sqrt max_features and 10 min_samples_leaf

On test dataset:
 Accuracy: 0.77 
 Precision: 0.7717391304347826 
 Recall: 0.77 
 F1: 0.7696314102564102 

Total Time Taken: 37.854525566101074 

c 300 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [01:49<00:00, 36.44s/it]



Validation Accuracy: 0.8705 For sqrt max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.8725 
 Precision: 0.8725033525301727 
 Recall: 0.8725 
 F1: 0.8724997131243546 

Total Time Taken: 119.38709926605225 

c 300 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [05:59<00:00, 119.94s/it]



Validation Accuracy: 0.896 For log2 max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.8956 
 Precision: 0.8962213384541102 
 Recall: 0.8956 
 F1: 0.8955590549719112 

Total Time Taken: 445.3855035305023 

c 500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:33<00:00, 11.03s/it]



Validation Accuracy: 0.89 For log2 max_features and 18 min_samples_leaf

On test dataset:
 Accuracy: 0.905 
 Precision: 0.905040504050405 
 Recall: 0.905 
 F1: 0.9049976249406235 

Total Time Taken: 34.2866313457489 

c 500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [01:31<00:00, 30.39s/it]



Validation Accuracy: 0.9305 For log2 max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.948 
 Precision: 0.9483515075819442 
 Recall: 0.948 
 F1: 0.9479898060019765 

Total Time Taken: 99.51574730873108 

c 500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [06:05<00:00, 121.99s/it]



Validation Accuracy: 0.943 For log2 max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.9534 
 Precision: 0.9535535550916119 
 Recall: 0.9534 
 F1: 0.9533960554421327 

Total Time Taken: 420.9317708015442 

c 1000 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:35<00:00, 11.86s/it]



Validation Accuracy: 0.995 For log2 max_features and 12 min_samples_leaf

On test dataset:
 Accuracy: 0.985 
 Precision: 0.9854368932038835 
 Recall: 0.985 
 F1: 0.9849966242404541 

Total Time Taken: 36.3954017162323 

c 1000 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [01:36<00:00, 32.18s/it]



Validation Accuracy: 0.996 For log2 max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.993 
 Precision: 0.993001972007888 
 Recall: 0.993 
 F1: 0.992999992999993 

Total Time Taken: 103.27438879013062 

c 1000 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [06:20<00:00, 126.75s/it]



Validation Accuracy: 0.994 For log2 max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.9946 
 Precision: 0.9946003165442026 
 Recall: 0.9945999999999999 
 F1: 0.994599999136 

Total Time Taken: 428.60812044143677 

c 1500 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:36<00:00, 12.31s/it]



Validation Accuracy: 1.0 For sqrt max_features and 2 min_samples_leaf

On test dataset:
 Accuracy: 1.0 
 Precision: 1.0 
 Recall: 1.0 
 F1: 1.0 

Total Time Taken: 37.72492337226868 

c 1500 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [01:36<00:00, 32.13s/it]



Validation Accuracy: 1.0 For sqrt max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 0.9985 
 Precision: 0.9985044865403789 
 Recall: 0.9984999999999999 
 F1: 0.9984999966249923 

Total Time Taken: 102.27635312080383 

c 1500 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [06:30<00:00, 130.16s/it]



Validation Accuracy: 0.9997 For log2 max_features and 100 min_samples_leaf

On test dataset:
 Accuracy: 0.9996 
 Precision: 0.9996003197442047 
 Recall: 0.9996 
 F1: 0.9995999999359999 

Total Time Taken: 429.95472717285156 

c 1800 d 100
Finding optimal parameters:


100%|██████████| 3/3 [00:47<00:00, 15.95s/it]



Validation Accuracy: 1.0 For sqrt max_features and 2 min_samples_leaf

On test dataset:
 Accuracy: 1.0 
 Precision: 1.0 
 Recall: 1.0 
 F1: 1.0 

Total Time Taken: 49.158920764923096 

c 1800 d 1000
Finding optimal parameters:


100%|██████████| 3/3 [01:36<00:00, 32.12s/it]



Validation Accuracy: 1.0 For sqrt max_features and 20 min_samples_leaf

On test dataset:
 Accuracy: 1.0 
 Precision: 1.0 
 Recall: 1.0 
 F1: 1.0 

Total Time Taken: 102.43275570869446 

c 1800 d 5000
Finding optimal parameters:


100%|██████████| 3/3 [06:15<00:00, 125.22s/it]



Validation Accuracy: 1.0 For sqrt max_features and 1000 min_samples_leaf

On test dataset:
 Accuracy: 0.9999 
 Precision: 0.9999000199960009 
 Recall: 0.9999 
 F1: 0.9998999999989999 

Total Time Taken: 407.1577808856964 



# Gradient Boosting Classifier

In [99]:
for num_clauses in [300, 500, 1000, 1500, 1800]:
  for data_size in [100, 1000, 5000]:
    st = time()
    trainX, trainy = datasets[num_clauses][data_size]['train'].iloc[:,:-1], datasets[num_clauses][data_size]['train'].iloc[:,-1]
    validX, validy = datasets[num_clauses][data_size]['valid'].iloc[:,:-1], datasets[num_clauses][data_size]['valid'].iloc[:,-1]
    
    print('===========================================================================')
    print('c',num_clauses,'d',data_size)
    print('===========================================================================')

    print("Finding optimal n_estimators:")
    #Optimizing on depth
    store_accs2 = []
    with tqdm(total=3, position=0, leave=True) as pbar:
      for attr in ['sqrt', 'log2', None]:
        for try_estimator in range(15, 45, 5):
          clf = GradientBoostingClassifier(random_state = 0, max_features=attr, n_estimators=try_estimator).fit(trainX, trainy)
          tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
          store_accs2.append((attr, try_estimator, val_acc))
        pbar.update()
    optimal_num_feat, optimal_est_n, val_acc_best= max(store_accs2, key = lambda x: x[2])

    print("Finding optimal learning rate:")
    store_accs3 = [(0.1, val_acc_best)]
    with tqdm(total=3, position=0, leave=True) as pbar:
      for lr in [0.01, 0.5, 1]:
        clf = GradientBoostingClassifier(random_state = 0, max_features=optimal_num_feat, n_estimators=optimal_est_n, learning_rate = lr).fit(trainX, trainy)
        tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
        store_accs3.append((lr,val_acc))
        pbar.update()
    opt_lr, val_acc_best= max(store_accs3, key = lambda x: x[1])
    print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features,", optimal_est_n, "n_estimators and", opt_lr, "learning rate.")

    #Combined train and validation dataset
    fin_trainX, fin_trainy = pd.concat([trainX, validX], axis = 0), pd.concat([trainy, validy], axis = 0)
    clf = GradientBoostingClassifier(random_state = 0, max_features=optimal_num_feat, n_estimators=optimal_est_n, learning_rate = opt_lr).fit(fin_trainX, fin_trainy)

    testX, testy = datasets[num_clauses][data_size]['test'].iloc[:,:-1], datasets[num_clauses][data_size]['test'].iloc[:,-1]
    ypred = clf.predict(testX)
    test_acc = clf.score(testX, testy) 
    precision, recall, f1, _ = precision_recall_fscore_support(testy, ypred, average = 'macro')
    print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
    print("Total Time Taken:", time()-st,"\n")

c 300 d 100
Finding optimal n_estimators:


100%|██████████| 3/3 [00:03<00:00,  1.13s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:01<00:00,  1.93it/s]



Validation Accuracy: 0.715 For None max_features, 30 n_estimators and 0.1 learning rate.

On test dataset:
 Accuracy: 0.83 
 Precision: 0.8311922922521076 
 Recall: 0.8300000000000001 
 F1: 0.8298468621759585 

Total Time Taken: 5.566346645355225 

c 300 d 1000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:07<00:00,  2.56s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:05<00:00,  1.84s/it]



Validation Accuracy: 0.934 For None max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.966 
 Precision: 0.9673628300123159 
 Recall: 0.966 
 F1: 0.965975195917824 

Total Time Taken: 16.76596760749817 

c 300 d 5000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:39<00:00, 13.15s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:24<00:00,  8.21s/it]



Validation Accuracy: 0.9837 For None max_features, 40 n_estimators and 1 learning rate.

On test dataset:
 Accuracy: 0.9864 
 Precision: 0.9864899811869203 
 Recall: 0.9863999999999999 
 F1: 0.98639937110692 

Total Time Taken: 83.8907310962677 

c 500 d 100
Finding optimal n_estimators:


100%|██████████| 3/3 [00:01<00:00,  2.12it/s]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 14.76it/s]



Validation Accuracy: 0.815 For sqrt max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.865 
 Precision: 0.8650365036503651 
 Recall: 0.865 
 F1: 0.8649966249156229 

Total Time Taken: 1.7182738780975342 

c 500 d 1000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:07<00:00,  2.48s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:05<00:00,  1.99s/it]



Validation Accuracy: 0.959 For None max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.9865 
 Precision: 0.9865394096921851 
 Recall: 0.9864999999999999 
 F1: 0.986499726619464 

Total Time Taken: 16.507256746292114 

c 500 d 5000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:39<00:00, 13.14s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:24<00:00,  8.22s/it]



Validation Accuracy: 0.9858 For None max_features, 40 n_estimators and 1 learning rate.

On test dataset:
 Accuracy: 0.9896 
 Precision: 0.9896571136057309 
 Recall: 0.9896 
 F1: 0.9895996967271565 

Total Time Taken: 81.48451209068298 

c 1000 d 100
Finding optimal n_estimators:


100%|██████████| 3/3 [00:01<00:00,  2.19it/s]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 18.74it/s]



Validation Accuracy: 0.965 For log2 max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.955 
 Precision: 0.955045504550455 
 Recall: 0.955 
 F1: 0.9549988749718743 

Total Time Taken: 1.6446611881256104 

c 1000 d 1000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:07<00:00,  2.58s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00,  6.12it/s]



Validation Accuracy: 0.9835 For log2 max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.98 
 Precision: 0.98 
 Recall: 0.98 
 F1: 0.98 

Total Time Taken: 8.522085666656494 

c 1000 d 5000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:38<00:00, 12.76s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:26<00:00,  8.85s/it]



Validation Accuracy: 0.9977 For None max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.998 
 Precision: 0.9980012748832636 
 Recall: 0.998 
 F1: 0.9979999987199991 

Total Time Taken: 81.11873149871826 

c 1500 d 100
Finding optimal n_estimators:


100%|██████████| 3/3 [00:01<00:00,  2.19it/s]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 22.95it/s]



Validation Accuracy: 1.0 For log2 max_features, 25 n_estimators and 0.1 learning rate.

On test dataset:
 Accuracy: 0.98 
 Precision: 0.9801920768307323 
 Recall: 0.98 
 F1: 0.9799979997999799 

Total Time Taken: 1.5717449188232422 

c 1500 d 1000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:08<00:00,  2.80s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 10.60it/s]



Validation Accuracy: 0.9995 For log2 max_features, 35 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.9995 
 Precision: 0.9995004995004995 
 Recall: 0.9995 
 F1: 0.999499999875 

Total Time Taken: 8.861199855804443 

c 1500 d 5000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:36<00:00, 12.08s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:01<00:00,  1.81it/s]



Validation Accuracy: 0.9998 For sqrt max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.9994 
 Precision: 0.9994003196162045 
 Recall: 0.9994000000000001 
 F1: 0.999399999904 

Total Time Taken: 39.33448839187622 

c 1800 d 100
Finding optimal n_estimators:


100%|██████████| 3/3 [00:02<00:00,  1.44it/s]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 14.25it/s]



Validation Accuracy: 1.0 For sqrt max_features, 25 n_estimators and 0.1 learning rate.

On test dataset:
 Accuracy: 0.995 
 Precision: 0.995049504950495 
 Recall: 0.995 
 F1: 0.9949998749968749 

Total Time Taken: 2.401334524154663 

c 1800 d 1000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:06<00:00,  2.33s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00, 11.85it/s]



Validation Accuracy: 1.0 For sqrt max_features, 20 n_estimators and 0.1 learning rate.

On test dataset:
 Accuracy: 0.998 
 Precision: 0.99800796812749 
 Recall: 0.998 
 F1: 0.997999991999968 

Total Time Taken: 7.411762475967407 

c 1800 d 5000
Finding optimal n_estimators:


100%|██████████| 3/3 [00:36<00:00, 12.03s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:00<00:00,  3.12it/s]



Validation Accuracy: 1.0 For log2 max_features, 40 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 1.0 
 Precision: 1.0 
 Recall: 1.0 
 F1: 1.0 

Total Time Taken: 37.634262800216675 



# Downloading MNIST dataset.

In [102]:
from sklearn.datasets import fetch_openml
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X / 255.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
data_size = 60000
trainX, validX, trainy, validy = train_test_split(X_train, y_train, test_size=0.33, random_state=0)

  warn(


# Using Decision Tree Classifier on MNIST

In [108]:
print("Finding optimal parameters:")
#Optimizing on depth
store_accs2 = []
with tqdm(total=3, position=0, leave=True) as pbar:
  for attr in ['sqrt', 'log2', None]:
    for try_reduce_depth in [1] + list(range(data_size//50, data_size//2,data_size//50)):
      clf = DecisionTreeClassifier(random_state = 0, max_features=attr, min_samples_leaf=try_reduce_depth).fit(trainX, trainy)
      tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
      store_accs2.append((attr, try_reduce_depth, val_acc))
    pbar.update()
optimal_num_feat, optimal_depth_red, val_acc_best= max(store_accs2, key = lambda x: x[2])
print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_depth_red, "min_samples_leaf")

#Combined train and validation dataset
clf = DecisionTreeClassifier(random_state = 0, max_features=optimal_num_feat, min_samples_leaf=optimal_depth_red).fit(X_train, y_train)
ypred = clf.predict(X_test)
test_acc = clf.score(X_test, y_test) 
precision, recall, f1, _ = precision_recall_fscore_support(y_test, ypred, average = 'macro')
print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
print("Total Time Taken:", time()-st,"\n")

Finding optimal parameters:


100%|██████████| 3/3 [01:23<00:00, 27.86s/it]



Validation Accuracy: 0.8625757575757576 For None max_features and 1 min_samples_leaf

On test dataset:
 Accuracy: 0.8779 
 Precision: 0.8766071633824115 
 Recall: 0.8763956057044318 
 F1: 0.8764276413789659 

Total Time Taken: 1464.451583623886 



# Using Bagging Classifier on MNIST

In [119]:
print("Finding optimal parameters:")
#Optimizing on depth
store_accs2 = []
with tqdm(total=2, position=0, leave=True) as pbar:
  for name_try, attr in [('sqrt', int(math.sqrt(trainX.shape[1]))), ('log2', int(math.log2(trainX.shape[1])))]:
    for try_estimator in range(10, 25, 5):
      clf = BaggingClassifier(random_state = 0, estimator=DecisionTreeClassifier(), max_features=attr, n_estimators=try_estimator).fit(trainX, trainy)
      tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
      store_accs2.append((try_max_features, try_estimator, val_acc))
    pbar.update()
optimal_num_feat, optimal_est_n, val_acc_best= max(store_accs2, key = lambda x: x[2])
print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_est_n, "n_estimators")

#Combined train and validation dataset
clf = BaggingClassifier(random_state = 0, estimator=DecisionTreeClassifier(), max_features=optimal_num_feat, n_estimators=optimal_est_n).fit(X_train, y_train)

ypred = clf.predict(X_test)
test_acc = clf.score(X_test, y_test) 
precision, recall, f1, _ = precision_recall_fscore_support(y_test, ypred, average = 'macro')
print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
print("Total Time Taken:", time()-st,"\n")

Finding optimal parameters:


100%|██████████| 2/2 [00:19<00:00,  9.58s/it]



Validation Accuracy: 0.8997474747474747 For 499 max_features and 20 n_estimators

On test dataset:
 Accuracy: 0.9582 
 Precision: 0.9579025278375541 
 Recall: 0.9577075481220423 
 F1: 0.9577531082748332 

Total Time Taken: 3540.0547671318054 



# Using Random Forest Classifier on MNIST

In [115]:
print("Finding optimal parameters:")
store_accs2 = []
with tqdm(total=2, position=0, leave=True) as pbar:
  for attr in ['sqrt', 'log2']:
    for try_reduce_depth in [1]:
      clf = RandomForestClassifier(random_state = 0, max_features=attr, min_samples_leaf=try_reduce_depth).fit(trainX, trainy)
      tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
      store_accs2.append((attr, try_reduce_depth, val_acc))
    pbar.update()
optimal_num_feat, optimal_depth_red, val_acc_best= max(store_accs2, key = lambda x: x[2])
print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features and", optimal_depth_red, "min_samples_leaf")

#Combined train and validation dataset
# fin_trainX, fin_trainy = pd.concat([trainX, validX], axis = 0), pd.concat([trainy, validy], axis = 0)
clf = RandomForestClassifier(random_state = 0, max_features=optimal_num_feat, min_samples_leaf=optimal_depth_red).fit(X_train, y_train)

ypred = clf.predict(X_test)
test_acc = clf.score(X_test, y_test) 
precision, recall, f1, _ = precision_recall_fscore_support(y_test, ypred, average = 'macro')
print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
print("Total Time Taken:", time()-st,"\n")

Finding optimal parameters:


100%|██████████| 2/2 [01:10<00:00, 35.31s/it]



Validation Accuracy: 0.9675757575757575 For sqrt max_features and 1 min_samples_leaf

On test dataset:
 Accuracy: 0.9705 
 Precision: 0.9702292282943962 
 Recall: 0.9702805962559312 
 F1: 0.9702351062124602 

Total Time Taken: 3090.2914838790894 



# Using Gradient Boosting Classifier on MNIST

In [120]:
print("Finding optimal n_estimators:")
#Optimizing on depth
store_accs2 = []
with tqdm(total=2, position=0, leave=True) as pbar:
  for attr in ['sqrt', 'log2']:
    for try_estimator in range(10, 25, 5):
      clf = GradientBoostingClassifier(random_state = 0, max_features=attr, n_estimators=try_estimator).fit(trainX, trainy)
      tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
      store_accs2.append((attr, try_estimator, val_acc))
    pbar.update()
optimal_num_feat, optimal_est_n, val_acc_best= max(store_accs2, key = lambda x: x[2])

print("Finding optimal learning rate:")
store_accs3 = [(0.1, val_acc_best)]
with tqdm(total=3, position=0, leave=True) as pbar:
  for lr in [0.01, 0.5, 1]:
    clf = GradientBoostingClassifier(random_state = 0, max_features=optimal_num_feat, n_estimators=optimal_est_n, learning_rate = lr).fit(trainX, trainy)
    tr_acc, val_acc = clf.score(trainX, trainy), clf.score(validX, validy) 
    store_accs3.append((lr,val_acc))
    pbar.update()
opt_lr, val_acc_best= max(store_accs3, key = lambda x: x[1])
print("\nValidation Accuracy:", val_acc_best, "For", optimal_num_feat, "max_features,", optimal_est_n, "n_estimators and", opt_lr, "learning rate.")

#Combined train and validation dataset
clf = GradientBoostingClassifier(random_state = 0, max_features=optimal_num_feat, n_estimators=optimal_est_n, learning_rate = opt_lr).fit(X_train, y_train)

ypred = clf.predict(X_test)
test_acc = clf.score(X_test, y_test) 
precision, recall, f1, _ = precision_recall_fscore_support(y_test, ypred, average = 'macro')
print("\nOn test dataset:\n", "Accuracy:", test_acc, "\n", "Precision:", precision, "\n", "Recall:", recall, "\n", "F1:", f1, "\n")
print("Total Time Taken:", time()-st,"\n")

Finding optimal n_estimators:


100%|██████████| 2/2 [01:32<00:00, 46.36s/it]


Finding optimal learning rate:


100%|██████████| 3/3 [00:55<00:00, 18.44s/it]



Validation Accuracy: 0.9087373737373737 For sqrt max_features, 20 n_estimators and 0.5 learning rate.

On test dataset:
 Accuracy: 0.9143 
 Precision: 0.9136284397631893 
 Recall: 0.9130241144917202 
 F1: 0.9131369254092189 

Total Time Taken: 3766.884696483612 

