In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!pip install keras_tuner
import numpy as np
import pandas as pd
import torch
import random
import ast
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import keras_tuner as kt
import os
import copy

from collections import defaultdict
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Mounted at /content/gdrive
Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [None]:
def evaluate_predictions(predictions, test_labels, test_times, test_cells):
    total_time = 0.0
    total_cells = 0

    # Convert string representations to actual lists
    test_times_lists = test_times.apply(lambda x: ast.literal_eval(x))
    test_cells_lists = test_cells.apply(lambda x: ast.literal_eval(x))

    valid_predictions = 0

    for i, pred in enumerate(predictions):
        label = int(pred)
        if label < len(test_times_lists.iloc[i]) and label < len(test_cells_lists.iloc[i]):
            total_time += test_times_lists.iloc[i][label]
            total_cells += test_cells_lists.iloc[i][label]
            valid_predictions += 1

    avg_time = total_time / valid_predictions if valid_predictions > 0 else 0
    avg_cells = total_cells / valid_predictions if valid_predictions > 0 else 0
    print(avg_time, avg_cells)
    return avg_time, avg_cells

In [None]:
def run_lr(X_train, y_train,X_test,y_test,evaluation=False):
  #print("Logistic Regression")
  lr = LogisticRegression(max_iter=5000, multi_class='multinomial')
  lr.fit(X_train, y_train)
  lr_y_pred = lr.predict(X_test)

  lr_accuracy_score=accuracy_score(y_test, lr_y_pred)
  print(f'LR accuracy: {lr_accuracy_score*100:.2f}%')
  #print(classification_report(y_test, lr_y_pred))
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(lr_y_pred, y_test, test_times, test_cells)
  return lr,lr_y_pred

In [None]:
def run_gbm(X_train, y_train,X_test,y_test,evaluation=False):
  #print("Gradient Boosting Machine")
  gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
  gbm.fit(X_train, y_train)
  gbm_y_pred = gbm.predict(X_test)

  #print(classification_report(y_test, gbm_y_pred))
  gbm_accuracy_score=accuracy_score(y_test, gbm_y_pred)
  print(f'GBM accuracy: {gbm_accuracy_score*100:.2f}%')
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(gbm_y_pred, y_test, test_times, test_cells)
  return gbm,gbm_y_pred

In [None]:
def run_svm(X_train, y_train,X_test,y_test,evaluation=False):
  svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
  svm.fit(X_train, y_train)
  svm_y_pred = svm.predict(X_test)

  svm_accuracy_score=accuracy_score(y_test, svm_y_pred)
  print(f'SVM accuracy: {svm_accuracy_score*100:.2f}%')
  #print(classification_report(y_test, svm_y_pred))
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(svm_y_pred, y_test, test_times, test_cells)
  return svm,svm_y_pred

In [None]:
def run_knn(X_train, y_train,X_test,y_test,evaluation=False):
  knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='ball_tree')
  knn.fit(X_train, y_train)
  #print("K-Nearest-Neighbour")
  knn_y_pred = knn.predict(X_test)

  knn_accuracy_score = accuracy_score(y_test, knn_y_pred)
  print(f'KNN accuracy: {knn_accuracy_score*100:.2f}%')
  #print(classification_report(y_test, knn_y_pred))
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(knn_y_pred, y_test, test_times, test_cells)
  return knn,knn_y_pred

In [None]:

def run_dt(X_train, y_train,X_test,y_test,evaluation=False):
  dt = DecisionTreeClassifier(criterion='gini', max_depth=17)
  dt.fit(X_train, y_train)
  dt_y_pred = dt.predict(X_test)

  #print(classification_report(y_test, dt_y_pred))
  dt_accuracy_score = accuracy_score(y_test, dt_y_pred)
  print(f'DT accuracy: {dt_accuracy_score*100:.2f}%')
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(dt_y_pred, y_test, test_times, test_cells)
  return dt,dt_y_pred

In [None]:
def run_rf(X_train, y_train,X_test,y_test,evaluation=False):
  rf = RandomForestClassifier(n_estimators=100, random_state=42)
  rf.fit(X_train, y_train)
  rf_y_pred = rf.predict(X_test)

  #print(classification_report(y_test, rf_y_pred))

  rf_accuracy_score = accuracy_score(y_test, rf_y_pred)
  print(f'RF accuracy: {rf_accuracy_score*100:.2f}%')
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(rf_y_pred, y_test, test_times, test_cells)
  return rf,rf_y_pred

In [None]:
def run_xgb(X_train, y_train,X_test,y_test,evaluation=False):
  xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
  xgb.fit(X_train, y_train)
  xgb_y_pred = xgb.predict(X_test)

  # Display the classification report
  #print(classification_report(y_test, xgb_y_pred))

  xgb_accuracy_score = accuracy_score(y_test, xgb_y_pred)
  print(f'XGBoost accuracy: {xgb_accuracy_score*100:.2f}%')
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(xgb_y_pred, y_test, test_times, test_cells)
  return xgb,xgb_y_pred

In [None]:
def run_ensemble(X_train, y_train,X_test,y_test,evaluation=False):
  xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
  rf = RandomForestClassifier(n_estimators=100, random_state=42)
  dt = DecisionTreeClassifier(criterion='gini', max_depth=17)
  knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='ball_tree')
  svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
  gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
  lr = LogisticRegression(max_iter=5000, multi_class='multinomial')

  ensemble = VotingClassifier(estimators=[ ('knn', knn),('gbm',gbm),('dt',dt)], voting='soft')
  ensemble.fit(X_train, y_train)

  ensemble_y_pred = ensemble.predict(X_test)
  ensemble_accuracy_score=accuracy_score(y_test, ensemble_y_pred)
  print(f'Ensemble accuracy: {ensemble_accuracy_score*100:.2f}%')
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(ensemble_y_pred, y_test, test_times, test_cells)
  return ensemble_y_pred#

In [None]:
def run_ffn(X_train, y_train,X_test,y_test,evaluation=False):
  scaler = StandardScaler()
  FFN_X_train = scaler.fit_transform(X_train)
  FFN_X_test = scaler.transform(X_test)

  # Convert labels to one-hot encoding
  FFN_y_train = tf.keras.utils.to_categorical(y_train, num_classes=6)
  FFN_y_test = tf.keras.utils.to_categorical(y_test, num_classes=6)

  model = Sequential([
      Dense(128, activation='relu', input_shape=(FFN_X_train.shape[1],)),
      Dense(128, activation='relu'),
      Dense(64, activation='relu'),
      Dense(64, activation='relu'),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(6, activation='softmax')
  ])

  model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

  history = model.fit(FFN_X_train, FFN_y_train, epochs=50, batch_size=32, validation_split=0.2,verbose=0)


  test_loss, test_acc = model.evaluate(FFN_X_test, FFN_y_test,verbose=1)

  print(f'FFN accuracy: {test_acc*100:.2f}%')
  predictions = model.predict(FFN_X_test)
  predicted_classes = np.argmax(predictions, axis=1)

  true_classes = np.argmax(FFN_y_test, axis=1)

  #report = classification_report(true_classes, predicted_classes, target_names=['0', '1', '2','3','4','5'])  # Adjust target_names based on your dataset
  #print(report)
  if evaluation==True:
    total_time, total_cells = evaluate_predictions(predicted_classes, y_test, test_times, test_cells)
  return predicted_classes

In [None]:

def run_brown(X_train, y_train,X_test,y_test,evaluation=False):

  def brown_select_variable_order(variable_info):
      variable_stats = [
          (
              info['max_degree'],  # Lowest degree first
              -info['prop'],       # Highest proportion of terms (use -prop for descending sort)
              -info['prop_mon'],   # Highest total degree term (use -prop_mon for descending sort)
              var                 # Variable name for tie-breaking by name
          )
          for var, info in variable_info.items()
      ]

      variable_stats.sort()
      variable_ordering = [var for _, _, _, var in variable_stats]

      ordering_str = ''.join(var.replace('x', '') for var in variable_ordering)
      ordering_map = {
          '123': 0, '132': 1, '213': 2,
          '231': 3, '312': 4, '321': 5
      }

      return ordering_map.get(ordering_str, None)


  def brown_calculate_ordering(row):

      variable_info = {
          'x1': {'max_degree': row['max_x1'], 'prop': row['prop_x1'], 'prop_mon': row['prop_mon_x1']},
          'x2': {'max_degree': row['max_x2'], 'prop': row['prop_x2'], 'prop_mon': row['prop_mon_x2']},
          'x3': {'max_degree': row['max_x3'], 'prop': row['prop_x3'], 'prop_mon': row['prop_mon_x3']},
      }
      ordering = brown_select_variable_order(variable_info)
      return ordering

  X_test['ordering_index'] = X_test.apply(brown_calculate_ordering, axis=1)
  matches = (X_test['ordering_index'] == X_test['label']).sum()
  total = len(X_test)
  brown_accuracy = matches / total * 100
  print(f"Brown Accuracy: {brown_accuracy}%")
  total_time, total_cells = evaluate_predictions(X_test['ordering_index'], X_test['label'], X_test['time'], X_test['cells'])

In [None]:
def list_parser(cell_string):
    try:
        stripped_string = cell_string.strip('[]')
        parsed_list = [float(item.strip()) for item in stripped_string.split(',') if item.strip()]
        return parsed_list
    except Exception as e:
        print(f"Error parsing cell_string: {cell_string} due to {e}")
        return []

def remove_duplicate_lowest(df_copy, column_name):
    df = df_copy.copy()
    df[column_name] = df[column_name].apply(list_parser)

    def has_duplicate_lowest(cells):
        if not cells:
            return False
        lowest_value = min(cells)
        return cells.count(lowest_value) > 1

    filtered_df = df[~df[column_name].apply(has_duplicate_lowest)]

    return filtered_df

def get_duplicate_lowest(df_copy, column_name):
    df = df_copy.copy()
    df[column_name] = df[column_name].apply(list_parser)

    def has_duplicate_lowest(cells):
        if not cells:
            return False
        lowest_value = min(cells)
        return cells.count(lowest_value) > 1

    filtered_df = df[df[column_name].apply(has_duplicate_lowest)]

    return filtered_df

def add_timeout_column(df):
    df['cells'] = df['cells'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['time'] = df['time'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    def adjust_entries(row):
        if 30 in row['time'] and 1000 in row['cells']:
            return 1
        return 0
    df['timeout'] = df.apply(adjust_entries, axis=1)
    return df

def remove_timeout_entries(df_copy):
    df=df_copy.copy()
    df_filtered = df[df['timeout'] == 0]
    return df_filtered

def adjust_time_and_cells(df):
    for index, row in df.iterrows():
        if isinstance(row['time'], list) and 30.0 in row['time']:
            max_time_not_30 = max([time for time in row['time'] if time != 30.0], default=0)
            new_time = max_time_not_30 + 30
            row['time'] = [new_time if time == 30.0 else time for time in row['time']]
            df.at[index, 'time'] = row['time']

        if isinstance(row['cells'], list) and 1000 in row['cells']:
            max_cells_not_1000 = max([cell for cell in row['cells'] if cell != 1000], default=0)
            new_cells = max_cells_not_1000 + 1000
            row['cells'] = [new_cells if cell == 1000 else cell for cell in row['cells']]
            df.at[index, 'cells'] = row['cells']

    return df

def remove_duplicate_names(df_copy):
  df=df_copy.copy().sample(frac=1)
  df['poly_name'] = df['input_file'].apply(lambda x: x.split('-')[0])

  df_unique = df.drop_duplicates(subset=['poly_name'], keep='first')
  df_unique=df_unique.drop('poly_name',axis=1)
  return df_unique

def get_times_and_cells(df):
  return df[["time","cells"]]

def rank_times(times_list):
    sorted_times = sorted((time, index) for index, time in enumerate(times_list))
    ranks = [0] * len(times_list)

    for rank, (time, index) in enumerate(sorted_times):
        ranks[index] = rank

    return ranks

def generate_final_df():
  file_path = '/content/gdrive/MyDrive/CAD_Project/final_merged_data.csv'
  merged_df = pd.read_csv(file_path)

  adjusted_df = add_timeout_column(merged_df)
  adjusted_df=adjust_time_and_cells(merged_df)

  output_file_path = '/content/gdrive/MyDrive/CAD_Project/extra_added_timeout_data_.csv'
  adjusted_df.to_csv(output_file_path, index=False)

In [None]:
def apply_reductions(df_og,shuffle,remove_dupe_lowest,remove_timeouts,remove_dupe_names,frac_to_keep):
  df=copy.deepcopy(df_og)
  if shuffle==True:
    df=df.sample(frac=1)
  if remove_dupe_lowest==True:
    remove_duplicate_lowest(df,'time')
  if remove_timeouts==True:
    df=remove_timeout_entries(df)
  if remove_dupe_names==True:
    df=remove_duplicate_names(df)
  return df.sample(frac=frac_to_keep)

In [None]:

from sklearn.model_selection import train_test_split

def prepare_train_test_data(original_df,extra_features=False):
    if extra_features==False:
      df_copy = original_df.copy()
      df_copy = df_copy.drop(columns=['Unnamed: 0'])
      train_df, test_df = train_test_split(df_copy, test_size=0.2, random_state=42)
      y_train = train_df['label']
      y_test = test_df['label']
      feature_columns_to_drop = ['cells', 'label', 'file_id_x', 'file_id_y', 'input_file', 'label_file', 'time', 'timeout']
      X_train = train_df.drop(feature_columns_to_drop, axis=1)
      X_test = test_df.drop(feature_columns_to_drop, axis=1)
      return X_train, y_train, X_test, y_test
    else:
      df_copy = original_df.copy()
      df_copy = df_copy.drop(columns=['file_id_x'])
      train_df, test_df = train_test_split(df_copy, test_size=0.2, random_state=42)
      y_train = train_df['label_y']
      y_test = test_df['label_y']
      feature_columns_to_drop = ['label_y']
      X_train = train_df.drop(feature_columns_to_drop, axis=1)
      X_test = test_df.drop(feature_columns_to_drop, axis=1)
      return X_train, y_train, X_test, y_test

In [None]:
def run_all(X_train, y_train, og_X_test, og_y_test,lr,gbm,svm,knn,dt,xgb,ensemble,ffn,brown):
    if lr:
        lr,lr_y_pred=run_lr(X_train, y_train,og_X_test,og_y_test)
    if gbm:
        gbm,gbm_y_pred=run_gbm(X_train, y_train,og_X_test,og_y_test)
    if svm:
        svm,svm_y_pred=run_svm(X_train, y_train,og_X_test,og_y_test)
    if knn:
        knn,knn_y_pred=run_knn(X_train, y_train,og_X_test,og_y_test)
    if dt:
        dt,dt_y_pred=run_dt(X_train, y_train,og_X_test,og_y_test)
    if xgb:
        xgb,xgb_y_pred=run_xgb(X_train, y_train,og_X_test,og_y_test)
    #if ensemble:
        #ensemble,ensemble_y_pred=run_ensemble(X_train, y_train,X_test,y_test)
    if ffn:
        ffn_y_pred=run_ffn(X_train, y_train,og_X_test,og_y_test)
    #if brown:
        #run_brown(X_train, y_train,X_test,y_test)

In [None]:
file_path = '/content/gdrive/MyDrive/CAD_Project/extra_added_timeout_data_.csv'

og_df = pd.read_csv(file_path)
augmented_df= og_df.sort_values(by='file_id_x')
split_ratio = 0.8

split_index = int(len(augmented_df) * split_ratio)

og_train_df = og_df[:split_index]
og_test_df = og_df[split_index:]

og_train_df = og_train_df.sample(frac=1)
og_test_df = og_test_df.sample(frac=1)

augmented_train_df = augmented_df[:split_index]
augmented_test_df = augmented_df[split_index:]

augmented_train_df = augmented_train_df.sample(frac=1)
augmented_test_df = augmented_test_df.sample(frac=1)

balanced_train_df=remove_duplicate_names(augmented_train_df)
balanced_test_df=remove_duplicate_names(augmented_test_df)

balanced_train_df = balanced_train_df.sample(frac=1)
balanced_test_df = balanced_test_df.sample(frac=1)

In [None]:
duplicate_times=get_duplicate_lowest(og_df,'time')

In [None]:
print(duplicate_times.shape)

(6336, 20)


In [None]:
print(balanced_train_df.shape,augmented_train_df.shape)

(5516, 20) (33095, 20)


In [None]:
unbalanced_file_path = '/content/gdrive/MyDrive/CAD_Project/metitarski_original_processed.csv'

unbalanced_df = pd.read_csv(unbalanced_file_path, delimiter='\t')

In [None]:
unbalanced_df['numeric_id'] = unbalanced_df['input_file'].str.extract('(\d+)').astype(int)

unbalanced_df= unbalanced_df.sort_values(by='numeric_id')
split_ratio = 0.8

split_index = int(len(unbalanced_df) * split_ratio)

unbalanced_train_df = unbalanced_df[:split_index]
unbalanced_test_df = unbalanced_df[split_index:]

unbalanced_train_df = unbalanced_train_df.sample(frac=1)
unbalanced_test_df = unbalanced_test_df.sample(frac=1)

In [None]:
print(unbalanced_train_df.shape,balanced_train_df.shape,augmented_train_df.shape,og_train_df.shape)

(5516, 17) (5516, 20) (33095, 20) (33095, 20)


In [None]:
common_train = pd.merge(og_train_df, augmented_test_df, on='input_file', how='inner')
print(f'Number of identical rows in training sets after re-merging: {len(common_train)}')

og_no_pollution_train_df = og_train_df[~og_train_df['input_file'].isin(common_train['input_file'])]

common_train = pd.merge(og_train_df, augmented_test_df, on='input_file', how='inner')
print(f'Number of identical rows in training sets after removal: {len(common_train)}')


Number of identical rows in training sets after re-merging: 6621
Number of identical rows in training sets after removal: 6621


In [None]:
common_train = pd.merge(og_train_df, balanced_test_df, on='input_file', how='inner')
print(f'Number of identical rows in training sets after re-merging: {len(common_train)}')

og_train_df = og_train_df[~og_train_df['input_file'].isin(common_train['input_file'])]

common_train = pd.merge(og_train_df, balanced_test_df, on='input_file', how='inner')
print(f'Number of identical rows in training sets after removal: {len(common_train)}')


Number of identical rows in training sets after re-merging: 1104
Number of identical rows in training sets after removal: 0


In [None]:
print(og_train_df.shape,augmented_train_df.shape,balanced_train_df.shape)

(31991, 20) (33095, 20) (5516, 20)


In [None]:
feature_columns_to_drop = ['Unnamed: 0','cells', 'label', 'file_id_x', 'file_id_y', 'input_file', 'label_file', 'time', 'timeout']

og_no_pollution_y_train = og_no_pollution_train_df['label']
og_no_pollution_X_train = og_no_pollution_train_df.drop(feature_columns_to_drop, axis=1)
og_no_pollution_y_test = og_no_pollution_train_df['label']
og_no_pollution_X_test = og_no_pollution_train_df.drop(feature_columns_to_drop, axis=1)

In [None]:
feature_columns_to_drop = ['Unnamed: 0','cells', 'label', 'file_id_x', 'file_id_y', 'input_file', 'label_file', 'time', 'timeout']

og_y_train = og_train_df['label']
og_X_train = og_train_df.drop(feature_columns_to_drop, axis=1)
og_y_test = og_test_df['label']
og_X_test = og_test_df.drop(feature_columns_to_drop, axis=1)

In [None]:
augmented_y_train = augmented_train_df['label']
augmented_X_train = augmented_train_df.drop(feature_columns_to_drop, axis=1)
augmented_y_test = augmented_test_df['label']
augmented_X_test = augmented_test_df.drop(feature_columns_to_drop, axis=1)

In [None]:
balanced_y_train = balanced_train_df['label']
balanced_X_train = balanced_train_df.drop(feature_columns_to_drop, axis=1)
balanced_y_test = balanced_test_df['label']
balanced_X_test = balanced_test_df.drop(feature_columns_to_drop, axis=1)

In [None]:
less_feature_columns_to_drop = ['Unnamed: 0', 'label', 'file_id', 'input_file', 'label_file','numeric_id']

unbalanced_y_train = unbalanced_train_df['label']
unbalanced_X_train = unbalanced_train_df.drop(less_feature_columns_to_drop, axis=1)
unbalanced_y_test = unbalanced_test_df['label']
unbalanced_X_test = unbalanced_test_df.drop(less_feature_columns_to_drop, axis=1)

**Data Pollution Experiment**




In [None]:
print(augmented_X_train.shape,balanced_X_train.shape)

(33095, 11) (5516, 11)


In [None]:
run_all(augmented_X_train, augmented_y_train,balanced_X_test,balanced_y_test,lr=False, gbm=False, svm=False, knn=True, dt=True, xgb=False, ensemble=False, ffn=True, brown=False)
print("-----------------------------")
run_all(balanced_X_train, balanced_y_train,balanced_X_test,balanced_y_test,lr=False, gbm=False, svm=False, knn=True, dt=True, xgb=False, ensemble=False, ffn=True, brown=False)
print("-----------------------------")
run_all(unbalanced_X_train, unbalanced_y_train,balanced_X_test,balanced_y_test,lr=False, gbm=False, svm=False, knn=True, dt=True, xgb=False, ensemble=False, ffn=True, brown=False)
print("-----------------------------")
run_all(og_X_train, og_y_train,balanced_X_test,balanced_y_test,lr=False, gbm=False, svm=False, knn=True, dt=True, xgb=False, ensemble=False, ffn=True, brown=False)
print("-----------------------------")
run_all(og_no_pollution_X_train, og_no_pollution_y_train,balanced_X_test,balanced_y_test,lr=False, gbm=False, svm=False, knn=True, dt=True, xgb=False, ensemble=False, ffn=True, brown=False)
print("-----------------------------")

KNN accuracy: 27.70%
DT accuracy: 35.68%
FFN accuracy: 42.64%
-----------------------------
KNN accuracy: 24.95%
DT accuracy: 35.24%
FFN accuracy: 32.70%
-----------------------------
KNN accuracy: 15.45%
DT accuracy: 24.29%
FFN accuracy: 30.09%
-----------------------------
KNN accuracy: 58.81%
DT accuracy: 60.77%
FFN accuracy: 62.00%
-----------------------------
KNN accuracy: 26.69%
DT accuracy: 38.36%
FFN accuracy: 41.77%
-----------------------------


In [None]:
run_all(augmented_X_train, augmented_y_train,balanced_X_test,balanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(balanced_X_train, balanced_y_train,balanced_X_test,balanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(unbalanced_X_train, unbalanced_y_train,balanced_X_test,balanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_X_train, og_y_train,balanced_X_test,balanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_no_pollution_X_train, og_no_pollution_y_train,balanced_X_test,balanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")

LR accuracy: 41.12%
KNN accuracy: 27.48%
DT accuracy: 35.10%
XGBoost accuracy: 46.92%
-----------------------------
LR accuracy: 42.57%
KNN accuracy: 29.95%
DT accuracy: 38.00%
XGBoost accuracy: 44.16%
-----------------------------
LR accuracy: 23.71%
KNN accuracy: 14.21%
DT accuracy: 24.00%
XGBoost accuracy: 37.13%
-----------------------------
LR accuracy: 45.61%
KNN accuracy: 58.52%
DT accuracy: 60.19%
XGBoost accuracy: 60.84%
-----------------------------
LR accuracy: 40.97%
KNN accuracy: 26.25%
DT accuracy: 37.64%
XGBoost accuracy: 46.05%
-----------------------------


In [None]:
run_all(augmented_X_train, augmented_y_train,augmented_X_test,augmented_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(balanced_X_train, balanced_y_train,augmented_X_test,augmented_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(unbalanced_X_train, unbalanced_y_train,augmented_X_test,augmented_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_X_train, og_y_train,augmented_X_test,augmented_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_no_pollution_X_train, og_no_pollution_y_train,augmented_X_test,augmented_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")

LR accuracy: 43.12%
KNN accuracy: 26.24%
DT accuracy: 35.39%
XGBoost accuracy: 48.05%
-----------------------------
LR accuracy: 44.16%
KNN accuracy: 29.36%
DT accuracy: 35.91%
XGBoost accuracy: 44.55%
-----------------------------
LR accuracy: 23.94%
KNN accuracy: 15.12%
DT accuracy: 24.76%
XGBoost accuracy: 37.18%
-----------------------------
LR accuracy: 47.34%
KNN accuracy: 70.67%
DT accuracy: 69.60%
XGBoost accuracy: 68.24%
-----------------------------
LR accuracy: 42.49%
KNN accuracy: 27.19%
DT accuracy: 35.79%
XGBoost accuracy: 47.20%
-----------------------------


In [None]:
run_all(augmented_X_train, augmented_y_train,unbalanced_X_test,unbalanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(balanced_X_train, balanced_y_train,unbalanced_X_test,unbalanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(unbalanced_X_train, unbalanced_y_train,unbalanced_X_test,unbalanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_X_train, og_y_train,unbalanced_X_test,unbalanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_no_pollution_X_train, og_no_pollution_y_train,unbalanced_X_test,unbalanced_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")

LR accuracy: 41.04%
KNN accuracy: 22.77%
DT accuracy: 32.49%
XGBoost accuracy: 45.25%
-----------------------------
LR accuracy: 40.32%
KNN accuracy: 28.93%
DT accuracy: 27.05%
XGBoost accuracy: 37.49%
-----------------------------
LR accuracy: 33.65%
KNN accuracy: 22.70%
DT accuracy: 41.04%
XGBoost accuracy: 43.44%
-----------------------------
LR accuracy: 42.42%
KNN accuracy: 68.02%
DT accuracy: 68.09%
XGBoost accuracy: 68.17%
-----------------------------
LR accuracy: 41.70%
KNN accuracy: 26.47%
DT accuracy: 24.15%
XGBoost accuracy: 41.77%
-----------------------------


In [None]:
run_all(augmented_X_train, augmented_y_train,og_X_test,og_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(balanced_X_train, balanced_y_train,og_X_test,og_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(unbalanced_X_train, unbalanced_y_train,og_X_test,og_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_X_train, og_y_train,og_X_test,og_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")
run_all(og_no_pollution_X_train, og_no_pollution_y_train,og_X_test,og_y_test,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=False, brown=False)
print("-----------------------------")

LR accuracy: 43.17%
KNN accuracy: 57.40%
DT accuracy: 58.44%
XGBoost accuracy: 60.58%
-----------------------------
LR accuracy: 40.62%
KNN accuracy: 46.88%
DT accuracy: 50.10%
XGBoost accuracy: 53.73%
-----------------------------
LR accuracy: 20.05%
KNN accuracy: 24.03%
DT accuracy: 27.33%
XGBoost accuracy: 35.82%
-----------------------------
LR accuracy: 42.40%
KNN accuracy: 51.99%
DT accuracy: 52.30%
XGBoost accuracy: 55.02%
-----------------------------
LR accuracy: 42.47%
KNN accuracy: 45.09%
DT accuracy: 48.67%
XGBoost accuracy: 51.45%
-----------------------------


**Feature Comparison**

In [34]:
def run_all(train_df,test_df,lr,gbm,svm,knn,dt,xgb,ensemble,ffn,brown,extra_features=False):
    print(train_df.shape,test_df.shape)
    X_train, y_train, X_test, y_test = prepare_train_test_data(train_df,extra_features)
    og_X_train, og_y_train, og_X_test, og_y_test = prepare_train_test_data(test_df,extra_features)
    if lr:
        lr,lr_y_pred=run_lr(X_train, y_train,og_X_test,og_y_test)
    if gbm:
        gbm,gbm_y_pred=run_gbm(X_train, y_train,og_X_test,og_y_test)
    if svm:
        svm,svm_y_pred=run_svm(X_train, y_train,og_X_test,og_y_test)
    if knn:
        knn,knn_y_pred=run_knn(X_train, y_train,X_test,y_test)
    if dt:
        dt,dt_y_pred=run_dt(X_train, y_train,og_X_test,og_y_test)
    if xgb:
        xgb,xgb_y_pred=run_xgb(X_train, y_train,og_X_test,og_y_test)
    if ffn:
        ffn_y_pred=run_ffn(X_train, y_train,og_X_test,og_y_test)

In [49]:
def gen_extra_features_df():
  new_file_path = '/content/gdrive/MyDrive/CAD_Project/extra_features_data'
  extra_features_df = pd.read_csv(new_file_path)
  final_df = pd.read_csv('/content/gdrive/MyDrive/CAD_Project/extra_added_timeout_data_.csv')
  selected_columns = ['label'] + list(extra_features_df.columns[2:])
  filtered_df = extra_features_df[selected_columns]

  joined_df = pd.merge(final_df, extra_features_df, left_on='input_file', right_on='name', how='inner')
  joined_df=joined_df.drop(['Unnamed: 0', 'label_file', 'input_file','nr_polynomials', 'max_total_degree', 'max_x1', 'max_x2', 'max_x3', 'prop_x1', 'prop_x2', 'prop_x3', 'prop_mon_x1', 'prop_mon_x2', 'prop_mon_x3', 'label_x', 'file_id_y'],axis=1)
  feature_columns_to_drop = ['cells', 'time', 'timeout','name']
  joined_df = joined_df.drop(feature_columns_to_drop, axis=1)
  return joined_df

extra_features_df=gen_extra_features_df()

split_index = int(len(extra_features_df) * split_ratio)

og_train_df = extra_features_df[:split_index]
og_test_df = extra_features_df[split_index:]
feature_columns_to_drop = ['Unnamed: 0','cells', 'label', 'file_id_x', 'file_id_y', 'input_file', 'label_file', 'time', 'timeout']

y_train = og_train_df['label_y']
y_test = og_test_df['label_y']
X_train = og_train_df.drop(['label_y'], axis=1)
X_test = og_test_df.drop(['label_y'], axis=1)

In [51]:
run_all(extra_features_df, extra_features_df,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=True, brown=False,extra_features=True)
print("-----------------------------")

(41369, 44) (41369, 44)
LR accuracy: 49.09%
KNN accuracy: 56.56%
DT accuracy: 58.45%
XGBoost accuracy: 59.08%
FFN accuracy: 59.96%
-----------------------------


In [None]:
file_path = '/content/gdrive/MyDrive/CAD_Project/extra_added_timeout_data_.csv'

og_df = pd.read_csv(file_path)

run_all(og_df, og_df,lr=True, gbm=False, svm=False, knn=True, dt=True, xgb=True, ensemble=False, ffn=True, brown=False,extra_features=False)
print("-----------------------------")

(41369, 20) (41369, 20)
LR accuracy: 42.12%
KNN accuracy: 54.12%
DT accuracy: 54.44%
XGBoost accuracy: 56.55%
FFN accuracy: 57.60%
-----------------------------
