# **Section 1 : Importing necessary files**



*   The 'train_loss.csv' file has an additional row containing the batch_name, which is omitted.
*   The first four columns in all the data files hold the values of the hyperparameters; they are also omitted because those values are taken from 'HP_space.csv'.



In [None]:
import pandas as pd
import numpy as np

# Hyperparameter table; rows are looked up by position further down.
hp_values = pd.read_csv('HP_space.csv')
# Skip the first data row and the first four columns of the train-loss file.
train_loss = pd.read_csv('train_loss.csv').iloc[1:,4:]
# The evaluation files keep every row; only the first four columns are skipped.
eval_loss = pd.read_csv('eval_loss.csv').iloc[:,4:]
eval_acc = pd.read_csv('eval_acc.csv').iloc[:,4:]


# **Section 2 : Handling Missing Values**

To handle missing values in the data files, NaN values are replaced with the mean of the column containing the respective NaN value.



In [9]:
def handleNaN(data):
  """Replace every NaN in ``data`` with the mean of its column.

  The mean is computed on a float cast of the column, so numeric values
  stored as strings are accepted. The cast is used for the mean only; the
  non-missing cells keep their original values.

  Args:
    data: DataFrame whose columns can all be cast to float.

  Returns:
    The same DataFrame object, with its missing values filled in.
  """
  for col in data.columns:
    mean_value = data[col].astype(float).mean()
    # Assign the filled column back instead of calling
    # fillna(..., inplace=True) on data[col]: that form operates on a
    # temporary Series and leaves `data` unchanged under pandas
    # Copy-on-Write, so the NaNs would silently survive.
    data[col] = data[col].fillna(mean_value)

  return data

# Fill the missing values of each loaded table with its per-column mean.
train_loss = handleNaN(train_loss)
eval_loss = handleNaN(eval_loss)
eval_acc = handleNaN(eval_acc)



In [10]:
# Record numbers of each split: every even number from 2 to 202 for
# training, and every fourth number starting at 3 (validation) and at 5
# (testing), up to 199 and 201 respectively.
train_records = list(range(2, 203, 2))
val_records = list(range(3, 200, 4))
test_records = list(range(5, 202, 4))



# **Section 3:Dataset Generation**
Here, we generate the training, validation and testing datasets according to the value of E provided to the function generate_dataset(E).

In [11]:
def generate_dataset(E, M=150):
  """Build the training, validation and testing datasets for a given E.

  Each row is made of, in order: the record's hyperparameters, the first
  E*50 train-loss values, the first E eval-loss values, the first E
  eval-accuracy values, the constant M, and the record's last
  eval-accuracy value, which ends up in the final column.

  Relies on the module-level hp_values, train_loss, eval_loss and eval_acc
  frames and on the train_records, val_records and test_records lists.

  Args:
    E: number of leading epochs whose values are taken as features.
    M: number of epochs for which the values have to be considered; it is
      appended to every row as a constant column. Defaults to 150.

  Returns:
    Tuple (train_dataset, val_dataset, test_dataset) of DataFrames whose
    column labels are strings.
  """

  def create_dataset(data_records):
    # Collect the one-row frames and concatenate them once at the end;
    # growing a DataFrame with pd.concat inside the loop copies all
    # previous rows on every iteration.
    rows = []
    for record_index in data_records:
      # Records are addressed positionally with record_index-1.
      position = record_index - 1
      hyperparameters = hp_values.iloc[position,:]
      # NOTE(review): the factor 50 is presumably the number of train-loss
      # values logged per epoch -- confirm against train_loss.csv.
      train_loss_values = train_loss.iloc[position,0:E*50]
      eval_loss_values = eval_loss.iloc[position,0:E]
      eval_acc_values = eval_acc.iloc[position,0:E]
      M_value = pd.Series([M])
      # Target: the last eval-accuracy value of the record.
      output_labels = pd.Series([eval_acc.iloc[position,-1]])
      row = pd.concat([hyperparameters,train_loss_values,eval_loss_values,eval_acc_values,M_value,output_labels],axis=0)
      rows.append(row.to_frame().T)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    dataset = pd.concat(rows,axis=0) if rows else pd.DataFrame()
    dataset.columns = dataset.columns.astype(str)

    return dataset


  train_dataset = create_dataset(train_records)
  val_dataset = create_dataset(val_records)
  test_dataset = create_dataset(test_records)

  return train_dataset,val_dataset,test_dataset





# **Section 4 : Splitting Input-Output Data for the model**

In [14]:
def data_split(train_dataset,val_dataset,test_dataset):
  """Separate each dataset into model inputs and target.

  For every frame, all columns but the last form the inputs (X) and the
  last column is the target (Y).

  Returns:
    X_train, X_val, X_test, Y_train, Y_test, Y_val -- note that the
    targets come back in train/test/val order, unlike the inputs.
  """
  def split_columns(frame):
    # (everything except the last column, the last column)
    return frame.iloc[:, :-1], frame.iloc[:, -1]

  train_inputs, train_target = split_columns(train_dataset)
  val_inputs, val_target = split_columns(val_dataset)
  test_inputs, test_target = split_columns(test_dataset)

  return train_inputs, val_inputs, test_inputs, train_target, test_target, val_target


# **Section 5 : SVR Hyperparameter Tuning**

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

def get_svm_hyperparameter(X_train,Y_train,X_val,Y_val):
  """Pick the SVR kernel and C with the lowest validation MSE.

  Every combination of kernel in ('linear', 'rbf') and C in
  (0.1, 1, 10, 100) is fitted on the training data and scored on the
  validation data. On a tie, the combination tried first is kept.

  Returns:
    Dict with the keys 'kernel' and 'C'.
  """
  def validation_error(kernel_name, penalty):
    # Fit one candidate and return its MSE on the validation split.
    candidate = svm.SVR(kernel=kernel_name,C=penalty)
    candidate.fit(X_train,Y_train)
    return mean_squared_error(Y_val,candidate.predict(X_val))

  # Kernel is the outer loop, C the inner one.
  search_grid = [(kernel_name, penalty)
                 for kernel_name in ('linear', 'rbf')
                 for penalty in (0.1, 1, 10, 100)]

  chosen = {}
  smallest_error = None
  for kernel_name, penalty in search_grid:
    current_error = validation_error(kernel_name, penalty)
    if smallest_error is None or current_error < smallest_error:
      smallest_error = current_error
      chosen = {'kernel': kernel_name, 'C': penalty}

  return chosen




# **Training SVM with optimal hyperparameters**

In [19]:
def svm_prediction(X_train,Y_train,X_val,Y_val,X_test,Y_test,params):
  """Fit an SVR on train+validation data and score it on the test data.

  Args:
    params: dict providing the 'kernel' and 'C' values for the SVR.

  Returns:
    Tuple (fitted regressor, test predictions, test mean squared error).
    The error is also printed.
  """
  regressor = svm.SVR(kernel=params['kernel'],C=params['C'])

  # Train on the training and validation rows together.
  fit_inputs = pd.concat([X_train,X_val])
  fit_targets = pd.concat([Y_train,Y_val])
  regressor.fit(fit_inputs,fit_targets)

  test_predictions = regressor.predict(X_test)
  test_error = mean_squared_error(Y_test,test_predictions)
  print(test_error)

  return regressor,test_predictions,test_error

0.003095184041613156


# **Section 6 : RF Regressor Hyperparameter Tuning**

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def get_rf_hyperparameter(X_train,Y_train,X_val,Y_val):
  """Pick the random-forest settings with the lowest validation MSE.

  Every combination of n_estimators in (10, 50, 100, 200) and criterion in
  ('squared_error', 'absolute_error', 'friedman_mse', 'poisson') is fitted
  on the training data with random_state=42 and scored on the validation
  data. On a tie, the combination tried first is kept.

  Returns:
    Dict with the keys 'n_estimators' and 'criterion'.
  """
  def validation_error(tree_count, split_criterion):
    # Fit one candidate forest and return its MSE on the validation split.
    candidate = RandomForestRegressor(n_estimators=tree_count,criterion=split_criterion,random_state=42)
    candidate.fit(X_train,Y_train)
    return mean_squared_error(Y_val,candidate.predict(X_val))

  # n_estimators is the outer loop, criterion the inner one.
  search_grid = [(tree_count, split_criterion)
                 for tree_count in (10, 50, 100, 200)
                 for split_criterion in ('squared_error', 'absolute_error', 'friedman_mse', 'poisson')]

  chosen = {}
  smallest_error = None
  for tree_count, split_criterion in search_grid:
    current_error = validation_error(tree_count, split_criterion)
    if smallest_error is None or current_error < smallest_error:
      smallest_error = current_error
      chosen = {'n_estimators': tree_count, 'criterion': split_criterion}

  return chosen

# **Training RF Regressor with optimal hyperparameters**

In [32]:
def rf_prediction(X_train,Y_train,X_val,Y_val,X_test,Y_test,params):
  """Fit a random forest on train+validation data and score it on the test data.

  Args:
    params: dict providing the 'n_estimators' and 'criterion' values.

  Returns:
    Tuple (fitted regressor, test predictions, test mean squared error).
    The error is also printed.
  """
  forest = RandomForestRegressor(
      n_estimators=params['n_estimators'],
      criterion=params['criterion'],
      random_state=42,
  )

  # Train on the training and validation rows together.
  fit_inputs = pd.concat([X_train,X_val])
  fit_targets = pd.concat([Y_train,Y_val])
  forest.fit(fit_inputs,fit_targets)

  test_predictions = forest.predict(X_test)
  test_error = mean_squared_error(Y_test,test_predictions)
  print(test_error)

  return forest,test_predictions,test_error


# **Section 7 : Training and Saving models**
This section trains and saves the model for varying values of E - representing the first E epochs from which the data has been taken.

**Naming Convention**

As an example, the model for E=10 is saved as 'rf_acc_model_10'.

In [None]:
import joblib

def train_save_models():
  """Train and save one random-forest model per E value.

  For each E in 5, 10, 20, 30 and 60 the datasets are generated, the
  forest hyperparameters are selected on the validation split, the model
  is refitted through rf_prediction (which prints its test error) and the
  fitted model is dumped with joblib under 'rf_acc_model_<E>'.
  """
  for e in (5, 10, 20, 30, 60):
    splits = generate_dataset(e)
    X_train,X_val,X_test,Y_train,Y_test,Y_val = data_split(*splits)

    tuned_params = get_rf_hyperparameter(X_train,Y_train,X_val,Y_val)
    # Only the fitted model is kept; predictions and error are discarded.
    fitted_model = rf_prediction(X_train,Y_train,X_val,Y_val,X_test,Y_test,tuned_params)[0]
    joblib.dump(fitted_model,'rf_acc_model_{}'.format(e))




# **Section 8 : Predictions from Saved Models**
For each of the E values, the corresponding model is loaded and accuracy predictions and mean squared error are computed and stored.

In [57]:
import joblib
# One entry per E value, in the order of E_values.
predictions=[]
error_values=[]
E_values=[5,10,20,30,60]
for e in E_values:
  # Rebuild the datasets for this E; only the test split is used below.
  train_dataset,val_dataset,test_dataset = generate_dataset(e)
  X_train,X_val,X_test,Y_train,Y_test,Y_val = data_split(train_dataset,val_dataset,test_dataset)
  # Saved models are named 'rf_acc_model_<E>'.
  model_file = 'rf_acc_model_' + str(e)
  loaded_model = joblib.load(model_file)
  model_predictions = loaded_model.predict(X_test)
  predictions.append(model_predictions)
  # Mean squared error of this model's predictions on the test targets.
  error_values.append(mean_squared_error(Y_test,model_predictions))



In [64]:

# Column labels, one per entry of predictions.
col_names = ['Epoch_5', 'Epoch_10', 'Epoch_20','Epoch_30','Epoch_60']
# predictions holds one array per E value; transpose so each becomes a column.
prediction_df = pd.DataFrame(predictions).T
prediction_df.columns = col_names
# Y_test is whatever the last executed cell left in scope.
# NOTE(review): presumably the test targets from the final E iteration -- confirm.
prediction_df['True Values']=Y_test.values

prediction_df

Unnamed: 0,Epoch_5,Epoch_10,Epoch_20,Epoch_30,Epoch_60,True Values
0,0.728412,0.738658,0.72837,0.72946,0.73007,0.7289
1,0.727006,0.727136,0.72636,0.72682,0.72875,0.7331
2,0.700672,0.69812,0.69021,0.68848,0.68565,0.6819
3,0.678042,0.691556,0.68955,0.69064,0.68884,0.6887
4,0.711532,0.699868,0.7036,0.70547,0.70446,0.7028
5,0.671492,0.668214,0.67334,0.66982,0.6729,0.6647
6,0.704708,0.701148,0.69623,0.69925,0.70182,0.7017
7,0.720222,0.719682,0.72483,0.7248,0.72729,0.718
8,0.634222,0.626928,0.59629,0.61204,0.6122,0.6069
9,0.67406,0.678736,0.67987,0.68665,0.6814,0.6804


In [59]:
# Display the test-set mean squared errors, one per E value.
error_values

[0.0003659964948344581,
 0.0002368833360881117,
 0.00013052221600000062,
 0.00012957563249830484,
 0.00011506668472716022]

# **Saving Predictions to CSV**

In [63]:
# Write the prediction table to CSV without the row index.
prediction_df.to_csv('Accuracy_predictions.csv',index=False)