<a href="https://colab.research.google.com/github/shellfish1/API/blob/master/Copy_of_Problem1_A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import numpy as np
import sklearn as skl

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow import keras
from keras.layers import Dense
from keras import metrics
from keras.utils import to_categorical
from matplotlib import pyplot as plt
from keras.models import Sequential


In [None]:
def generate_data(n, x1_low, x1_high, x2_low, x2_high, ulc1, ulc2, ulc3, side, fpp, fnp):
    """Sample ``n`` random 2-D points and label each one with ``label_point``.

    The first coordinate of every point is drawn uniformly between ``x1_low``
    and ``x1_high``, the second between ``x2_low`` and ``x2_high``.  The
    remaining arguments (``ulc1``, ``ulc2``, ``ulc3``, ``side``, ``fpp``,
    ``fnp``) are forwarded unchanged to ``label_point``.

    Returns:
        A ``(features, labels)`` pair: ``features`` has shape ``(n, 2)`` and
        ``labels`` has shape ``(n, 1)``.
    """
    lower_bounds = [x1_low, x2_low]
    upper_bounds = [x1_high, x2_high]
    features = np.random.uniform(low=lower_bounds, high=upper_bounds, size=(n, 2))

    labels = np.empty([n, 1])
    # Label the points one at a time, in row order.
    for row, (x1, x2) in enumerate(features):
        labels[row] = label_point(x1, x2, side, ulc1, ulc2, ulc3, fpp, fnp)

    return features, labels


def label_point(x1, x2, side, ulc1, ulc2, ulc3, fpp, fnp):
    """Return a noisy 0/1 label for the point ``(x1, x2)``.

    The clean label is 1 when the point lies inside (edges included) any of
    three axis-aligned squares, and 0 otherwise.  Each square is described by
    its upper-left corner ``ulcK = [x, y]`` and the shared edge length
    ``side``.  The clean label is then kept with a given percent chance and
    flipped otherwise.

    Args:
        x1, x2: Coordinates of the point.
        side: Edge length of every square.
        ulc1, ulc2, ulc3: Upper-left corners of the three squares.
        fpp: Percent chance (0-100) that a point inside a square keeps
            label 1; otherwise it is flipped to 0.
        fnp: Percent chance (0-100) that a point outside every square keeps
            label 0; otherwise it is flipped to 1.

    Returns:
        0 or 1.
    """
    def in_square(corner):
        left, top = corner[0], corner[1]
        return left <= x1 <= left + side and top - side <= x2 <= top

    inside = any(in_square(corner) for corner in (ulc1, ulc2, ulc3))
    clean_label = 1 if inside else 0
    keep_percent = fpp if inside else fnp

    # randrange(100) yields 0..99, so a percent of 100 always keeps the clean
    # label and a percent of 0 always flips it.  randint(0, 100) has 101
    # outcomes and would still flip about 1% of labels at "100 percent".
    if random.randrange(100) < keep_percent:
        return clean_label
    return 1 - clean_label


def plot_data(features, labels):
    """Scatter-plot the points, styled by label, and show the figure.

    Points whose label equals 1 are drawn as red "o" markers; every other
    point is drawn as a green "x" marker.
    """
    # Each bucket is an (x1 values, x2 values) pair of lists.
    label_one = ([], [])
    label_other = ([], [])

    for point, sign in zip(features, labels):
        xs, ys = label_one if sign == 1 else label_other
        xs.append(point[0])
        ys.append(point[1])

    plt.plot(label_one[0], label_one[1], marker="o", markersize=5,
             markeredgecolor="red", markerfacecolor="red", linestyle='')
    plt.plot(label_other[0], label_other[1], marker="x", markersize=5,
             markeredgecolor="green", markerfacecolor="green", linestyle='')
    plt.show()


In [None]:
def predict_threshold_model(model, test_features, test_labels):
    """Pick the decision threshold with the best ROC AUC on the given data.

    The model is scored once on ``test_features``.  Its outputs are then
    binarised at each candidate threshold from 0.2 to 0.8 (step 0.05) and the
    ROC AUC of the resulting hard labels against ``test_labels`` is computed.
    The ROC AUC of the raw outputs, the per-threshold scores and the chosen
    threshold are printed.

    Args:
        model: Object exposing ``predict(features)``.
        test_features: Inputs to score.
        test_labels: True labels for ``test_features``.

    Returns:
        The candidate threshold with the highest score; ties resolve to the
        largest tied threshold.
    """
    # Predict once and reuse the outputs for every candidate threshold.
    probabilities = model.predict(test_features)
    print("ROC Score: ", skl.metrics.roc_auc_score(test_labels, probabilities))

    candidate_thresholds = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]
    best_roc_score = 0
    best_threshold = 0
    test_roc = []
    for t in candidate_thresholds:
        # Binarise at the current candidate (previously a fixed 0.2, which
        # made every candidate score identically).
        label_predictions = np.where(probabilities > t, 1, 0)
        score = skl.metrics.roc_auc_score(test_labels, label_predictions)
        test_roc.append(score)
        if score >= best_roc_score:
            best_threshold = t
            best_roc_score = score

    print("All ROC scores: ", test_roc)
    print("Best threshold is: ", best_threshold)
    return best_threshold


def predict_using_kfold_model(models, test_features, thresholds):
    """Combine the models' thresholded predictions by majority vote.

    Model ``j`` votes 1 for a sample when column 0 of its ``predict`` output
    is strictly greater than ``thresholds[j]``, and 0 otherwise.  A sample is
    labelled 1 only when the 1-votes strictly outnumber the 0-votes, so a tie
    yields 0.

    Returns:
        Array of shape ``(len(test_features), 1)`` holding 0.0 or 1.0.
    """
    n_samples = len(test_features)
    n_models = len(models)

    # One column of hard 0/1 votes per model.
    votes = np.empty([n_samples, n_models])
    for col, model in enumerate(models):
        hard_labels = np.where(model.predict(test_features) > thresholds[col], 1, 0)
        votes[:, col] = hard_labels[:, 0]

    ones_per_sample = (votes == 1).sum(axis=1)
    zeroes_per_sample = n_models - ones_per_sample
    majority = np.where(ones_per_sample > zeroes_per_sample, 1.0, 0.0)
    return majority.reshape(n_samples, 1)

def compute_scores(labels_true, labels_pred):
    """Score predicted labels against the true labels.

    Returns:
        A three-element list: ``[accuracy, balanced accuracy, ROC AUC]``.
    """
    plain_accuracy = skl.metrics.accuracy_score(labels_true, labels_pred, normalize=True)
    balanced_accuracy = skl.metrics.balanced_accuracy_score(labels_true, labels_pred)
    roc_auc = skl.metrics.roc_auc_score(labels_true, labels_pred)
    return [plain_accuracy, balanced_accuracy, roc_auc]


def train_model(total_features, total_labels, l1_n, l2_n, *, n_splits=2, epochs=100, batch_size=10):
    """Train one small classifier per K-fold split and score the combined vote.

    For every split a network with two inputs, a tanh hidden layer of
    ``l1_n`` units, an optional second tanh hidden layer of ``l2_n`` units
    and a single sigmoid output is fitted on the training part.  A decision
    threshold for that network is then chosen on the held-out part with
    ``predict_threshold_model``.

    The trained networks are finally combined with
    ``predict_using_kfold_model`` over *all* of ``total_features``, so the
    returned scores include samples each network was trained on.

    Args:
        total_features: Array of input points, indexable by the fold indices.
        total_labels: Array of labels aligned with ``total_features``.
        l1_n: Number of units in the first hidden layer.
        l2_n: Number of units in the second hidden layer; 0 omits the layer.
        n_splits: Number of K-fold splits (and therefore of networks).
        epochs: Training epochs per network.
        batch_size: Mini-batch size used for fitting.

    Returns:
        ``(models, scores)`` where ``models`` is the list of trained networks
        and ``scores`` is the list returned by ``compute_scores``.
    """
    kf = KFold(n_splits=n_splits)
    models = []
    thresholds = []

    for train_index, test_index in kf.split(total_features, total_labels):
        training_features, test_features = total_features[train_index], total_features[test_index]
        training_labels, test_labels = total_labels[train_index], total_labels[test_index]

        classifier = Sequential()
        classifier.add(Dense(l1_n, activation='tanh', input_dim=2))
        if l2_n != 0:
            # The second hidden layer is optional.
            classifier.add(Dense(l2_n, activation='tanh'))
        classifier.add(Dense(1, activation='sigmoid'))

        classifier.compile(optimizer='adam', loss='binary_crossentropy', run_eagerly=True, metrics=['accuracy'])
        classifier.fit(training_features, training_labels, batch_size=batch_size, epochs=epochs)

        models.append(classifier)
        # Choose this network's decision threshold on its held-out fold.
        thresholds.append(predict_threshold_model(classifier, test_features, test_labels))

    # Combine the per-fold networks over the full data set and score the result.
    labels_pred = predict_using_kfold_model(models, total_features, thresholds)
    scores = compute_scores(total_labels, labels_pred)

    return models, scores


In [None]:

def main():
    """Generate the data sets, train a single configuration and print its scores.

    A 1500-point training set (keep percentages 96 / 98) and a 100000-point
    validation set (keep percentages 100 / 100) are generated over the same
    three squares.  One network with a single hidden unit and no second
    hidden layer is trained, and its scores are printed.
    """
    # Sampling window: x1 between -6 and 6, x2 between -4 and 4.
    bounds = (-6, 6, -4, 4)
    # Upper-left corners of the three squares and their shared edge length.
    corners = ([-4, 3], [-2, -1], [2, 1])
    side = 3

    total_features, total_labels = generate_data(1500, *bounds, *corners, side, 96, 98)
    # NOTE(review): the validation set is generated but not used below yet.
    validation_features, validation_labels = generate_data(100000, *bounds, *corners, side, 100, 100)

    # Hidden-layer sizes for the planned sweep: h1 in {1, 4, 12}, h2 in {0, 3}.
    h1 = [1, 4, 12]
    h2 = [0, 3]

    model, scores = train_model(total_features, total_labels, 1, 0)
    print(scores)

    # Draft of the full experiment sweep, kept disabled.
    # NOTE(review): before enabling, (1) predict_using_kfold_model takes a
    # third `thresholds` argument that the calls below do not pass, and
    # (2) the FIG2 branch trains on total_features1 / total_labels1 rather
    # than the FIG2 data it just generated.
    # results = []
    # # h1 | h2 | D | FIG | TEST ROC | TEST ACC | TEST BAL ACC | VAL ROC | VAL ACC | VAL BAL ACC #
    # for l1_n in h1:
    #     for l2_n in h2:
    #       for n in [250, 1000, 10000]:
    #         for i in [1,2]:

    #           row = np.empty(10)
    #           row[0] = l1_n
    #           row[1] = l2_n
    #           row[2] = n
    #           row[3] = i

    #           if i == 1:
    #             ## FOR FIG1
    #             total_features1, total_labels1 = generate_data(n, -6, 6, -4, 4, [-4, 3], [-2, -1], [2, 1], 3, 96, 98)
    #             validation_features1, validation_labels1 = generate_data(100000, -6, 6, -4, 4, [-4, 3], [-2, -1], [2, 1], 3, 100, 100)
    #             model1, scores1 = train_model(total_features1, total_labels1, l1_n, l2_n)

    #             row[4] = scores1[0]
    #             row[5] = scores1[1]
    #             row[6] = scores1[2]

    #             validation_labels_pred = predict_using_kfold_model(model1, validation_features1)
    #             validation_scores = compute_scores(validation_labels1, validation_labels_pred)

    #             row[7] = validation_scores[0]
    #             row[8] = validation_scores[1]
    #             row[9] = validation_scores[2]
    #           else:
    #             ## FOR FIG2
    #             total_features2, total_labels2 = generate_data(n, -6, 6, -4, 4, [-4, 3], [-1, -2], [2, 0], 1, 96, 98)
    #             validation_features2, validation_labels2 = generate_data(100000, -6, 6, -4, 4, [-4, 3], [-1, -2], [2, 0], 1, 100, 100)
    #             model2, scores2 = train_model(total_features1, total_labels1, l1_n, l2_n)

    #             row[4] = scores2[0]
    #             row[5] = scores2[1]
    #             row[6] = scores2[2]

    #             validation_labels_pred = predict_using_kfold_model(model2, validation_features2)
    #             validation_scores = compute_scores(validation_labels2, validation_labels_pred)

    #             row[7] = validation_scores[0]
    #             row[8] = validation_scores[1]
    #             row[9] = validation_scores[2]

    #           results.append(row)

    # print(results)


# Run the experiment only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()