# Compute performance metrics for the given Y and Y_score without sklearn

In [None]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# Function definitions required for A and B

#--------------------------------------------------------------------------------------------------------
#calculate predicted y by thresholding the probability value
def predicted_y(data,threshold=0.5,score_col='proba'):
  """Turn probability scores into hard 0/1 class labels.

  Parameters
  ----------
  data : pandas.DataFrame (or any mapping of column name -> array-like)
      Must contain the score column named by ``score_col``.
  threshold : float, default 0.5
      Scores greater than or equal to this value are labelled 1, all others 0.
  score_col : str, default 'proba'
      Name of the column that holds the scores. Exposed as a parameter so the
      same helper works for files whose score column has a different name.

  Returns
  -------
  numpy.ndarray
      Array of 0/1 labels, one per row of ``data``.
  """
  scores = np.asarray(data[score_col])
  # >= (not >): a score exactly on the threshold is labelled positive,
  # i.e. "0 if y_score < threshold else 1".
  return np.where(scores >= threshold, 1, 0)

#--------------------------------------------------------------------------------------------------------
# 1. Confusion Matrix
# here we will calculate 4 terms : TP , TN, FP, FN

# TP = No of points of positive class(1) which are predicted correctly as class 1
# TN = No of points of negative class(0) which are predicted correctly as class 0
# FP = No of points of negative class(0) which are predicted incorrectly as class 1
# FN = No of points of positive class(1) which are predicted incorrectly as class 0

def Confusion_Matrix(data):
  """Count the four confusion-matrix cells from the 'y' and 'pred_y' columns.

  Only rows whose actual label is 1 or 0 are counted. The counts are returned
  as plain ints in the order (TP, TN, FP, FN).
  """
  actual = data['y']
  predicted = data['pred_y']

  # Row-wise masks: was the prediction right, and which class is the row?
  correct = actual == predicted
  wrong = actual != predicted
  is_positive = actual == 1
  is_negative = actual == 0

  TP = int(np.count_nonzero(correct & is_positive))
  TN = int(np.count_nonzero(correct & is_negative))
  FP = int(np.count_nonzero(wrong & is_negative))
  FN = int(np.count_nonzero(wrong & is_positive))

  return TP,TN,FP,FN

#--------------------------------------------------------------------------------------------------------
# 2. F1 SCORE
#Here we will calculate precision and recall to calculate f1 score
#precision = TP/(TP+FP) and recall = TP/(TP+FN)

def Get_F1Score(TP,TN,FP,FN):
  """Compute precision, recall and F1 score from confusion-matrix counts.

  precision = TP / (TP + FP)
  recall    = TP / (TP + FN)
  F1        = 2 * precision * recall / (precision + recall)

  Any ratio whose denominator is zero (no predicted positives, no actual
  positives, or precision + recall == 0) is reported as 0.0 instead of
  raising ZeroDivisionError.

  TN is accepted but not used by the computation.

  Returns
  -------
  tuple
      (precision, recall, F1_score)
  """
  predicted_positives = TP + FP
  actual_positives = TP + FN

  precision = TP/predicted_positives if predicted_positives else 0.0
  recall = TP/actual_positives if actual_positives else 0.0

  pr_sum = precision + recall
  F1_score = (2 * precision * recall)/pr_sum if pr_sum else 0.0
  return precision,recall,F1_score

#--------------------------------------------------------------------------------------------------------
# 3. AUC Score
def ROC_AUC(data):
  """Area under the ROC curve for the 'y' labels and 'proba' scores of ``data``.

  Every distinct score is used as a threshold (a point is predicted positive
  when ``proba >= threshold``). For each threshold the true-positive rate and
  false-positive rate are computed, the (0, 0) origin is prepended so the
  curve starts where nothing is predicted positive, and the area is obtained
  with the trapezoidal rule, integrating TPR over FPR.

  The rates are derived from cumulative counts over the scores sorted in
  descending order, so TPR and FPR stay paired per threshold and the input
  frame is never modified.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'y' (labels 0/1) and 'proba' (scores).

  Returns
  -------
  tuple
      (AUC, FPR_list, TPR_list); both lists are in ascending order and start
      at 0.0.

  Raises
  ------
  ValueError
      If 'y' does not contain at least one 1 and at least one 0, because the
      rates are undefined in that case.
  """
  scores = np.asarray(data['proba'], dtype=float)
  labels = np.asarray(data['y'])

  pos_mask = labels == 1
  neg_mask = labels == 0
  n_pos = int(pos_mask.sum())
  n_neg = int(neg_mask.sum())
  if n_pos == 0 or n_neg == 0:
    raise ValueError("ROC AUC is undefined unless both classes (0 and 1) are present in 'y'")

  # Walk the points from highest to lowest score; the running totals are the
  # TP / FP counts obtained when the threshold equals the current score.
  order = np.argsort(scores)[::-1]
  ranked_scores = scores[order]
  tp_running = np.cumsum(pos_mask[order])
  fp_running = np.cumsum(neg_mask[order])

  # Equal scores share one threshold, so keep only the last row of each run.
  block_end = np.flatnonzero(np.append(ranked_scores[1:] != ranked_scores[:-1], True))

  TPR = np.concatenate(([0.0], tp_running[block_end] / n_pos))
  FPR = np.concatenate(([0.0], fp_running[block_end] / n_neg))

  # NumPy 2.0 renamed trapz to trapezoid; use whichever this install provides.
  integrate = getattr(np, 'trapezoid', None) or np.trapz
  AUC = float(integrate(TPR, FPR))
  return AUC,FPR.tolist(),TPR.tolist()

#--------------------------------------------------------------------------------------------------------
# 4. Accuracy
def Accuracy(TP,TN,FP,FN):
  """Share of all points whose predicted label matches the actual label."""
  correct = TP+TN
  total = correct+FP+FN
  return correct/total



In [None]:
#A - classification metrics for 5_a.csv

data_a = pd.read_csv("5_a.csv")

#predicting y from its scores at the default 0.5 threshold
pred_y = predicted_y(data_a)
data_a['pred_y'] = pred_y

#1. Confusion Matrix
TP,TN,FP,FN = Confusion_Matrix(data_a)
for label,count in (("TP : ",TP),("TN : ",TN),("FP : ",FP),("FN : ",FN)):
  print(label,count)

#2. F1 Score
precision,recall,F1_score = Get_F1Score(TP,TN,FP,FN)
for label,value in (("\nPrecision : ",precision),("Recall : ",recall),("F1 Score : ",F1_score)):
  print(label,value)

#3. AUC
AUC,FPR,TPR = ROC_AUC(data_a)
print("\nAUC : ",AUC)

#4. Accuracy
print("\nAccuracy : ",Accuracy(TP,TN,FP,FN))

TP :  10000
TN :  0
FP :  100
FN :  0

Precision :  0.9900990099009901
Recall :  1.0
F1 Score :  0.9950248756218906

AUC :  0.48829900000000004

Accuracy :  0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
#B - classification metrics for 5_b.csv

data_b = pd.read_csv("5_b.csv")

#predicting y from its scores at the default 0.5 threshold
pred_y = predicted_y(data_b)
data_b['pred_y'] = pred_y

#1. Confusion Matrix
TP,TN,FP,FN = Confusion_Matrix(data_b)
for label,count in (("TP : ",TP),("TN : ",TN),("FP : ",FP),("FN : ",FN)):
  print(label,count)

#2. F1 Score
precision,recall,F1_score = Get_F1Score(TP,TN,FP,FN)
for label,value in (("\nPrecision : ",precision),("Recall : ",recall),("F1 Score : ",F1_score)):
  print(label,value)

#3. AUC
AUC,FPR,TPR = ROC_AUC(data_b)
print("\nAUC : ",AUC)

#4. Accuracy
print("\nAccuracy : ",Accuracy(TP,TN,FP,FN))

TP :  55
TN :  9761
FP :  239
FN :  45

Precision :  0.1870748299319728
Recall :  0.55
F1 Score :  0.2791878172588833

AUC :  0.9377570000000001

Accuracy :  0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best probability threshold (in the same way thresholds are scanned for the ROC curve), i.e. the one that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
#C - load the data used to search for the threshold that minimises metric A

data_c = pd.read_csv("5_c.csv")

#5_c.csv names its score column 'prob' while 5_a.csv / 5_b.csv use 'proba'.
#This definition replaces the earlier one, so it accepts either column name
#and the cells for A and B can still be re-run after this cell has executed.
def predicted_y(data,threshold=0.5,score_col=None):
  """Turn probability scores into hard 0/1 class labels.

  Parameters
  ----------
  data : pandas.DataFrame (or any mapping of column name -> array-like)
      Must contain the score column.
  threshold : float, default 0.5
      Scores greater than or equal to this value are labelled 1, all others 0.
  score_col : str or None, default None
      Name of the score column. When None, 'prob' is used if ``data`` has
      such a column and 'proba' otherwise.

  Returns
  -------
  numpy.ndarray
      Array of 0/1 labels, one per row of ``data``.
  """
  if score_col is None:
    score_col = 'prob' if 'prob' in data else 'proba'
  scores = np.asarray(data[score_col])
  return np.where(scores >= threshold, 1, 0)

#get the best threshold value from all the unique thresholds
def get_Threshold(data,fn_cost=500,fp_cost=100):
  """Return the threshold that minimises A = fn_cost * FN + fp_cost * FP.

  Every distinct value of the 'prob' column is tried as a threshold, with a
  point predicted positive when ``prob >= threshold``. FN and FP for all
  thresholds are obtained from cumulative counts over the sorted scores, so
  the input frame is not modified.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'y' (labels 0/1) and 'prob' (scores).
  fn_cost : number, default 500
      Weight of each false negative in the metric A.
  fp_cost : number, default 100
      Weight of each false positive in the metric A.

  Returns
  -------
  float
      The best threshold. If several thresholds reach the same minimal A,
      the smallest of them is returned.

  Raises
  ------
  ValueError
      If ``data`` has no rows.
  """
  scores = np.asarray(data['prob'], dtype=float)
  labels = np.asarray(data['y'])
  if scores.size == 0:
    raise ValueError("cannot choose a threshold from an empty dataset")

  order = np.argsort(scores)
  ranked_scores = scores[order]
  ranked_labels = labels[order]

  # pos_below[k] / neg_below[k] = positives / negatives among the k lowest scores
  pos_below = np.concatenate(([0], np.cumsum(ranked_labels == 1)))
  neg_below = np.concatenate(([0], np.cumsum(ranked_labels == 0)))

  # first_idx[j] is the number of points scoring strictly below thresholds[j]
  thresholds,first_idx = np.unique(ranked_scores, return_index=True)

  FN = pos_below[first_idx]                  # positives scored below the threshold
  FP = neg_below[-1] - neg_below[first_idx]  # negatives scored at or above it
  A = fn_cost*FN + fp_cost*FP

  #getting min of A and corresponding threshold value
  return float(thresholds[np.argmin(A)])


#get best threshold
Best_Threshold = get_Threshold(data_c)
print("Best Threshold : ",Best_Threshold)

#predict y by using that best threshold
pred_y = predicted_y(data_c,Best_Threshold)
data_c['pred_y'] = pred_y

#calc performance metrics for the new predicted values
TP,TN,FP,FN = Confusion_Matrix(data_c)
for label,count in (("TP : ",TP),("TN : ",TN),("FP : ",FP),("FN : ",FN)):
  print(label,count)

precision,recall,F1_score = Get_F1Score(TP,TN,FP,FN)
for label,value in (("\nPrecision : ",precision),("Recall : ",recall),("F1 Score : ",F1_score)):
  print(label,value)

print("Accuracy : ",Accuracy(TP,TN,FP,FN))

Best Threshold :  0.2300390278970873
TP :  969
TN :  785
FP :  1020
FN :  78

Precision :  0.48717948717948717
Recall :  0.9255014326647565
F1 Score :  0.6383399209486166
Accuracy :  0.615007012622721


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [None]:
#D -


# D - 1. MEAN SQUARE ERROR
# Load the regression data; the metric functions below read its 'y' and 'pred' columns.
data_d = pd.read_csv("5_d.csv")
def mean_sqaure_error(data):
  """Mean squared error between the 'y' and 'pred' columns of ``data``.

  The columns are read positionally as arrays, so the result does not depend
  on the frame's index labels.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'y' (actual values) and 'pred' (predictions).

  Returns
  -------
  float
      mean((y - pred) ** 2)

  Raises
  ------
  ValueError
      If ``data`` has no rows.
  """
  actual = np.asarray(data['y'], dtype=float)
  predicted = np.asarray(data['pred'], dtype=float)
  if actual.size == 0:
    raise ValueError("cannot compute the mean squared error of an empty dataset")

  residual = actual - predicted
  return float(np.mean(residual * residual))

# Evaluate the mean squared error on the 5_d.csv data and report it.
mse = mean_sqaure_error(data_d)
print("Mean Sqaure Error : ",mse)


Mean Sqaure Error :  177.16569974554707


In [None]:
# D - 2. MEAN ABSOLUTE PERCENTAGE ERROR
def get_MAPE(data):
  """Mean absolute percentage error between the 'y' and 'pred' columns.

  Uses the aggregate form

      MAPE = sum(|y - pred|) / sum(y) * 100

  i.e. the total absolute error relative to the total of the actual values,
  rather than the mean of per-point ratios, so individual zeros in 'y' do
  not cause a division by zero. The columns are read positionally as arrays,
  so the result does not depend on the frame's index labels.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'y' (actual values) and 'pred' (predictions).

  Returns
  -------
  float
      The error expressed as a percentage.

  Raises
  ------
  ValueError
      If the actual values sum to zero (including empty input), because the
      ratio is undefined.
  """
  actual = np.asarray(data['y'], dtype=float)
  predicted = np.asarray(data['pred'], dtype=float)

  err_sum = np.abs(actual - predicted).sum()
  actuals = actual.sum()
  if actuals == 0:
    raise ValueError("MAPE is undefined when the actual values sum to zero")

  return float(err_sum/actuals)*100

# Evaluate MAPE (as a percentage) on the 5_d.csv data and report it.
MAPE = get_MAPE(data_d)
print("MAPE : ",MAPE)

MAPE :  12.91202994009687


In [None]:
# D - 3. R^2 ERROR
def R_sq_error(data):
  """Coefficient of determination (R^2) for the 'y' and 'pred' columns.

  R^2 = 1 - SSres / SStotal, where
    SStotal = sum((y - mean(y)) ** 2)
    SSres   = sum((y - pred) ** 2)

  The intermediate values and the result are printed, and the result is also
  returned so it can be reused. The columns are read positionally as arrays,
  so the result does not depend on the frame's index labels.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'y' (actual values) and 'pred' (predictions).

  Returns
  -------
  float
      R^2, or NaN when SStotal is zero (R^2 is undefined in that case).
  """
  actual = np.asarray(data['y'], dtype=float)
  predicted = np.asarray(data['pred'], dtype=float)

  #step 1 - calc mean of y_actuals
  y_mean = np.mean(actual)
  print("Mean of Y: ",y_mean)

  #step 2 - calc SStotal
  SS_t = np.sum((actual - y_mean)**2)
  print("SStotal: ",SS_t)

  #step 3 - calc SSres
  SS_r = np.sum((actual - predicted)**2)
  print("SSres : ",SS_r)

  #step 4 - cal R^2 error
  R_sq = float(1 - (SS_r/SS_t)) if SS_t != 0 else float('nan')
  print("R Sqaure Error: ",R_sq)
  return R_sq

# Print the mean of y, SStotal, SSres and R^2 for the 5_d.csv data.
R_sq_error(data_d)
  

Mean of Y:  66.56208651399491
SStotal:  638161080.035662
SSres :  27850448.0
R Sqaure Error:  0.9563582786990964
