<h1>Computing performance metrices for the given data in different scenarios, without using sklearn library</h1>

<h3>1. Imbalanced dataset (number of positive points >> number of negative points)</h3>

In [50]:
# imports 
import numpy as np 
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [51]:
# reading the csv file
data = pd.read_csv('E:/GITHUB REPOS/performance-metrics-from-scratch/datasets/sample1.csv')
print(data.head(5))

     y     proba
0  1.0  0.637387
1  1.0  0.635165
2  1.0  0.766586
3  1.0  0.724564
4  1.0  0.889199


In [52]:
# printing number of positive(denoted by '1') and negative(denoted by '0') data points
print(Counter(data['y']))

Counter({1.0: 10000, 0.0: 100})


<h4>Function to predict class label of the dataset on the basis of threshold value <br> (If threshold value < 0.5, then treat the label as '0' else treat the label as '1') </h4>

In [53]:
def predict(data, probability, threshold):
  # empty list to store predicted class labels
  y_hat = []

  for prob_value in data[probability]:

     if prob_value < threshold:
       y_hat.append(0)
     else:
       y_hat.append(1)

  return y_hat

<h4><font color='red'>Confusion Matrix</font></h4>

<h4>Function to create confusion matrix (confusion matrix has tp,fp,tn,fn)<br>
where tp is true positive, fp is false positive, tn is true negative, fn is false negative</h4>

In [54]:
def confusion_matrix(data, y, y_hat):
  # initialize the values of the confusion matrix to zero
  tp = 0
  fp = 0
  tn = 0
  fn = 0

  for index,label in enumerate(data['y']):
    
    # if predicted class value is 1 and actual class value is 1, then its true positive
    if(data.y_hat[index]==1) and data.y[index]==1:  
      tp += 1

    # if predicted class value is 1 and actual class value is 0, then its false positive
    if(data.y_hat[index]==1) and data.y[index]==0:  
      fp += 1

    # if predicted class value is 0 and actual class value is 0, then its true negative
    if(data.y_hat[index]==0) and data.y[index]==0:  
      tn += 1

    # if predicted class value is 0 and actual class value is 1, then its false negative
    if(data.y_hat[index]==0) and data.y[index]==1:  
      fn += 1

  return {'tp':tp, 'fp':fp, 'tn':tn, 'fn':fn}

In [55]:
# threshold set to 0.5 (changes according the problem/business need)
threshold = 0.5

# calling the predict function and adding a y_hat column to the data
data['y_hat'] = predict(data, 'proba', threshold)

# updated dataset with y_hat values as well
print(data[:5])

# calling the confusion matrx function
conf_matrix = confusion_matrix(data, 'y', 'y_hat')  

# printing the confusion matrix
print('\nThe Confusion Matrix is - ', conf_matrix) 

     y     proba  y_hat
0  1.0  0.637387      1
1  1.0  0.635165      1
2  1.0  0.766586      1
3  1.0  0.724564      1
4  1.0  0.889199      1

The Confusion Matrix is -  {'tp': 10000, 'fp': 100, 'tn': 0, 'fn': 0}


<h4><font color='red'>F1 Score</font></h4>

<h4>For F1 score calculations, first we need to compute precision and recall values</h4>

In [56]:
precision = conf_matrix['tp']/(conf_matrix['tp'] + conf_matrix['fp'])
recall = conf_matrix['tp']/(conf_matrix['tp'] + conf_matrix['fn'])

# F1 score formula
F1_score = 2*precision*recall/(precision+recall)
print('F1 score is - ', F1_score)

F1 score is -  0.9950248756218906


<h4><font color='red'>Accuracy</font></h4>

<h4>Accuracy in terms of confusion matrix values is =><br>Total number of correct predictions(positive and negative)/Total number of data points in the dataset</h4>

In [57]:
accuracy = (conf_matrix['tp']+conf_matrix['tn'])/data.shape[0]
print('The accuracy is - ', accuracy)

The accuracy is -  0.9900990099009901


<h4><font color='red'>AUC Score</font></h4>

<h4>Function to calculate AUC Score</h4>

In [61]:
def auc_score(data):
  # empty lists to store true positive rate values and false positive rate values
  tpr, fpr = [], []
  
  for i in tqdm(data['proba']):

    data['y_hat'] = predict(data, 'proba', i)
    conf_matrix = confusion_matrix(data, 'y', 'y_hat') 

    # computing tpr and fpr
    tpr.append(conf_matrix['tp']/(conf_matrix['tp']+conf_matrix['fn']))
    fpr.append(conf_matrix['fp']/(conf_matrix['tn']+conf_matrix['fp']))

    data.drop(columns=['y_hat'])

  # calculates the AUC by using trapezium rule
  return np.trapz(tpr, fpr)

In [62]:
# sorting the given probability values in descending order
data = data.sort_values(by='proba', ascending=False)
data = data.drop(columns=['y_hat'])

In [63]:
# calling the auc function
AUCscore = auc_score(data)

# printing the AUC score
print('The AUC Score is - ', AUCscore)

100%|██████████| 10100/10100 [1:49:44<00:00,  1.53it/s] 

The AUC Score is -  0.48829900000000004





<h3>2. Imbalanced dataset (number of positive points << number of negative points)</h3>

<h4>There is no need to write the predict() and confusion_matrix() functions again. 
We will just replace the data2 in the place of data.</h4>

In [64]:
# reading the csv file
data2 = pd.read_csv('E:/GITHUB REPOS/performance-metrics-from-scratch/datasets/sample2.csv')
print(data2.head(5))

     y     proba
0  0.0  0.281035
1  0.0  0.465152
2  0.0  0.352793
3  0.0  0.157818
4  0.0  0.276648


In [65]:
# printing number of positive(denoted by '1') and negative(denoted by '0') data points
print(Counter(data2['y']))

Counter({0.0: 10000, 1.0: 100})


<h4><font color='red'>Confusion Matrix</font></h4>

In [66]:
threshold = 0.5
data2['y_hat'] = predict(data2, 'proba', threshold)
conf_matrix_2 = confusion_matrix(data2, 'y', 'y_hat')

# printing confusion matrix of data2
print('The confusion matrix is - ', conf_matrix_2)

The confusion matrix is -  {'tp': 55, 'fp': 239, 'tn': 9761, 'fn': 45}


<h4><font color='red'>F1 Score</font></h4>

In [67]:
precision_2 = conf_matrix_2['tp']/(conf_matrix_2['tp'] + conf_matrix_2['fp'])
recall_2 = conf_matrix_2['tp']/(conf_matrix_2['tp'] + conf_matrix_2['fn'])

F1_score_2 = 2*precision_2*recall_2/(precision_2+recall_2)
print('F1 score is - ', F1_score_2)

F1 score is -  0.2791878172588833


<h4><font color='red'>Accuracy</font></h4>

In [68]:
accuracy_2 = (conf_matrix_2['tp']+conf_matrix_2['tn'])/data2.shape[0]

print('The accuracy is - ', accuracy_2)

The accuracy is -  0.9718811881188119


<h4><font color='red'>AUC Score</font></h4>

In [69]:
data2 = data2.sort_values(by='proba', ascending=False)
data2 = data2.drop(columns=['y_hat'])

AUCscore_2 = auc_score(data2)

print('The AUC Score is - ', AUCscore_2)

100%|██████████| 10100/10100 [1:55:29<00:00,  1.46it/s] 

The AUC Score is -  0.9377570000000001





<h3>3. Calculating MSE, MAPE and R<sup>2</sup> error for a regression problem<br>MSE -> Mean Squared Error<br>MAPE -> Mean Absolute Percentage Error<br>R<sup>2</sup>-> R Squared Error</h3>

In [37]:
# reading the csv file
data3 = pd.read_csv('E:/GITHUB REPOS/performance-metrics-from-scratch/datasets/sample3.csv')
print(data3.head(5))

       y   pred
0  101.0  100.0
1  120.0  100.0
2  131.0  113.0
3  164.0  125.0
4  154.0  152.0


<h4>Defining a function to calculate error between the actual label (y) and predicted label (pred)</h4>

In [38]:
def error(data3, y, pred):
  error_value = []

  for idx, (y_value, pred_value) in enumerate(zip(data3[y], data3[pred])):
    error_value.append(y_value - pred_value)
  
  return error_value

In [39]:
# calling the error function and assigning it to newly created column of the dataset which is 'error' column
data3['error'] = error(data3, 'y', 'pred')

<h4><font color='red'>Mean Squared Error</font></h4>

In [41]:
# defining function to calculate mean square error
# using already computed error values as an argument in the function so that we dont need to calculate eᵢ again

def mean_square_error(data3, error_col):
  # initializing square error to zero
  square_error = 0  
  
  for error_value in data3[error_col]:
    # incrementing squared error
    square_error += (error_value*error_value)

  # mean calculated of squared error and returning MSE
  return square_error/len(data3[error_col])  

In [42]:
MSE = mean_square_error(data3, 'error')

# printing mean squared error
print('The Mean Squared Error is - ', MSE)  

The Mean Squared Error is -  177.16569974554707


<h4><font color='red'>Mean Absolute Percentage Error</font></h4>

In [43]:
# defining function to calculate absolute error
def absolute_error(data3, error_col):
  absolute_error_value = []

  for error_value in data3[error_col]:
    # storing the absolute values of eᵢ in list created
    absolute_error_value.append(abs(error_value))
  
  return absolute_error_value


# defining a function to calculate MAPE(Mean Absolute Percentage Error)
def mape(data3, abs_error, y):
  mape_value = sum(data3[abs_error])/sum(data3[y]) 
  return mape_value

In [45]:
# calling the absolute_error_function and assigning it to newly created column of the dataset which is 'absolute_error' column
data3['absolute_error'] = absolute_error(data3, 'error')

In [46]:
MAPE = mape(data3, 'absolute_error', 'y')

# printing mean absolute percentage error
print('The Mean Absolute Percentage Error is - ', MAPE)

The Mean Absolute Percentage Error is -  0.1291202994009687


<h4><font color='red'>R Squared Error</font></h4>

In [47]:
# defining a function to calculate SSᵣₑₛ(residual sum of squares)
def residual_ss(data3, error_col):
  
  # initializing the residual error to zero
  residual = 0

  for error_value in data3[error_col]:
    # incrementing the residual error
    residual += (error_value*error_value)  
  
  return residual


# defining a function to calculate SSₜₒₜₐₗ(total sum of squares)
def total_ss(data3, y_col): 
  
  # initializing the total error to zero
  total = 0  
  # mean calculation of actual values
  mean_y = data3['y'].mean()  
  
  for actual_value in data3[y_col]:
    # incrementing the total error
    total += (actual_value-mean_y)*(actual_value-mean_y)  
  
  return total

In [48]:
SS_residual = residual_ss(data3, 'error')
SS_total = total_ss(data3, 'y')

# formula of R² error
R_squared_error = 1 - (SS_residual/SS_total)  

# printing the R² error
print('R² error is - ', R_squared_error)  

R² error is -  0.9563582786990964
