# Compute performance metrics for the given Y and Y_score without sklearn

# Task-A

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/gdrive')


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# reading the data for task -A
data = pd.read_csv('/gdrive/My Drive/7_Compute Performance metrics without Sklearn/5_a.csv')


In [None]:
data.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [None]:
# distinct value counts of target column
data['y'].value_counts()

1.0    10000
0.0      100
Name: y, dtype: int64

In [None]:
# How many datapoints have probability values less than 0.5?
len(data[data['proba']<0.5])

0

### Observations:
1. No data point in `data` has a `proba` value below 0.5, so with the 0.5 cut-off every point will be predicted as positive.

In [None]:
# Derive the class label from the score: 0 if proba < 0.5 else 1.
# Written as NOT(proba < 0.5) so the rule stays literally "0 when below the
# cut-off, 1 otherwise"; the explicit int64 cast matches an integer column.
data['predicted'] = (~(data['proba'] < 0.5)).astype('int64')

# how many points ended up in each predicted class
print(data['predicted'].value_counts())

1    10100
Name: predicted, dtype: int64


### Observations:
1. `actual -y `: value_counts :- 
      
       +ve: 10000
       -ve:   100
2. `predicted` : value_counts :-

        +ve: 10100
        -ve:     0

### 1.Confusion Matrix

In [None]:
# definition of confusion matrix:
def confusionMatrix(y, predicted):
    """Count the four outcomes of a binary classifier.

    Each position pairs one actual label with one predicted label:

    * TP: actual 1, predicted 1
    * FP: actual 0, predicted 1
    * FN: actual 1, predicted 0
    * TN: actual 0, predicted 0

    Pairs in which either value is neither 0 nor 1 are not counted.

    Parameters
    ----------
    y : array-like
        Actual labels (0/1, integer or float).
    predicted : array-like
        Predicted labels, with the same number of elements as ``y``.

    Returns
    -------
    tuple of int
        ``(tn, fp, fn, tp)`` as plain Python integers.

    Raises
    ------
    ValueError
        If ``y`` and ``predicted`` do not hold the same number of elements.
    """
    # Flatten first so that column-shaped inputs (n, 1) are compared sample by
    # sample instead of only looking at the first row.
    actual = np.asarray(y).ravel()
    guess = np.asarray(predicted).ravel()
    if actual.shape != guess.shape:
        raise ValueError("y and predicted must have the same number of elements")

    actual_pos = actual == 1
    actual_neg = actual == 0
    guess_pos = guess == 1
    guess_neg = guess == 0

    # One linear pass per outcome; int() keeps the return type uniform even
    # when an outcome never occurs.
    tn = int(np.count_nonzero(actual_neg & guess_neg))
    fp = int(np.count_nonzero(actual_neg & guess_pos))
    fn = int(np.count_nonzero(actual_pos & guess_neg))
    tp = int(np.count_nonzero(actual_pos & guess_pos))

    return tn, fp, fn, tp


In [None]:
# inputs for confusion matrix
y = np.array(data['y'])
predicted = np.array(data['predicted'])

# result:
tn,fp,fn,tp= confusionMatrix(y,predicted)
print(" True neg  = {} \n False pos = {} \n False neg = {} \n True pos  = {} ".format(tn, fp, fn, tp ))

 True neg  = 0 
 False pos = 100 
 False neg = 0 
 True pos  = 10000 


### 2.F1_Score:

In [None]:
# function to find f1-score:
def f1Score(y, predicted):
    """Return the F1 score (harmonic mean of precision and recall).

    Parameters
    ----------
    y : array-like
        Actual 0/1 labels.
    predicted : array-like
        Predicted 0/1 labels, same length as ``y``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there are no true positives.
    """
    _, fp, fn, tp = confusionMatrix(y, predicted)
    if tp == 0:
        # Without a single true positive, precision and recall are each 0 or
        # 0/0, so the harmonic mean cannot be evaluated; report the worst
        # score instead of dividing by zero.
        return 0.0
    pre = tp / (tp + fp)
    rec = tp / (tp + fn)
    return 2 * ((pre * rec) / (pre + rec))

In [None]:
print("F1_score = ",f1Score(y,predicted))

F1_score =  0.9950248756218906


### Observations:
1. The F1-score (harmonic mean of precision and recall) is `high`:
precision and recall both depend mainly on the `true positive` count, and in this dataset almost all points are positive and are predicted as positive.


### roc Curve

In [None]:
def rocCurve(y, scores):
    """Compute the ROC points obtained by using every distinct score as threshold.

    For a threshold ``t`` a sample is predicted positive when its score is
    ``>= t`` (i.e. 0 if score < t else 1).  Thresholds are visited from the
    highest distinct score down to the lowest, so both returned lists are
    non-decreasing and end at 1.0.  The origin (0, 0) is not added.

    Parameters
    ----------
    y : array-like
        Actual 0/1 labels.
    scores : array-like
        Predicted scores, same number of elements as ``y``.

    Returns
    -------
    (list of float, list of float)
        ``(fpr_array, tpr_array)``, one entry per distinct score; two empty
        lists for empty input.

    Raises
    ------
    ValueError
        If the inputs differ in length, or if ``y`` lacks either a positive
        or a negative label (tpr or fpr would be 0/0).
    """
    labels = np.asarray(y).ravel()
    score_arr = np.asarray(scores).ravel()
    if labels.shape != score_arr.shape:
        raise ValueError("y and scores must have the same number of elements")
    if labels.size == 0:
        return [], []

    # distinct scores in ascending order; level_of[i] is the position of
    # sample i's score inside `distinct`
    distinct, level_of = np.unique(score_arr, return_inverse=True)
    n_levels = distinct.size

    # number of positives / negatives sitting exactly on each distinct score
    pos_per_level = np.bincount(
        level_of, weights=(labels == 1).astype(float), minlength=n_levels
    )
    neg_per_level = np.bincount(
        level_of, weights=(labels == 0).astype(float), minlength=n_levels
    )

    total_pos = pos_per_level.sum()
    total_neg = neg_per_level.sum()
    if total_pos == 0 or total_neg == 0:
        raise ValueError(
            "ROC curve needs at least one positive and one negative label in y"
        )

    # Lowering the threshold to a score level turns every sample on that level
    # into a positive prediction, so running totals from the highest level
    # downwards are exactly the tp / fp counts for each threshold.
    tp = np.cumsum(pos_per_level[::-1])
    fp = np.cumsum(neg_per_level[::-1])

    fpr_array = (fp / total_neg).tolist()
    tpr_array = (tp / total_pos).tolist()
    return fpr_array, tpr_array


In [None]:
# inputs for areaUnderCurve
y = np.array(data['y'])
scores = np.array(data['proba'])
# result:
fpr,tpr= rocCurve(y,scores)


HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))




### 3.Area under curve

In [None]:
# area under curve :
print("Area under curve = ",np.trapz(np.array(tpr),np.array(fpr)))

Area under curve =  0.48829900000000004


### Observations:
1. tpr increases slowly as fpr increases, which gives a low `area under curve`.
2. Initially, at the highest threshold, only the row with the highest score is
   predicted as 1 and all other rows are predicted as 0 `(this gives tpr close to '0'
   (tp=1, fn=high) and fpr also close to '0' (fp=0, tn=high))`.
3. Finally, at the lowest threshold, every row is predicted as 1
   `(this gives tpr == 1 (tp=high, fn=0) and fpr == 1 (fp=high, tn=0))`.

### 4.Accuracy Score

In [None]:
# funtion to find accuracy score
def accuracyScore(y,predicted):
  """Return the share of samples whose predicted label equals the actual one."""
  true_neg, false_pos, false_neg, true_pos = confusionMatrix(y, predicted)
  # correctly classified samples over all counted samples
  correct = true_neg + true_pos
  counted = true_neg + true_pos + false_neg + false_pos
  return correct / counted

# accuracy score
print("Accuracy score = ",accuracyScore(y,predicted))

Accuracy score =  0.9900990099009901


# Task-B

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
# read task-b data.
data2 = pd.read_csv('/gdrive/My Drive/7_Compute Performance metrics without Sklearn/5_b.csv')

In [None]:
data2['y'].value_counts()

0.0    10000
1.0      100
Name: y, dtype: int64

In [None]:
# How many datapoints have probability values less than 0.5?
# The mask must come from data2 itself: a mask built from the task-A frame
# answers the question for the wrong dataset.
len(data2[data2['proba']<0.5])

0

In [None]:
# Derive the class label from the score: 0 if proba < 0.5 else 1.
# Written as NOT(proba < 0.5) so the rule stays literally "0 when below the
# cut-off, 1 otherwise"; the explicit int64 cast matches an integer column.
data2['predicted'] = (~(data2['proba'] < 0.5)).astype('int64')

# how many points ended up in each predicted class
print(data2['predicted'].value_counts())

0    9806
1     294
Name: predicted, dtype: int64


### Observations:
1. `actual -y `: value_counts :- 
      
       +ve:   100
       -ve: 10000
2. `predicted` : value_counts :-

        +ve:   294
        -ve:  9806

### 1. Confusion Matrix

In [None]:
# inputs for confusion matrix and rocCurve
y = np.array(data2['y'])
predicted = np.array(data2['predicted'])

# result:
tn,fp,fn,tp= confusionMatrix(y,predicted)
print(" True neg  = {} \n False pos = {} \n False neg = {} \n True pos  = {} ".format(tn, fp, fn, tp ))

 True neg  = 9761 
 False pos = 239 
 False neg = 45 
 True pos  = 55 


### 2. F1-score:

In [None]:
print("F1_score = ",f1Score(y,predicted))

F1_score =  0.2791878172588833


### Observations:
1. The f1_score is `low`
 because precision and recall are low (both depend mainly on the `true positive` count, and in this dataset most of the points are negative).

### 3.Auc

In [None]:
# inputs for rocCurve
y = np.array(data2['y'])
scores = np.array(data2['proba'])

In [None]:
# result:
fpr,tpr= rocCurve(y,scores)

HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))




In [None]:
# area under curve :
print("Area under curve = ",np.trapz(np.array(tpr),np.array(fpr)))

Area under curve =  0.9377570000000001


### Observations:
1. tpr increases very quickly as fpr increases, which gives a high `area under curve`


### 4.Accuracy score

In [None]:
# accuracy score
print("Accuracy score = ",accuracyScore(y,predicted))

Accuracy score =  0.9718811881188119


### Observations:

1. The accuracy scores for both task-A and task-B are high because the classes are extremely imbalanced (almost all points are positive in task-A and almost all are negative in task-B; they are not equally distributed).

# Task -C

<font color='red'><b>C.</b></font> Compute the best threshold of probability (similarly to the ROC curve computation) which gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
# reading task-c data
data3 = pd.read_csv('/gdrive/My Drive/7_Compute Performance metrics without Sklearn/5_c.csv')

In [None]:
data3.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [None]:
# value counts of class
data3['y'].value_counts()

0    1805
1    1047
Name: y, dtype: int64

In [None]:
# inputs for roc_threshold
y = np.array(data3['y'])
scores = np.array(data3['prob'])

In [None]:
def roc_threshold(y, scores, fn_cost=500, fp_cost=100):
    """Compute ROC points and the threshold that minimises a misclassification cost.

    Every distinct score is tried as a threshold, from the highest down to
    the lowest.  For a threshold ``t`` a sample is predicted positive when
    its score is ``>= t`` (i.e. 0 if score < t else 1).  The cost of a
    threshold is::

        A = fn_cost * (number of false negatives)
          + fp_cost * (number of false positives)

    Parameters
    ----------
    y : array-like
        Actual 0/1 labels.
    scores : array-like
        Predicted scores, same number of elements as ``y``.
    fn_cost : float, optional
        Weight of one false negative (default 500).
    fp_cost : float, optional
        Weight of one false positive (default 100).

    Returns
    -------
    (list of float, list of float, float)
        ``(fpr_array, tpr_array, best_threshold)``.  When several thresholds
        share the minimal cost, the highest of them is returned.  For empty
        input the result is ``([], [], 0)``.

    Raises
    ------
    ValueError
        If the inputs differ in length, or if ``y`` lacks either a positive
        or a negative label (tpr or fpr would be 0/0).
    """
    labels = np.asarray(y).ravel()
    score_arr = np.asarray(scores).ravel()
    if labels.shape != score_arr.shape:
        raise ValueError("y and scores must have the same number of elements")
    if labels.size == 0:
        return [], [], 0

    # distinct scores in ascending order; level_of[i] is the position of
    # sample i's score inside `distinct`
    distinct, level_of = np.unique(score_arr, return_inverse=True)
    n_levels = distinct.size

    # number of positives / negatives sitting exactly on each distinct score
    pos_per_level = np.bincount(
        level_of, weights=(labels == 1).astype(float), minlength=n_levels
    )
    neg_per_level = np.bincount(
        level_of, weights=(labels == 0).astype(float), minlength=n_levels
    )

    total_pos = pos_per_level.sum()
    total_neg = neg_per_level.sum()
    if total_pos == 0 or total_neg == 0:
        raise ValueError(
            "ROC curve needs at least one positive and one negative label in y"
        )

    # Thresholds from highest to lowest; the running totals are the tp / fp
    # counts obtained when everything at or above the threshold is predicted 1.
    thresholds = distinct[::-1]
    tp = np.cumsum(pos_per_level[::-1])
    fp = np.cumsum(neg_per_level[::-1])
    fn = total_pos - tp

    # No artificial starting value: the minimum is taken over the real costs,
    # so arbitrarily large datasets still get a genuine threshold back.
    cost = fn_cost * fn + fp_cost * fp
    # argmin reports the first minimum, i.e. the highest threshold among ties
    best_threshold = float(thresholds[int(np.argmin(cost))])

    fpr_array = (fp / total_neg).tolist()
    tpr_array = (tp / total_pos).tolist()
    return fpr_array, tpr_array, best_threshold


In [None]:
# result:
fpr,tpr,thre = roc_threshold(y,scores) 

HBox(children=(FloatProgress(value=0.0, max=2791.0), HTML(value='')))




In [None]:
print("Area under curve = ",np.trapz(np.array(tpr),np.array(fpr)))

Area under curve =  0.8288141557331724


In [None]:
print("best threshold = ",thre)

best threshold =  0.2300390278970873


### verifying with obtained threshold:

In [None]:
# Label the task-C data with the best threshold returned by roc_threshold.
# The mask has to be built from data3's own score column ('prob') and has to
# use the same rule as the search: 0 if y_score < threshold else 1.
data3['predicted'] = (~(data3['prob'] < thre)).astype('int64')

# unique value counts of the task-C predictions after applying the threshold
print(data3['predicted'].value_counts())

0    9806
1     294
Name: predicted, dtype: int64


In [None]:
y = data3['y']
predicted = data3['predicted']
tn,fp,fn,tp= confusionMatrix(y,predicted)
print(" True neg  = {} \n False pos = {} \n False neg = {} \n True pos  = {} ".format(tn, fp, fn, tp ))

 True neg  = 565 
 False pos = 1240 
 False neg = 336 
 True pos  = 711 


### Observations:
 $ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

1. We need to find the best threshold, i.e. the one which minimizes metric 'A'.
2. Here we give high importance to `false negatives`: even a small number of false negatives gives a large value after multiplying by '500'.
3. So here we try to reduce the false negatives as much as possible, more than the false positives.

# Task-D

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 1:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 2:</b> <b>5_d.csv</b> will have two columns Y and predicted_Y, both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [None]:
# reading task-d data
data4 = pd.read_csv('/gdrive/My Drive/7_Compute Performance metrics without Sklearn/5_d.csv')

In [None]:
data4.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [None]:
predicted = data4['pred']
actual    = data4['y']

### 1.meanSquaredError

In [None]:
# definition of meanSquaredError
def meanSquaredError(actual, predicted):
    """Return the mean of the squared differences between actual and predicted.

    Values are paired by position; a pandas index, if any, is ignored, so a
    Series with a non-default index works as well.

    Parameters
    ----------
    actual : array-like of numbers
        Observed values.
    predicted : array-like of numbers
        Predicted values, same number of elements as ``actual``.

    Returns
    -------
    float
        ``sum((actual_i - predicted_i) ** 2) / n``.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    truth = np.asarray(actual, dtype=float).ravel()
    guess = np.asarray(predicted, dtype=float).ravel()
    if truth.shape != guess.shape:
        raise ValueError("actual and predicted must have the same number of elements")
    if truth.size == 0:
        raise ValueError("mean squared error is undefined for empty input")
    return float(np.mean((truth - guess) ** 2))


In [None]:
print("Mean squared Error: ",meanSquaredError(actual,predicted))

Mean squared Error:  177.16569974554707


### 2.meanAbsolutePercentageError

In [None]:
# definition of meanAbsolutePercentageError:
def meanAbsolutePercentaError(predicted, actual):
    """Return a zero-safe variant of the mean absolute percentage error.

    The textbook formula is::

        (1 / n) * sum(|predicted_i - actual_i| / |actual_i|)

    and it is undefined as soon as a single actual value is 0.  The
    workaround used here replaces every per-point denominator by the average
    of all actual values, which simplifies to::

        sum(|predicted_i - actual_i|) / sum(actual_i)

    Values are paired by position; a pandas index, if any, is ignored.

    Parameters
    ----------
    predicted : array-like of numbers
        Predicted values (note: first argument).
    actual : array-like of numbers
        Observed values, same number of elements as ``predicted``.

    Returns
    -------
    float
        The error as a fraction (not multiplied by 100).

    Raises
    ------
    ValueError
        If the inputs differ in length, or if the actual values sum to zero.
    """
    guess = np.asarray(predicted, dtype=float).ravel()
    truth = np.asarray(actual, dtype=float).ravel()
    if guess.shape != truth.shape:
        raise ValueError("predicted and actual must have the same number of elements")

    actual_sum = truth.sum()
    if actual_sum == 0:
        raise ValueError("the error is undefined when the actual values sum to zero")

    error_sum = np.abs(guess - truth).sum()
    return float(error_sum / actual_sum)



In [None]:
print("Mean Absolute Percentage error: ",meanAbsolutePercentaError(predicted,actual))

Mean Absolute Percentage error:  0.1291202994009687


### 3.R-Squared:

In [None]:
# R - squared : 
def coefficientOfDetermination(predicted, actual):
    """Return R^2 = 1 - SS_residual / SS_total.

    ``SS_residual`` is the sum of squared differences between actual and
    predicted values; ``SS_total`` is the sum of squared differences between
    the actual values and their mean.  Values are paired by position; a
    pandas index, if any, is ignored.

    Parameters
    ----------
    predicted : array-like of numbers
        Predicted values (note: first argument).
    actual : array-like of numbers
        Observed values.

    Returns
    -------
    float or str
        The R^2 value.  When the two inputs differ in length the string
        ``"give proper inputs"`` is returned instead; this legacy behaviour
        is kept so existing callers are not broken.  If all actual values are
        identical, ``SS_total`` is 0 and the result is not a finite number.

    Raises
    ------
    ValueError
        If both inputs are empty.
    """
    guess = np.asarray(predicted, dtype=float).ravel()
    truth = np.asarray(actual, dtype=float).ravel()
    if truth.size != guess.size:
        return "give proper inputs"
    if truth.size == 0:
        raise ValueError("R^2 is undefined for empty input")

    residual = np.sum((truth - guess) ** 2)
    total = np.sum((truth - truth.mean()) ** 2)
    return float(1 - (residual / total))


In [None]:
print("R - squared ",coefficientOfDetermination(predicted,actual))

R - squared  0.9563582786990964
