# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

**Implementation of Performance metric class to solve problem A, B, C and D**

In [246]:
from tqdm import tqdm #Importing tqdm for visual representation of progress

class PerformanceMatrix:
    def __init__(self,df):
        self.setVariables()
        self.df = df
    
    def setVariables(self):
        self._truePositive = 0
        self._trueNegative = 0
        self._falsePositive = 0
        self._falseNegtive = 0
        self._cnf_matrix = []
        
    def predict(self,threshold):
        self.df["ypred"] = [1 if x > threshold else 0 for x in self.df["proba"]] 

        
    def confusion_matrix(self):
        """This method returns the confusion matrix for the given pair of Y and Y_Predicted"""       
        #y,ypred
        y = self.df["y"]
        ypred = self.df["ypred"]
        self.setVariables()
        prob = self.df["proba"]
        
        try:
            assert len(y) == len(ypred)
            for val in range(len(prob)):
                if y[val] == 1 and ypred[val] == 1:
                    self._truePositive +=1
                if y[val] == 1 and ypred[val] == 0:
                    self._trueNegative +=1
                if y[val] == 0 and ypred[val] == 1:
                    self._falsePositive +=1
                if y[val] == 0 and ypred[val] == 0:
                    self._falseNegtive +=1
            for i in self._truePositive,self._trueNegative,self._falsePositive,self._falseNegtive:
                self._cnf_matrix.append(i)
            cnfMatrix = self._cnf_matrix.copy()
                
            return np.array(cnfMatrix).reshape(2,2)
  
        except AssertionError:
            print("Input Error: Length of y and ypred is not same.")
            
    def f1_score(self):
        """This method returns the f1_score for the given pair of Y and  Y_Predicted"""  
        _precision = ((self._truePositive)/(self._truePositive+self._falsePositive))
        _recall = ((self._truePositive)/(self._falseNegtive+self._truePositive))
        f1Score = (2*((_precision*_recall)/(_precision+_recall)))
        return f1Score   
    
    def acuracy(self):
        """This method return the accuracy of a model for a given pair of Y and  Y_Predicted"""        
        totalNumberofPoints = len(self.df["y"])
        _accuracy = self._truePositive/totalNumberofPoints
        return _accuracy
    
    def auc_score(self):
        """This method returns the AUC score"""
        totalPositiveCount = self._falseNegtive + self._truePositive
        totalNegativeCount = self._falsePositive + self._trueNegative
        truePositiveRate = []
        falsePositiveRate = []
        prob = self.df["proba"]
        self.df=self.df.sort_values(by='proba',ascending=False)
        self.df.drop(columns=['ypred'],inplace=True)
        
        for item in tqdm(prob):
            self.predict(item)
            cnf = self.confusion_matrix()
            truePositiveRate.append(int(self._cnf_matrix[3])/totalPositiveCount)
            falsePositiveRate.append(int(self._cnf_matrix[2])/totalNegativeCount)
            self.df.drop(columns=["ypred"])

        return np.trapz(truePositiveRate,falsePositiveRate)
    
    def metricForLowestValues(self):
        """Compute the best threshold of probability which gives lowest values of metric A"""
        dict_metricA = {}
        prob = self.df["proba"]

        for item in tqdm(prob):
            self.predict(item)
            cnf = self.confusion_matrix()
            # A=500×number of false negative+100×numebr of false positive
            metricA = 500 * self._falseNegtive + 100* self._falsePositive
            dict_metricA[item] = metricA
            self.df.drop(columns=["ypred"],inplace=True)
        sorted_metricAList = sorted(dict_metricA.items(),key=lambda item:item[1])
        minKey = sorted_metricAList[0][0]
        minValue = dict_metricA[minKey]

        return minKey, minValue
    
    def meanSquaredError(self):
        """This module calcutes the mean square error"""
        y = self.df["y"]
        ypred = self.df["ypred"]
        return np.square(np.subtract(y,ypred)).mean()
    
    def calculateMAPE(self):
        """This method return the Modified MAPE (Mean Absolute Percentage Error)"""
        y = self.df["y"]
        ypred = self.df["ypred"]
        actual = y.sum()
        absError = np.absolute(y - ypred).sum()
        return absError/actual
         
    
    def _totalSumOfSquared(self,y,ymean):
        ssTo = np.square(y - ymean).sum()
        return ssTo
    
    def _residualSumOfSquared(self,y,ypred):
        ssRes = np.square(y - ypred).sum()
        return ssRes
    
    def R_SquaredError(self):
        "This method calculates the Coefficient of Determination-R2 score"
        ymean = self.df["y"].mean()
        y = self.df["y"]
        ypred = self.df["ypred"]
        _SSres = self._residualSumOfSquared(y,ypred)
        _SSto = self._totalSumOfSquared(y,ymean)
        _R2 = (1-(_SSres/_SSto))
        return _R2
    

**Solution for Problem 'A'**

In [216]:
df = pd.read_csv("5_a.csv")
df.shape

(10100, 2)

In [217]:
performance = PerformanceMatrix(df)
performance.predict(0.5)

In [218]:
performance.confusion_matrix()

array([[10000,     0],
       [  100,     0]])

In [219]:
performance.f1_score()

0.9950248756218906

In [220]:
performance.acuracy()

0.9900990099009901

In [8]:
performance.auc_score()

100%|██████████| 10100/10100 [2:03:17<00:00,  1.37it/s] 


0.004999999999999999

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

**Solution for Problem 'B'**

In [211]:
# write your code
df2 = pd.read_csv("5_b.csv")
df2.shape

(10100, 2)

In [212]:
performance2 = PerformanceMatrix(df2)
performance2.predict(0.5)

In [213]:
performance2.confusion_matrix()

array([[  55,   45],
       [ 239, 9761]])

In [214]:
performance2.f1_score()

0.010880316518298714

In [215]:
performance2.acuracy()

0.005445544554455445

In [17]:
performance2.auc_score()

100%|██████████| 10100/10100 [9:46:37<00:00,  3.48s/it]      


17.93564957901443

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

**Solution for Problem 'C'**

In [244]:
# write your code
df3=pd.read_csv('5_c.csv')
df3.columns = ["y", "proba"]
print(df3.shape)

(2852, 2)


In [245]:
performance3 = PerformanceMatrix(df3)

In [209]:
performance3.metricForLowestValues()

100%|██████████| 2852/2852 [07:12<00:00,  6.59it/s]


(0.02803798623987141, 180900)

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

**Solution for Problem 'D'**

In [239]:
# write your code
df4=pd.read_csv('5_d.csv')
df4.columns = ["y", "ypred"]
print(df4.shape)

(157200, 2)


In [241]:
performance4 = PerformanceMatrix(df4)
performance4.meanSquaredError()

177.16569974554707

In [242]:
performance4.calculateMAPE()

0.1291202994009687

In [243]:
performance4.R_SquaredError()

0.9563582786990937