# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score: for each candidate threshold compute tpr and fpr, then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array), not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# Load the scored samples for part A and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='5_a.csv')
data.head(n=5)

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [3]:
# Derive the predicted label required by the problem statement:
# 0 when the score is strictly below 0.5, otherwise 1.
below_cutoff = data['proba'] < 0.5
data['y_pred'] = (~below_cutoff).astype(int)

In [4]:
# Preview the first five rows again to confirm the derived y_pred column is present.
data.head(n=5)

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [10]:
# Sanity check: number of rows assigned to each predicted label.
data.groupby(by='y_pred').count()

Unnamed: 0_level_0,y,proba
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10100,10100


In [18]:
# Count the four confusion-matrix cells by comparing the actual label (y)
# with the predicted label (y_pred).
TP = ((data.y == 1) & (data.y_pred == 1)).sum()  # actual 1, predicted 1
TN = ((data.y == 0) & (data.y_pred == 0)).sum()  # actual 0, predicted 0
FP = ((data.y == 0) & (data.y_pred == 1)).sum()  # actual 0, predicted 1
FN = ((data.y == 1) & (data.y_pred == 0)).sum()  # actual 1, predicted 0

# Layout: rows index the predicted label (0, 1), columns index the actual label (0, 1).
conf_matrix = np.array([[TN, FN],
                        [FP, TP]])

In [19]:
# Display the matrix built as [[TN, FN], [FP, TP]]:
# rows are predicted labels (0, 1), columns are actual labels (0, 1).
conf_matrix

array([[    0,     0],
       [  100, 10000]])

In [20]:
# Precision: fraction of predicted positives that are actually positive.
# Recall: fraction of actual positives that were predicted positive.
predicted_positive = TP + FP
actual_positive = TP + FN
precision, recall = TP / predicted_positive, TP / actual_positive

print(precision, recall)

0.9900990099009901 1.0


In [21]:
# F1 score: the harmonic mean of precision and recall.
F1_Score = (2 * precision * recall) / (precision + recall)
F1_Score

0.9950248756218906

In [28]:
# Compute one (tpr, fpr) pair per row, using every score in the data as a threshold.
# A sample is predicted positive unless its score is strictly below the threshold.
#
# Counting against sorted scores with np.searchsorted replaces the nested Python loop
# (quadratic in the number of rows) and no longer overwrites data['y_pred'], so the
# labels derived at the 0.5 cut-off stay intact.
# NOTE(review): assumes data.proba contains no NaN values - TODO confirm.
scores = data.proba.to_numpy()
pos_scores = np.sort(scores[(data.y == 1).to_numpy()])
neg_scores = np.sort(scores[(data.y == 0).to_numpy()])

# searchsorted(..., side='left') returns, for each threshold, how many of the sorted
# scores are strictly below it; the remaining ones are predicted positive.
tp_counts = pos_scores.size - np.searchsorted(pos_scores, scores, side='left')
fp_counts = neg_scores.size - np.searchsorted(neg_scores, scores, side='left')

# TP + FN is the number of actual positives and TN + FP the number of actual negatives.
# Plain lists with one entry per row (duplicates included) match what later cells expect.
tpr = list(tp_counts / pos_scores.size)
fpr = list(fp_counts / neg_scores.size)

In [34]:
# Put both rate lists in ascending order for the trapezoidal integration.
# Each list is sorted on its own; the (fpr, tpr) pairs stay aligned because lowering
# the threshold can only keep or increase both rates, so both orderings agree.
tpr, fpr = sorted(tpr), sorted(fpr)

In [35]:
# Area under the ROC curve by the trapezoidal rule: tpr supplies the y-values and
# fpr the x-values (the argument order matters).
# NumPy 2.0 deprecated np.trapz in favour of np.trapezoid, so use the new function
# when it exists and fall back to np.trapz on older NumPy versions.
_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
AUC = _trapezoid(np.array(tpr), np.array(fpr))
print(AUC)

0.48829900000000004


In [36]:
# Accuracy = correctly classified samples / all samples, using the confusion-matrix
# counts obtained at the 0.5 cut-off.
correct = TP + TN
total = TP + TN + FP + FN
Accuracy_Score = correct / total
print(Accuracy_Score)

0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score: for each candidate threshold compute tpr and fpr, then use numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [37]:
# Load the scored samples for part B and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='5_b.csv')
data.head(n=5)

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [38]:
# Derive the predicted label required by the problem statement:
# 0 when the score is strictly below 0.5, otherwise 1.
below_cutoff = data['proba'] < 0.5
data['y_pred'] = (~below_cutoff).astype(int)

In [39]:
# Sanity check: number of rows assigned to each predicted label.
data.groupby(by='y_pred').count()

Unnamed: 0_level_0,y,proba
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9806,9806
1,294,294


In [40]:
# Count the four confusion-matrix cells by comparing the actual label (y)
# with the predicted label (y_pred).
TP = ((data.y == 1) & (data.y_pred == 1)).sum()  # actual 1, predicted 1
TN = ((data.y == 0) & (data.y_pred == 0)).sum()  # actual 0, predicted 0
FP = ((data.y == 0) & (data.y_pred == 1)).sum()  # actual 0, predicted 1
FN = ((data.y == 1) & (data.y_pred == 0)).sum()  # actual 1, predicted 0

# Layout: rows index the predicted label (0, 1), columns index the actual label (0, 1).
conf_matrix = np.array([[TN, FN],
                        [FP, TP]])
print(conf_matrix)

[[9761   45]
 [ 239   55]]


In [41]:
# Precision: fraction of predicted positives that are actually positive.
# Recall: fraction of actual positives that were predicted positive.
predicted_positive = TP + FP
actual_positive = TP + FN
precision, recall = TP / predicted_positive, TP / actual_positive

print(precision , recall)

0.1870748299319728 0.55


In [42]:
# F1 score: the harmonic mean of precision and recall.
F1_Score = (2 * precision * recall) / (precision + recall)
F1_Score

0.2791878172588833

In [43]:
# Compute one (tpr, fpr) pair per row, using every score in the data as a threshold.
# A sample is predicted positive unless its score is strictly below the threshold.
#
# Counting against sorted scores with np.searchsorted replaces the nested Python loop
# (quadratic in the number of rows) and no longer overwrites data['y_pred'], so the
# labels derived at the 0.5 cut-off stay intact.
# NOTE(review): assumes data.proba contains no NaN values - TODO confirm.
scores = data.proba.to_numpy()
pos_scores = np.sort(scores[(data.y == 1).to_numpy()])
neg_scores = np.sort(scores[(data.y == 0).to_numpy()])

# searchsorted(..., side='left') returns, for each threshold, how many of the sorted
# scores are strictly below it; the remaining ones are predicted positive.
tp_counts = pos_scores.size - np.searchsorted(pos_scores, scores, side='left')
fp_counts = neg_scores.size - np.searchsorted(neg_scores, scores, side='left')

# TP + FN is the number of actual positives and TN + FP the number of actual negatives.
# Plain lists with one entry per row (duplicates included) match what later cells expect.
tpr = list(tp_counts / pos_scores.size)
fpr = list(fp_counts / neg_scores.size)

In [45]:
# Put both rate lists in ascending order for the trapezoidal integration.
# Each list is sorted on its own; the (fpr, tpr) pairs stay aligned because lowering
# the threshold can only keep or increase both rates, so both orderings agree.
tpr, fpr = sorted(tpr), sorted(fpr)

In [46]:
# Area under the ROC curve by the trapezoidal rule: tpr supplies the y-values and
# fpr the x-values (the argument order matters).
# NumPy 2.0 deprecated np.trapz in favour of np.trapezoid, so use the new function
# when it exists and fall back to np.trapz on older NumPy versions.
_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
AUC = _trapezoid(tpr, fpr)
print(AUC)

0.9377570000000001


In [47]:
# Accuracy = correctly classified samples / all samples, using the confusion-matrix
# counts obtained at the 0.5 cut-off.
correct = TP + TN
total = TP + TN + FP + FN
Accuracy_Score = correct / total
print(Accuracy_Score)

0.9718811881188119


<font color='red'><b>C.</b></font> Compute the best probability threshold (similarly to the ROC curve computation), i.e. the one that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

You will predict the label of each data point like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [48]:
# Load the scored samples for part C and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='5_c.csv')
data.head(n=5)

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [49]:
# Evaluate the cost A = 500 * FN + 100 * FP with every score in the data used as a
# threshold. A sample is predicted positive unless its score is strictly below the
# threshold.
#
# Counting against sorted scores with np.searchsorted replaces the nested Python loop
# (quadratic in the number of rows) and avoids rewriting data['y_pred'] on every pass.
# NOTE(review): assumes data.prob contains no NaN values - TODO confirm.
scores = data.prob.to_numpy()
pos_scores = np.sort(scores[(data.y == 1).to_numpy()])
neg_scores = np.sort(scores[(data.y == 0).to_numpy()])

# Positives strictly below the threshold are missed (FN); negatives that are not
# below it are predicted positive (FP).
fn_counts = np.searchsorted(pos_scores, scores, side='left')
fp_counts = neg_scores.size - np.searchsorted(neg_scores, scores, side='left')

# Map threshold -> cost. tolist() yields plain Python floats as keys, matching what
# iterating over the Series produced before.
A = dict(zip(scores.tolist(), 500 * fn_counts + 100 * fp_counts))
    

In [50]:
# Print every threshold whose cost equals the minimum of A.
# The minimum is computed once up front rather than on every iteration;
# default=None keeps an empty A from raising (the loop then prints nothing).
lowest_cost = min(A.values(), default=None)
for key, val in A.items():
    if val == lowest_cost:
        print(key)

0.2300390278970873


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y (named y and pred in the file); both are real-valued
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [3]:
# Load the regression targets and predictions for part D and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='5_d.csv')
data.head(n=5)

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [5]:
# Mean Squared Error: the average of the squared differences between y and pred.
squared_error = np.square(data.y - data.pred)
MSE = np.mean(squared_error)
MSE

177.16569974554707

In [6]:
# MAPE pre-check: count how many actual values are exactly zero.
(data.y == 0).sum()

5717

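* As there are zeros in the column 'y', the standard MAPE (which divides each absolute error by $|y_i|$) is undefined, so we use the modified MAPE: $\frac{\sum_i |y_i - \hat{y}_i|}{\sum_i |y_i|}$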

In [8]:
# Numerator and denominator of the modified MAPE:
# total absolute error and total absolute actual value.
abs_error = np.absolute(data.y - data.pred)
abs_actual = np.absolute(data.y)
error, denom = np.sum(abs_error), np.sum(abs_actual)

In [9]:
# Modified MAPE: total absolute error divided by the total absolute actual value.
MAPE = error/denom
MAPE

0.1291202994009687

In [10]:
# R-squared ingredients:
#   ss_res   - sum of squared differences between y and pred
#   ss_total - sum of squared differences between y and the mean of y
residuals = data.y - data.pred
deviations = data.y - data.y.mean()
ss_res = np.sum(np.square(residuals))
ss_total = np.sum(np.square(deviations))

In [11]:
# Coefficient of determination: one minus the ratio of residual to total sum of squares.
unexplained_ratio = ss_res / ss_total
R_squared = 1 - unexplained_ratio
R_squared

0.9563582786990937