# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [3]:

# Load dataset A; later cells read its 'y' and 'proba' columns.
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine without this exact D:/ folder; confirm the intended data location.
df_a=pd.read_csv("D:/Applied AI/Assignments/Assignment 5/5_a.csv")


In [4]:
# Hard labels at the 0.5 cut-off: 1 when proba >= 0.5, otherwise 0.
df_a['y_pred'] = (df_a['proba'] >= .5).astype('int64')


In [4]:
def confusion_matrix(data):
    """Count the four confusion-matrix cells of a binary classifier.

    Compares the ``'y'`` column (actual label) of ``data`` with its
    ``'y_pred'`` column (predicted label).  Only the values 0 and 1 are
    recognised; a row holding anything else in either column is counted in
    no cell.

    Returns:
        tuple of four ints in the order ``(fn, fp, tn, tp)`` -- note that
        this is alphabetical, not the usual matrix layout.
    """
    actual_pos = data['y'] == 1
    actual_neg = data['y'] == 0
    predicted_pos = data['y_pred'] == 1
    predicted_neg = data['y_pred'] == 0

    def _rows(mask):
        # A boolean mask sums to the number of True entries.
        return int(mask.sum())

    false_neg = _rows(actual_pos & predicted_neg)
    false_pos = _rows(actual_neg & predicted_pos)
    true_neg = _rows(actual_neg & predicted_neg)
    true_pos = _rows(actual_pos & predicted_pos)
    return false_neg, false_pos, true_neg, true_pos

In [5]:
def f1_score(data):
    """F1 score (harmonic mean of precision and recall) of the predictions.

    The confusion-matrix counts are obtained from ``confusion_matrix(data)``,
    which reads the ``'y'`` and ``'y_pred'`` columns of ``data``.

    Returns:
        float: ``2 * precision * recall / (precision + recall)``, or ``0.0``
        when there are no true positives.  In that case precision, recall or
        their sum would be a 0/0 division, so the score is defined as 0
        instead of raising ``ZeroDivisionError``.
    """
    fn, fp, _, tp = confusion_matrix(data)
    if tp == 0:
        # Every denominator below is strictly positive once tp > 0, so this
        # single guard covers all the division-by-zero cases.
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * ((precision * recall) / (precision + recall))

In [6]:
def accuracy(data):
    """Fraction of rows whose predicted label equals the actual label.

    Uses the counts returned by ``confusion_matrix(data)``; only rows that
    fall into one of its four cells contribute to numerator or denominator.
    Raises ``ZeroDivisionError`` when all four counts are zero (for example
    on empty input).
    """
    fn, fp, tn, tp = confusion_matrix(data)
    n_correct = tp + tn
    n_counted = tp + fp + fn + tn
    return n_correct / n_counted

In [7]:
def auc_score(data):
    """Area under the ROC curve for binary labels and probability scores.

    Reads the ``'y'`` column (actual label: 1 = positive, 0 = negative) and
    the ``'proba'`` column (score) of ``data``.  Every distinct score is used
    as a threshold with the rule ``predict 1 if proba >= threshold``.  The
    resulting (FPR, TPR) points, taken in order of descending threshold and
    preceded by the origin (0, 0), are integrated with the trapezoidal rule.

    The counts for all thresholds come from one sort plus running sums, so
    the cost is O(n log n) rather than one full confusion-matrix pass per row.
    ``data`` itself is not modified.

    Scores are assumed to contain no NaN; NaN scores are not treated
    specially.

    Returns:
        numpy float: the area, between 0 and 1.

    Raises:
        ValueError: if ``data`` has no row with ``y == 1`` or no row with
            ``y == 0`` -- TPR or FPR would be a division by zero and the AUC
            is undefined.
    """
    scores = data["proba"].to_numpy()
    labels = data["y"].to_numpy()

    n_pos = int(np.count_nonzero(labels == 1))
    n_neg = int(np.count_nonzero(labels == 0))
    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            "auc_score() needs at least one positive (y == 1) and one "
            "negative (y == 0) row"
        )

    # Descending score order: row i is predicted positive for every threshold
    # that is <= scores[i].
    order = np.argsort(-scores, kind="stable")
    scores = scores[order]
    labels = labels[order]

    # Running TP / FP counts if everything up to and including row i is
    # predicted positive.
    tp_running = np.cumsum(labels == 1)
    fp_running = np.cumsum(labels == 0)

    # With ">=" all rows sharing a score flip together, so only the last row
    # of each run of equal scores corresponds to a real threshold.
    group_end = np.append(
        np.flatnonzero(scores[1:] != scores[:-1]), scores.size - 1
    )

    # Prepend the (0, 0) point (threshold above every score) so the curve
    # starts at the origin even when several rows tie for the top score.
    tpr_array = np.concatenate(([0.0], tp_running[group_end] / n_pos))
    fpr_array = np.concatenate(([0.0], fp_running[group_end] / n_neg))

    # numpy.trapz was renamed numpy.trapezoid in NumPy 2.0; use whichever
    # this installation provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return trapezoid(tpr_array, fpr_array)

In [8]:
# Confusion-matrix counts for dataset A at the 0.5 cut-off, one per line.
FN, FP, TN, TP = confusion_matrix(df_a)
for _caption, _count in (
    ("FALSE NEGATIVE :", FN),
    ("FALSE POSITIVE :", FP),
    ("TRUE NEGATIVE :", TN),
    ("TRUE POSITIVE :", TP),
):
    print(_caption, _count)

FALSE NEGATIVE : 0
FALSE POSITIVE : 100
TRUE NEGATIVE : 0
TRUE POSITIVE : 10000


In [9]:
# F1 score of dataset A, based on its 'y' and 'y_pred' columns.
f1=f1_score(df_a)
print("F1 SCORE :",f1)

F1 SCORE : 0.9950248756218906


In [10]:
# Accuracy of dataset A, based on its 'y' and 'y_pred' columns.
acc=accuracy(df_a)
print('ACCURACY VALUE :',acc)

ACCURACY VALUE : 0.9900990099009901


In [11]:
# ROC AUC of dataset A, computed from the raw 'proba' scores.
auc=auc_score(df_a)
print('AUC VALUE :',auc)

AUC VALUE : 0.48829900000000004




## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [9]:
# Load dataset B; later cells read its 'y' and 'proba' columns.
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine without this exact D:/ folder; confirm the intended data location.
df_b=pd.read_csv("D:/Applied AI/Assignments/Assignment 5/5_b.csv")



In [10]:
# Hard labels at the 0.5 cut-off: 1 when proba >= 0.5, otherwise 0.
df_b['y_pred'] = (df_b['proba'] >= .5).astype('int64')

In [12]:
# Confusion-matrix counts for dataset B at the 0.5 cut-off, one per line.
FN, FP, TN, TP = confusion_matrix(df_b)
for _caption, _count in (
    ("FALSE NEGATIVE :", FN),
    ("FALSE POSITIVE :", FP),
    ("TRUE NEGATIVE :", TN),
    ("TRUE POSITIVE :", TP),
):
    print(_caption, _count)

FALSE NEGATIVE : 45
FALSE POSITIVE : 239
TRUE NEGATIVE : 9761
TRUE POSITIVE : 55


In [13]:
# F1 score of dataset B, based on its 'y' and 'y_pred' columns.
f1=f1_score(df_b)
print("F1 SCORE :",f1)

F1 SCORE : 0.2791878172588833


In [14]:
# Accuracy of dataset B, based on its 'y' and 'y_pred' columns.
acc=accuracy(df_b)
print('ACCURACY VALUE :',acc)

ACCURACY VALUE : 0.9718811881188119


In [15]:
# ROC AUC of dataset B, computed from the raw 'proba' scores.
auc=auc_score(df_b)
print("AUC VALUE :",auc)

AUC VALUE : 0.9377570000000001


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives the lowest value of metric <b>A</b> for the given data
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [5]:
# Load dataset C; best_threshold() reads its 'y' and 'prob' columns.
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine without this exact D:/ folder; confirm the intended data location.
df_c=pd.read_csv("D:/Applied AI/Assignments/Assignment 5/5_c.csv")

In [6]:
def best_threshold(data, fn_cost=500, fp_cost=100):
    """Score threshold that minimises ``A = fn_cost * FN + fp_cost * FP``.

    Reads the ``'y'`` column (actual label: 1 = positive, 0 = negative) and
    the ``'prob'`` column (score) of ``data``.  Every distinct score is tried
    as a threshold with the rule ``predict 1 if prob >= threshold``, and the
    threshold with the smallest cost ``A`` is returned.  When several
    thresholds share the minimum, the highest of them is returned.

    All candidate thresholds are evaluated from one sort plus running sums
    (O(n log n)).  ``data`` itself is not modified.

    Scores are assumed to contain no NaN; NaN scores are not treated
    specially.

    Args:
        data: DataFrame with ``'y'`` and ``'prob'`` columns.
        fn_cost: cost charged per false negative (default 500).
        fp_cost: cost charged per false positive (default 100).

    Returns:
        The winning threshold, which is one of the values of ``data['prob']``.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    scores = data["prob"].to_numpy()
    labels = data["y"].to_numpy()
    if scores.size == 0:
        raise ValueError("best_threshold() needs at least one row")

    # Descending score order: row i is predicted positive for every threshold
    # that is <= scores[i].
    order = np.argsort(-scores, kind="stable")
    scores = scores[order]
    labels = labels[order]

    # Running TP / FP counts if everything up to and including row i is
    # predicted positive.
    tp_running = np.cumsum(labels == 1)
    fp_running = np.cumsum(labels == 0)

    # With ">=" all rows sharing a score flip together, so only the last row
    # of each run of equal scores corresponds to a real threshold.
    group_end = np.append(
        np.flatnonzero(scores[1:] != scores[:-1]), scores.size - 1
    )

    # Positives that are not yet predicted positive are the false negatives.
    false_neg = tp_running[-1] - tp_running[group_end]
    false_pos = fp_running[group_end]
    cost = fn_cost * false_neg + fp_cost * false_pos

    # argmin returns the first minimum, i.e. the highest such threshold.
    return scores[group_end[np.argmin(cost)]]

In [17]:
# Threshold on dataset C's 'prob' scores with the lowest cost A = 500*FN + 100*FP.
best=best_threshold(df_c)
print('BEST THRESHOLD VALUE :',best)

BEST THRESHOLD VALUE : 0.2300390278970873



## D. Compute performance metrics (for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [7]:
# Load dataset D; regression_metrics() reads its 'y' and 'pred' columns.
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine without this exact D:/ folder; confirm the intended data location.
df_d=pd.read_csv("D:/Applied AI/Assignments/Assignment 5/5_d.csv")

In [18]:
def regression_metrics(data):
    """Mean squared error, MAPE and R^2 of a set of regression predictions.

    Reads the ``'y'`` column (actual value) and the ``'pred'`` column
    (predicted value) of ``data``.  ``data`` is left untouched: no helper
    columns are added to the caller's frame.

    Returns:
        tuple ``(mse, mape, rsquared)`` where

        * ``mse`` is the mean of the squared errors,
        * ``mape`` is ``sum(|y - pred|) / sum(y)``, returned as a fraction
          (multiply by 100 for a percentage).  Dividing by the total of the
          actual values instead of row by row keeps rows with ``y == 0``
          from causing a division by zero,
        * ``rsquared`` is ``1 - SS_res / SS_tot``, with ``SS_tot`` the
          squared deviation of ``y`` from its mean.
    """
    n = len(data)
    actual = data['y']

    abs_error = (actual - data['pred']).abs()
    squared_error = abs_error * abs_error

    ss_res = squared_error.sum()
    mse = ss_res / n
    mape = abs_error.sum() / actual.sum()

    deviation = actual - actual.sum() / n
    ss_total = (deviation * deviation).sum()
    rsquared = 1 - (ss_res / ss_total)
    return mse, mape, rsquared

In [19]:
# Regression metrics for dataset D; MAPE is returned as a fraction and
# shown here as a percentage.
mse, mape, rsquared = regression_metrics(df_d)
for _caption, _value in (
    ('MEAN SQUARED ERROR :', mse),
    ('MEAN ABSOLUTE PERCENTAGE ERROR :', mape*100),
    ('R SQUARED :', rsquared),
):
    print(_caption, _value)

MEAN SQUARED ERROR : 177.16569974554707
MEAN ABSOLUTE PERCENTAGE ERROR : 12.91202994009687
R SQUARED : 0.9563582786990937
