## **Notebook Contents**
- Import Libraries
- Import Dataframe
- Uniform Labeling
- Model Evaluation Metrics


## **Import Libraries**

In [1]:
# Imports
import numpy as np
import pandas as pd

## **Import Dataframe**

In [2]:
# Load the new tweets, together with their model predictions and verified
# labels, from pred_verify.csv
PRED_VERIFY_PATH = '../data/pred_verify.csv'
data = pd.read_csv(PRED_VERIFY_PATH)

In [3]:
# Preview the first rows to confirm the prediction and label columns loaded
data.head()

Unnamed: 0,user,is_retweet,tweet,location,state,state_1,predict_lr,predict_nb,predict_rnn,predict_rf,labels,verify_labels,correct_preds,predict_lr.1,predict_nb.1,rnn_and_labels is 1,predict_rf.1,extra,wo_rf
0,eafreem,False,i have no institutional power to be clear but ...,she/her,,,1.0,1.0,1.0,0.933333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,ButtaeflyTear,False,kthivz yeshctrl no way i have the power of bor...,,,,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
2,Guillermohno,False,i just think its funny how tyler thecreator ne...,San Antonio,TX,,1.0,1.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,js100radio,False,online gt gt https t co mixlrwax mea smart lif...,Bangkok,,,0.0,0.0,1.0,0.796746,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4,KeysEnergy,False,approximately customers remain without power i...,Key West,FL,,1.0,1.0,0.0,0.966667,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


## **Uniform Labeling**

In [4]:
# Change random forest floats into 0's and 1's.
# A score strictly greater than the threshold becomes 1, everything else
# (including missing scores, for which the comparison is False) becomes 0.
# The comparison is vectorized rather than looping over rows in Python.
RF_THRESHOLD = .50
data['rf_10'] = (data['predict_rf'] > RF_THRESHOLD).astype('int64')

## **Model Evaluation Metrics**

In [5]:
def acc(df, col):
    """Print classification metrics for one prediction column.

    The predictions in ``df[col]`` are compared with the verified labels in
    ``df['verify_labels']``. A value counts as positive only when it equals 1;
    any other non-missing value is treated as negative. Rows where either the
    prediction or the verified label is missing cannot be scored and are
    excluded from every count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and a ``verify_labels`` column.
    col : str
        Name of the column holding the predictions to evaluate.

    Returns
    -------
    tuple of int
        The confusion-matrix counts ``(TP, FP, FN, TN)``.
    """
    predicted = df[col]
    actual = df['verify_labels']

    # Only rows with both a prediction and a verified label can be scored.
    scorable = predicted.notna() & actual.notna()
    predicted_pos = predicted[scorable] == 1.0
    actual_pos = actual[scorable] == 1.0

    # Verified labels & labeled by script
    TP = int((predicted_pos & actual_pos).sum())
    FP = int((predicted_pos & ~actual_pos).sum())
    FN = int((~predicted_pos & actual_pos).sum())
    TN = int((~predicted_pos & ~actual_pos).sum())
    total = TP + FP + FN + TN

    def _pct(numerator, denominator):
        # Percentage rounded to two decimals; NaN when the metric is
        # undefined (empty denominator) instead of dividing by zero.
        if denominator == 0:
            return float('nan')
        # Round after scaling so the printed value has no float artefacts.
        return round(100 * numerator / denominator, 2)

    # Accuracy:  # What percentage of observation did I correctly predict?
    print(f"The Accuracy score is: {_pct(TP + TN, total)}%")
    # Misclassification: # What percentage of observation did I incorrectly predict?
    print(f"The Missclassification rate is: {_pct(FP + FN, total)}%")
    # Sensitivity:  # Among Positives, how many did I get correctly? Same as recall
    print(f"The Sensitivity is: {_pct(TP, TP + FN)}%")
    # Specificity:  # Among Negatives, how many did I get correctly?
    print(f"The Specificity is: {_pct(TN, TN + FP)}%")
    # Precision: # Among predicted Positives, how many were actually Positive?
    print(f"The Precision is {_pct(TP, TP + FP)}%")

    return TP, FP, FN, TN

In [6]:
# Logistic Regression: score the 'predict_lr' column against the verified labels
acc(data, 'predict_lr')

The Accuracy score is: 58.26%
The Missclassification rate is: 41.699999999999996%
The Sensitivity is: 90.10000000000001%
The Specificity is: 33.33%
The Precision is 51.41%


(91, 86, 10, 43)

In [7]:
# Multinomial Naive Bayes: score the 'predict_nb' column against the verified labels
acc(data, 'predict_nb')

The Accuracy score is: 59.57%
The Missclassification rate is: 40.400000000000006%
The Sensitivity is: 92.08%
The Specificity is: 34.11%
The Precision is 52.25%


(93, 85, 8, 44)

In [8]:
# LSTM RNN: score the 'predict_rnn' column against the verified labels
acc(data, 'predict_rnn')

The Accuracy score is: 55.65%
The Missclassification rate is: 44.3%
The Sensitivity is: 65.35%
The Specificity is: 48.06%
The Precision is 49.62%


(66, 67, 35, 62)

In [9]:
# Random Forest: score 'rf_10', the 0/1 version of 'predict_rf', against the verified labels
acc(data, 'rf_10')

The Accuracy score is: 55.65%
The Missclassification rate is: 44.3%
The Sensitivity is: 91.09%
The Specificity is: 27.91%
The Precision is 49.730000000000004%


(92, 93, 9, 36)

In [10]:
# Multi-Layered Model: score the 'labels' column against the verified labels
acc(data, 'labels')

The Accuracy score is: 63.91%
The Missclassification rate is: 36.1%
The Sensitivity is: 57.43000000000001%
The Specificity is: 68.99%
The Precision is 59.18%


(58, 40, 43, 89)