# Sentiment Analysis Using APIs

## Install and Import Required Libraries

In [1]:
%%capture
!pip install spacy
!pip install spacytextblob
!python3 -m spacy download en_core_web_sm

In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import numpy as np
from google.cloud import language_v1
import six
from google.cloud import bigquery
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from google.cloud import storage
import datetime

# For model evaluation
from sklearn.metrics import f1_score, cohen_kappa_score, mean_squared_error
from sklearn.metrics import mean_absolute_error, precision_score, recall_score

# Confusion Matrix
from sklearn.metrics import confusion_matrix
import math
from itertools import islice

In [3]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7f952850e190>

## User Input

Sensitive information has been replaced with `*****` in the following user inputs

In [4]:
# GSUTIL URI of CSV File with Cleaned/Original Tweets (e.g. "gs://<bucket_name>/<folder>/<file_name>")
bucket = '*****'
data_path = '*****'
gsutil_uri = f'gs://{bucket}/{data_path}/original_and_cleaned_with_labels_600_elements.csv'
project_id = '*****'
dataset_id = 'Corona_NLP'

# Column names in the CSV file for clean and original tweets
clean_tweets = 'CleanedTweet'
original_tweets = 'OriginalTweet'

# Google NLP API BigQuery Table name
google_table_id = 'google_predictions'

# SpaCy NLP API BigQuery Table name
spacy_table_id = 'spacy_predictions'

# Table Name for API Evaluations to be saved to
evaluation_table_id = 'api_evaluation'

# Table name for confusion matrices
google_cm_id = 'google_cm'
spacy_cm_id = 'spacy_cm'
google_pct_cm_id = 'google_cm_percentage'
spacy_pct_cm_id = 'spacy_cm_percentage'

In [5]:
google_table_id = f"{project_id}.{dataset_id}.{google_table_id}"
spacy_table_id = f"{project_id}.{dataset_id}.{spacy_table_id}"
evaluation_table_id = f"{project_id}.{dataset_id}.{evaluation_table_id}"
google_cm_table_id = f"{project_id}.{dataset_id}.{google_cm_id}"
spacy_cm_table_id = f"{project_id}.{dataset_id}.{spacy_cm_id}"
google_cm_pct_table_id = f"{project_id}.{dataset_id}.{google_pct_cm_id}"
spacy_cm_pct_table_id = f"{project_id}.{dataset_id}.{spacy_pct_cm_id}"

## Shared Functions

In [7]:
def read_from_cs(gsutil_uri):
    """ 
    Reads a CSV file from GCS and returns a pandas dataframe of that file. 
    Args:
        gsutil_uri (str): Gsutil URI of the CSV file
    """
    data = pd.read_csv(gsutil_uri)
    test_df = data[data['Set'] == 'TEST']
    return test_df

def round_score(val):
    """
    Rounds the input value to a whole number: 0, 1 or 2. The APIs make a float prediction of the sentiment,
    which falls between -1 and 1, with -1 being negative and 1 being positive sentiment. The true
    sentiments take integer value of 0 (negative), 1 (neutral) or 2 (positive). Therefore
    the predictions have been converted to integers 0, 1 or 2. E.g. a predicted sentiment of -0.6
    would be converted to 0, a negative sentiment.
    
    Args:
        val (float): a float representing a predicted sentiment, taking a value between -1 and 1.
    """
    if val <= -0.4:
        return 0
    elif -0.3 <= val <= 0.3:
        return 1
    else:
        return 2

In [8]:
def load_to_bq(table_id, data, write_disposition):
    """
    Load a pandas dataframe to a BigQuery table.
    
    Args:
        table_id: BigQuery table name (must be created already)
        data: The dataframe to load
        write_disposition: method of writing the data, e.g. "WRITE_APPEND" appends to a table
    """
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        write_disposition=write_disposition)
    job = client.load_table_from_dataframe(
        data, table_id, job_config=job_config)
    table = client.get_table(table_id)

In [9]:
# Load dataframe
df = read_from_cs(gsutil_uri)
df = df[[clean_tweets, 'Sentiment']]

In [10]:
df.head()

Unnamed: 0,CleanedTweet,Sentiment
480,amid overall panic nonavailability essential f...,0
481,thought company hero health crisis coronavirus...,2
482,boomer looked millennials killing brick mortar...,0
483,idiot stole center console last night take han...,0
484,get creative confinement week launched x submi...,2


## Google NLP API

In [11]:
def google_analyze_sentiment(content):
    """
    Analyse the sentiment of a tweet using the Google NLP API. Returns the predicted sentiment 
    with a float value between -1 and 1, -1 being negative and 1 being positive.
    
    Args:
        content: the string to be analysed
    """
    client = language_v1.LanguageServiceClient()
    if isinstance(content, six.binary_type):
        content = content.decode("utf-8")
    type_ = language_v1.Document.Type.PLAIN_TEXT
    document = {"type_": type_, "content": content}
    response = client.analyze_sentiment(request={"document": document})
    sentiment = response.document_sentiment
    return sentiment.score

In [12]:
def google_get_predictions(data, tweet_colunm):
    """
    Calls the tweet analysis function to get a sentiment prediction from a tweet. Rounds
    the predicted sentiment value to those present in the actual sentiment. 
    
    Args:
        data: the dataframe for sentiment analysis
        tweet_column: the name of the column containing the tweets to be analysed
    """
    data['clean_pred'] = data[tweet_colunm].apply(google_analyze_sentiment)
    data['clean_pred'] = data['clean_pred'].apply(round_score)
    return data

In [13]:
# Make predictions
google_predictions_df = google_get_predictions(df, clean_tweets)

In [14]:
google_predictions_df.head()

Unnamed: 0,CleanedTweet,Sentiment,clean_pred
480,amid overall panic nonavailability essential f...,0,0
481,thought company hero health crisis coronavirus...,2,1
482,boomer looked millennials killing brick mortar...,0,0
483,idiot stole center console last night take han...,0,0
484,get creative confinement week launched x submi...,2,2


In [15]:
# Save to BigQuery
load_to_bq(google_table_id, google_predictions_df, write_disposition="WRITE_TRUNCATE")

## SpaCY NLP API

In [16]:
def spacy_analyze_sentiment(tweet):
    """
    Call the SpaCy API to get a sentiment prediction from a tweet. Outputs a sentiment value
    between -1 and 1, with -1 being the most negative and 1 being most positive. 
    
    Args:
        tweet (str): the tweet to be analysed
    """
    doc = nlp(tweet)
    sentiment = doc._.blob.polarity
    return sentiment

In [17]:
def spacy_get_predictions(data, tweet_colunm):
    """
    Gets a prediction of the sentiment of tweets in a dataframe. 
    
    Args:
        data: the dataframe to be analysed
        tweet_column: the name of the column containing the tweets
    """
    data['clean_pred'] = data[tweet_colunm].apply(spacy_analyze_sentiment)
    data['clean_pred'] = data['clean_pred'].apply(round_score)
    return data

In [18]:
# Make predictions
spacy_predictions_df = spacy_get_predictions(df, clean_tweets)

In [19]:
spacy_predictions_df.head()

Unnamed: 0,CleanedTweet,Sentiment,clean_pred
480,amid overall panic nonavailability essential f...,0,1
481,thought company hero health crisis coronavirus...,2,1
482,boomer looked millennials killing brick mortar...,0,1
483,idiot stole center console last night take han...,0,1
484,get creative confinement week launched x submi...,2,2


In [20]:
# Save to BigQuery
load_to_bq(spacy_table_id, spacy_predictions_df, write_disposition="WRITE_TRUNCATE")

## Prediction Evaluations

In [21]:
def load_from_bq(table_ref): 
    """
    Load a table from BigQuery as a pandas dataframe. 
    
    Args:
        table_ref: The BigQuery table name - {project}.{dataset}.{table}
    """
    client = bigquery.Client()
    table = client.get_table(table_ref)
    df = client.list_rows(table).to_dataframe()
    return df

In [22]:
def get_evaluation(y_true, y_pred, api_name):
    """
    Get evaluation scores using the predicted and real values.
    
    Args:
        y_true: list of actual values
        y_pred: list of predicted values
        api_name: name of the api being used to use as an identifier in the evaluation table
    """
    f1 = f1_score(y_true, y_pred, average="weighted")
    lk = cohen_kappa_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    evaluation_results = {'time': datetime.datetime.now().strftime('%A %d %B %Y %I:%M:%S%p'),
                          'API': api_name,
                          'f1Score': f1,
                          'linearKappa': lk, 
                          'meanSquaredError': mse, 
                          'meanAbsoluteError': mae, 
                          'precision': precision, 
                          'recall': recall}
    return evaluation_results

In [23]:
# Get SpaCy Predictions
spacy_df = load_from_bq(spacy_table_id)
spacy_y_true = spacy_df['Sentiment'].tolist()
spacy_y_pred = spacy_df['clean_pred'].tolist()
spacy_evaluations = get_evaluation(spacy_y_true, spacy_y_pred, "spacy")
spacy_evaluations_df = pd.DataFrame(spacy_evaluations, index=[0])

In [24]:
spacy_evaluations_df.head()

Unnamed: 0,time,API,f1Score,linearKappa,meanSquaredError,meanAbsoluteError,precision,recall
0,Wednesday 25 January 2023 02:56:35PM,spacy,0.338236,0.1,0.75,0.65,0.509456,0.4


In [25]:
# Get Google Predictions
google_df = load_from_bq(google_table_id)
google_y_true = google_df['Sentiment'].tolist()
google_y_pred = google_df['clean_pred'].tolist()
google_evaluations = get_evaluation(google_y_true, google_y_pred, "google")
google_evaluations_df = pd.DataFrame(google_evaluations, index=[0])

In [26]:
google_evaluations_df.head()

Unnamed: 0,time,API,f1Score,linearKappa,meanSquaredError,meanAbsoluteError,precision,recall
0,Wednesday 25 January 2023 02:56:40PM,google,0.46694,0.2,0.683333,0.583333,0.480331,0.466667


In [27]:
# Load to BQ Table
load_to_bq(evaluation_table_id, spacy_evaluations_df, write_disposition="WRITE_APPEND")
load_to_bq(evaluation_table_id, google_evaluations_df, write_disposition="WRITE_APPEND")

## Confusion Matrix

In [28]:
def convert_cm_to_percentage(cm):
    """
    Takes a confusion matrix as input and outputs a confusion matrix with predictions as a 
    percentage of the total true values. 
    
    Args:
        cm: confusion matrix to convert
    """
    confusion_percentage_accurracies = []
    for i in cm:
        for j in i:
            confusion_percentage_accurracies.append(j/int(sum(i)))


    cm_size = int(math.sqrt(len(confusion_percentage_accurracies)))
    length_to_split = [cm_size] * cm_size
    iter_item = iter(confusion_percentage_accurracies)
    confusion_percentage_array = [list(islice(iter_item, elem))
            for elem in length_to_split]
    
    return confusion_percentage_array


In [29]:
def create_cm(y_true, y_pred):
    """
    Create a confusion matrix from real and predicted values.
    Args:
        y_true: list of actual values
        y_pred: list of predicted values
    """
    return confusion_matrix(y_true, y_pred, labels = [0,1,2])

In [111]:
def create_cm_df(cm, model_id):
    """
    Create a pandas dataframe of the confusion matrix, containing model id and more
    detailed labelling for columns and rows.
    
    Args:
        cm: confusion matrix 
        model_id: ID of the model to use as a column value
    """
    pred_cols = ['Negative_pred', 'Neutral_pred', 'Positive_pred']
    actual_cols = ['Negative', 'Neutral', 'Positive']
    df = pd.DataFrame(cm, columns=pred_cols)
    df.insert(loc=0, column='model_id', value=[model_id, model_id, model_id])
    df.insert(loc=1, column='Sentiment_actuals', value=actual_cols)
    return df

In [109]:
# Create confusion matrices of both APIs as integers and percentages
google_cm = create_cm(google_y_true, google_y_pred)
spacy_cm = create_cm(spacy_y_true, spacy_y_pred)

spacy_cm_percentage = convert_cm_to_percentage(spacy_cm)
google_cm_percentage = convert_cm_to_percentage(google_cm)

In [110]:
print(google_cm)
print(spacy_cm)

[[13  6  1]
 [ 8  7  5]
 [ 2 10  8]]
[[ 2 15  3]
 [ 1 17  2]
 [ 0 15  5]]


In [112]:
# Create dataframes of confusion matrices that can be uploaded to BQ
google_model_id = ("google_" + str(datetime.datetime.now().strftime('%d-%m-%Y_%I:%M:%S')))
spacy_model_id = ("spacy_" + str(datetime.datetime.now().strftime('%d-%m-%Y_%I:%M:%S')))

google_cm_df = create_cm_df(google_cm, google_model_id)
spacy_cm_df = create_cm_df(spacy_cm, spacy_model_id)

google_cm_percentage_df = create_cm_df(google_cm_percentage, google_model_id)
spacy_cm_percentage_df = create_cm_df(spacy_cm_percentage, spacy_model_id)

In [114]:
google_cm_df

Unnamed: 0,model_id,Sentiment_actuals,Negative_pred,Neutral_pred,Positive_pred
0,google_25-01-2023_03:12:04,Negative,13,6,1
1,google_25-01-2023_03:12:04,Neutral,8,7,5
2,google_25-01-2023_03:12:04,Positive,2,10,8


In [115]:
spacy_cm_df

Unnamed: 0,model_id,Sentiment_actuals,Negative_pred,Neutral_pred,Positive_pred
0,spacy_25-01-2023_03:12:04,Negative,2,15,3
1,spacy_25-01-2023_03:12:04,Neutral,1,17,2
2,spacy_25-01-2023_03:12:04,Positive,0,15,5


In [116]:
# Save confusion matrices to BQ
load_to_bq(google_cm_table_id, google_cm_df, "WRITE_APPEND")
load_to_bq(spacy_cm_table_id, spacy_cm_df, "WRITE_APPEND")
load_to_bq(google_cm_pct_table_id, google_cm_percentage_df, "WRITE_APPEND")
load_to_bq(spacy_cm_pct_table_id, spacy_cm_percentage_df, "WRITE_APPEND")