In [None]:
!pip install transformers==3.1.0

# Load in some Tweets and filtered to vaccine specific ones:

In [26]:
TweetsDF=pd.read_csv( 'twitter_data_cleaned.csv')
TweetsDF=TweetsDF.drop_duplicates(subset=['full_text'])
VaccTweetsDF=TweetsDF[TweetsDF['full_text'].str.lower().str.contains("vac")]
sampledSequences=VaccTweetsDF.full_text.tolist()

In [27]:
len(sampledSequences)

31454

# Use a transformer model (tuned for Natural Language Inference) to look for  Tweets that logically entail the hypothesis of "don't want to get the Covid vaccine."

In [None]:
from transformers import pipeline
import pandas as pd

#classifier = pipeline("zero-shot-classification")

classifier = pipeline("zero-shot-classification", device=0) # to utilize GPU

In [None]:
%%time

sequences=sampledSequences[0:1000]

candidate_labels = ["want", "don't want"]
hypothesis_template = "{} to get the Covid vaccine."

outputs=classifier(sequences, candidate_labels, hypothesis_template=hypothesis_template)#, multi_class=True)

outputDF=pd.DataFrame(outputs)
ExportDF=outputDF.set_index('sequence').apply(pd.Series.explode).reset_index()

# Unfortunately this is too computationally intensive 

I didn't figure out how to clear GPU memory cache programatically on Colab, so only ran it for 1000 tweets with a few manual runtime resets)

In [None]:
ExportDF.to_csv('VaccTweets_Label_1000.csv')

In [16]:
import glob

path = '/content/Labelled' # use your path
all_files = glob.glob( path+ "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

LabelledDF = pd.concat(li, axis=0, ignore_index=True)

In [19]:
LabelledDF.head()

Unnamed: 0.1,Unnamed: 0,sequence,labels,scores
0,0,Please reply to this tweet if you want to shar...,want,0.987453
1,1,Please reply to this tweet if you want to shar...,don't want,0.012547
2,2,Gov. Ron DeSantis giving his rich friends the ...,want,0.980986
3,3,Gov. Ron DeSantis giving his rich friends the ...,don't want,0.019014
4,4,TRIPLE WORRISOME—Data says the new variant tha...,don't want,0.527854


In [21]:
LabelledDF[(LabelledDF.labels=="don't want") & (LabelledDF.scores>0.5)].shape

(108, 4)

In [22]:
antVacc_Tweets=LabelledDF[(LabelledDF.labels=="don't want") & (LabelledDF.scores>0.5)]

In [None]:
antVacc_Tweets.to_csv('antVacc_Tweets.csv', index=False)

#Read back and filter to strong anti-vaccine tweets:

In [4]:
antVacc_Tweets=pd.read_csv('antVacc_Tweets_full.csv')

In [5]:
strong_antVacc_Tweets=antVacc_Tweets[antVacc_Tweets.scores>0.8]

In [6]:
strong_antVacc_Tweets.shape

(52, 17)

# Use a Q&A transformer model to directly "ask" the model what are the "reasons for not taking the vaccine?" based on each of the strong anti-vaccine tweets:

In [103]:
# DEMO

context = ['People have stated that they will never take the vaccine, either because they are anti-vax, conspiracy theorists, or feel that the current vaccines are rushed and possibly dangerous']

QA = pipeline("question-answering")

result = QA(question="what are the top 10 reasons for not taking the vaccine?", context=context[0])
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")



The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.



Answer: 'anti-vax, conspiracy theorists,', score: 0.5293, start: 82, end: 113


In [None]:
context=strong_antVacc_Tweets.full_text[:].tolist()

In [9]:
%%time

# Run on all 52 strong anti-vaccine tweets:

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True)

# Examples for context:
# context = ['People have stated that they will never take the vaccine, either because they are anti-vax, conspiracy theorists, or feel that the current vaccines are rushed and possibly dangerous',\
#            'Protesters demonstrate against vaccine coercion, green passports in Tel Aviv. Anti-vax campaigners, and Covid 19 sceptics, can come out with the most offensive, and ridiculous, analogies ',\
#            'Less than 1 week after urging Republicans to block $160B in vaccine funding, Minority Whip @SteveScalise is using his position to elevate Republicans who have publicly urged Americans not to take the COVID-19 vaccine. ',
#            'Community Lattice exists to advance community development by equipping people with the resources they need to create economically sustainable, socially equitable, and resilient places.']

question = "what are the reasons for not taking COVID vaccine?"

answers=[]

for text in context:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    answers.append(answer)

    #print(f"Question: {question}")
    print(f"Answer: {answer}")

Answer: publicly urged americans not to take the covid - 19 vaccine
Answer: integrity
Answer: 
Answer: protesters demonstrate against vaccine coercion , green passports in tel aviv . anti - vax campaigners , and covid 19 sceptics , can come out with the most offensive , and ridiculous , analogies
Answer: the reasons health care workers have declined are complicated
Answer: they are anti - vax , conspiracy theorists , or feel that the current vaccines are rushed and possibly dangerous
Answer: the amount of anti - vax folks in the beginning of this pandemic and the scarcity of covid - 19
Answer: 
Answer: 
Answer: africans don ’ t like your human / animal experimentations
Answer: they don ’ t want to help biden stop the pandemic
Answer: inaccuracies and misinformation
Answer: 2 florida women busted while dressed up as “ grannies ”
Answer: she doesnt feel well
Answer: 2 florida women busted while dressed up as “ grannies ”
Answer: a sore arm and a rash
Answer: 
Answer: they are labelled an

## Get Sentence-Level Embedding for the Answers:

In [12]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(answers)

print(len(embeddings))





52


# Clustering using KMeans and plot with T-SNE to spot clusters of reasons for not wanting to take the vaccines:

In [64]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

In [96]:
def tsnescatterplot(sentences):
    USE_embeddings = embed(sentences).numpy()
    word_labels = sentences

    # find tsne coords for 3 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    projections = tsne.fit_transform(USE_embeddings)

    kmeans = KMeans(n_clusters=8, random_state=0).fit(USE_embeddings)

    fig = px.scatter(
        projections, x=0, y=1, #z=2,
        hover_name = word_labels,
        color=kmeans.labels_.astype(str),
        labels={'color': 'kmeans.labels_'}
    )
    fig.show()

In [97]:
tsnescatterplot(answers)

# It looks like the overall approach and the large pre-trained transformer models allow for sematically clustering anti-vaccine reasons on the sentence level. Check out the green cluster (0), light blue (3) and the dark orange cluster (5). Sentences from 0 seens to be talking about immune system, and sentences from 5 are about integrity/ineligiblity/inaccuracies, while those from 3 are generally related to the concept of getting ill/death. Download the html at https://github.com/shuanglovesdata/Mar21-vaccine-uptake/blob/sc/Clustering-Plot.html and open locally to see or interact with the plot.

# Once this workflow is scaled up to thousands of tweets/texts, the clustering will become much more salient and allow the user/partner organization to semantically explore the text media landscape by literally "asking" any questions on the text data through the models.