In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import re
import random

# NSW Floods:

In [36]:
# Location of the tweet export on the mounted Google Drive.
NSW_FLOODS_CSV = '/content/drive/MyDrive/Colab Notebooks/Hackathon/NSWFloods-full.csv'

# Column labels are supplied explicitly through `names`.
df = pd.read_csv(NSW_FLOODS_CSV, names=['id', 'tweet'])

In [37]:
df

Unnamed: 0,id,tweet
0,D0b68jVnBE4l9f1On001cg,⚠️Updated Moderate #Flood Warning issued for t...
1,ZbOcM3d3basI9BtWuMRAoA,⚠️Initial #Flood Warning issued for the #Hasti...
2,YXie--2WpgYgKknpibF7yg,⚠️Initial #Flood Warning issued for the #Cooks...
3,2Qy9w_29npc8ic3xk4b9pw,⚠️ #Flood Watch issued for the Belubula and Tu...
4,8LvJZ4yjapp_dAToTiQR3A,NSW Flood Watch for Belubula and Tumut Rivers....
...,...,...
15984,DwVOnBIS3Thf3Zru-RXdlg,@98mO0Izh9S6gMjxej_FTXA As long as he is not i...
15985,lqAz46CuTiWWUJPZDKum8A,@7Md9QLKM7IET5g2rCJTUYQ @hnNjuTCG1kjJqgp4KUmhV...
15986,_kLWfTg0Q4-sLMIbCPoRQg,@8h2KzGMamZag4K4hkq7EAQ Oh my.
15987,qBZvLidlzzH9z6QF_KwkZg,@U1B-mS4EDA4dVUwt79OJxw Thank you lovely x


In [38]:
# Discard the tweet identifier; only the text is needed from here on.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns = 'id', errors = 'ignore')

In [39]:
df

Unnamed: 0,tweet
0,⚠️Updated Moderate #Flood Warning issued for t...
1,⚠️Initial #Flood Warning issued for the #Hasti...
2,⚠️Initial #Flood Warning issued for the #Cooks...
3,⚠️ #Flood Watch issued for the Belubula and Tu...
4,NSW Flood Watch for Belubula and Tumut Rivers....
...,...
15984,@98mO0Izh9S6gMjxej_FTXA As long as he is not i...
15985,@7Md9QLKM7IET5g2rCJTUYQ @hnNjuTCG1kjJqgp4KUmhV...
15986,@8h2KzGMamZag4K4hkq7EAQ Oh my.
15987,@U1B-mS4EDA4dVUwt79OJxw Thank you lovely x


# Filtering English tweets:

In [None]:
# If required run this cell
# Keep only the rows whose 'lang' column equals 'en'. The frame built by the
# cells above has no 'lang' column, so fall back to the unfiltered frame
# instead of failing with KeyError when the column is absent.
if 'lang' in df.columns:
  filt = (df['lang'] == 'en')
  en_df = df[filt]
else:
  en_df = df
en_df

# Preprocess the data:

In [7]:
# if data needs to be cleaned, run this cell
# Build the working frame: a cleaned 'tweet' column plus an (initially empty)
# 'sentiment' column that is filled in later.
sentiment_df = pd.DataFrame(columns = ['tweet', 'sentiment'])

# Cleaning passes, applied in order: strip URLs, strip @mentions, then drop
# every character that is not an ASCII letter, digit, hyphen or space.
sentiment_df['tweet'] = (
    df['tweet']
    .apply(lambda raw: re.sub(r'https?://\S+', '', raw))
    .apply(lambda raw: re.sub(r'@\S+', '', raw))
    .apply(lambda raw: re.sub(r'[^0-9a-zA-Z- ]+', '', raw))
)

sentiment_df = sentiment_df.reset_index(drop = True)
sentiment_df

Unnamed: 0,tweet,sentiment
0,Updated Moderate Flood Warning issued for the ...,
1,Initial Flood Warning issued for the Hastings ...,
2,Initial Flood Warning issued for the Cooks Riv...,
3,Flood Watch issued for the Belubula and Tumut...,
4,NSW Flood Watch for Belubula and Tumut Rivers ...,
...,...,...
15984,As long as he is not in Melbourne or anywhere...,
15985,If they had stopped us then why the following,
15986,Oh my,
15987,Thank you lovely x,


## Util Functions (Predictor_class):

In [40]:
#Util functions:
#------------------

# BERT model:
class SentimentModel:
  """Sentiment predictor backed by a pretrained BERT sequence classifier.

  The tokenizer and model for
  'nlptown/bert-base-multilingual-uncased-sentiment' are loaded once, when the
  object is constructed.
  """

  def __init__(self):
    checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
    self._tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    self._model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

  def predict_sentiment(self,tweet):
    """Print and return the predicted sentiment class of a single tweet.

    URLs and every character other than ASCII letters, digits, hyphens and
    spaces are removed before tokenization.

    Args:
      tweet (str): raw tweet text.

    Returns:
      int: 1-based index of the highest-scoring class (argmax of the logits
      plus one). The same value is printed.
    """
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'[^0-9a-zA-Z- ]+', '', tweet)
    # truncation=True keeps over-long inputs within the model's maximum length
    # instead of failing inside the forward pass.
    tokens = self._tokenizer.encode(tweet, return_tensors = 'pt', truncation = True)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
      result = self._model(tokens)
    rating = int(torch.argmax(result.logits)) + 1
    print(f'--THE SENTIMENT OF THE TWEET - {tweet}  : {rating}\n')
    return rating


#------------------

# RoBERTa Model:
class ZeroShotClassifier:
  """Zero-shot sentiment labeller (positive / neutral / negative).

  Wraps a 'zero-shot-classification' pipeline. Call `fit` to store the
  preprocessed text(s), then `predict` to print the first label the pipeline
  returns for each stored text.
  """

  def __init__(self, zs_model = 'roberta-large-mnli'):
    self._classifier = pipeline('zero-shot-classification', model= zs_model)
    self._labels = ['positive', 'neutral', 'negative']
    self._tweets = None

  def _preprocess(self, text):
    """Strip URLs, then every char that is not an ASCII letter, digit, '-' or space."""
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^0-9a-zA-Z- ]+', '', text)
    return text

  def fit(self, text, column = 'text'):
    """Preprocess and store the text(s) to classify.

    Args:
      text: a single string, a list/tuple/Series of strings, or a DataFrame.
      column (str): column holding the text when `text` is a DataFrame.
        Defaults to 'text'.

    A collection is stored as a plain list so that `predict` takes its
    per-tweet branch; a single string is stored as a string.
    """
    if isinstance(text, pd.DataFrame):
      self._tweets = [self._preprocess(item) for item in text[column]]
    elif isinstance(text, (pd.Series, list, tuple)):
      self._tweets = [self._preprocess(item) for item in text]
    else:
      # Previously this assignment also ran after the DataFrame branch and
      # crashed in re.sub; it is now the single-string case only.
      self._tweets = self._preprocess(text)

  def predict(self):
    """Print the first label returned by the pipeline for every stored text."""
    _hypothesis_template = 'The sentiment of this review is {}.'
    if isinstance(self._tweets,list):
      for tweet in self._tweets:
        _sentiment = self._classifier(tweet, self._labels, hypothesis_template = _hypothesis_template)
        print(f'--THE SENTIMENT OF THE TWEET - {tweet}  : {_sentiment["labels"][0]}\n')
    else:
      _sentiment = self._classifier(self._tweets, self._labels, hypothesis_template = _hypothesis_template)
      print(f'--THE SENTIMENT OF THE TWEET - {self._tweets}  : {_sentiment["labels"][0]}\n')
            

# BERT Pretrained model:

In [None]:
model =  SentimentModel()

In [None]:
model.predict_sentiment(sentiment_df['tweet'][100])

# Zero-shot classification (RoBERTa) for positive, neutral and negative sentiment:

In [41]:
ZSModel = ZeroShotClassifier()

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
# test prediction
# Smoke test: run the zero-shot model on one tweet taken from sentiment_df.
tweet = sentiment_df['tweet'][456]
# fit() preprocesses the text and stores it on the model ...
ZSModel.fit(tweet)
# ... and predict() prints the first label the classifier returns for it.
ZSModel.predict()

--THE SENTIMENT OF THE TWEET -      This isnt good enough How are ppl meant to work kids get to school and families access services or simply shop This isnt 18th century living  : negative



# ZeroShotClassification (RoBERTa) Model:

Performs unsupervised sentiment analysis of each tweet in sentiment_df:

In [45]:
# Util Function:
# --------------
# Module-level state read by predict_sentiment_zsc.

# Sentence pattern the zero-shot pipeline fills with each candidate label.
hypothesis_template = 'The sentiment of this review is {}.'

# Candidate labels offered to the classifier.
labels = ['positive', 'neutral', 'negative']

# Zero-shot classification pipeline.
classifier = pipeline(
    'zero-shot-classification',
    model = 'roberta-large-mnli',
)

# Number of tweets handled so far; used only for progress messages.
mark = 0

def predict_sentiment_zsc(tweet):
  """Classify one tweet with the zero-shot pipeline and encode the label.

  Reads the module-level `classifier`, `labels` and `hypothesis_template`, and
  increments the module-level counter `mark`, printing a progress line on
  every 10th call.

  Args:
    tweet (str): text to classify.

  Returns:
    int or None: 1 for 'positive', 0 for 'neutral', -1 for 'negative';
    None if the first returned label is none of these.
  """
  global mark
  mark+=1
  if mark%10==0:
    # The counter is cumulative. The former `mark == 0` line here was a
    # comparison with no effect and has been removed.
    print(f'Processed {mark} tweets')
  sentiment = classifier(tweet, labels, hypothesis_template = hypothesis_template)
  score_by_label = {'positive': 1, 'neutral': 0, 'negative': -1}
  return score_by_label.get(sentiment['labels'][0])

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Label every tweet in sentiment_df. Classify the cleaned text held in
# sentiment_df rather than the raw df['tweet'], which still contains URLs,
# @mentions and emoji.
sentiment_df['sentiment'] = sentiment_df['tweet'].apply(predict_sentiment_zsc)

In [None]:
# Persist the labelled tweets to the current working directory. No `index`
# argument is passed, so pandas' default applies and the row index is written
# as the first CSV column.
sentiment_df.to_csv('nswflood_full_Sentiment(RoBERTa).csv')