In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk
# Fetch, in the original order, every NLTK resource requested by this notebook.
# The trailing [-1] keeps the cell's displayed value equal to the status
# returned by the final download ('vader_lexicon').
[
    nltk.download(resource_name)
    for resource_name in (
        'punkt',
        'averaged_perceptron_tagger',
        'maxent_ne_chunker',
        'words',
        'vader_lexicon',
    )
][-1]

[nltk_data] Downloading package punkt to /Users/afrazaman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/afrazaman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/afrazaman/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/afrazaman/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/afrazaman/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Read in data
# Loads the raw dataset into `df`. The path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('d1_labled.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,text,id
0,0,@hoemiceder also the same is with my depressio...,640598812638232576
1,1,who the fuck let me watch the story of my life...,640582764358729728
2,2,@Atypical_LDS I have been diagnosed with depre...,640060061050499072
3,3,@markiplier hey I love that you did this chari...,639953051609120768
4,4,@remove32 I have been diagnosed with depressio...,639249254813315074


In [4]:
# Discard the `id` column that came with the CSV (mutates `df` in place).
df.drop(columns='id', inplace=True)

In [5]:
# Promote the saved 'Unnamed: 0' column to be the new `id`, then materialise
# the row index as a leading 'index' column.
df = df.rename(columns={'Unnamed: 0': 'id'}).reset_index()

In [6]:
# Preview the frame after the rename; it now carries 'index', 'id' and 'text'.
df.head()

Unnamed: 0,index,id,text
0,0,0,@hoemiceder also the same is with my depressio...
1,1,1,who the fuck let me watch the story of my life...
2,2,2,@Atypical_LDS I have been diagnosed with depre...
3,3,3,@markiplier hey I love that you did this chari...
4,4,4,@remove32 I have been diagnosed with depressio...


In [7]:
# Remove the helper 'index' column (in place) and preview what is left.
df.drop(columns=['index'], inplace=True)
df.head()

Unnamed: 0,id,text
0,0,@hoemiceder also the same is with my depressio...
1,1,who the fuck let me watch the story of my life...
2,2,@Atypical_LDS I have been diagnosed with depre...
3,3,@markiplier hey I love that you did this chari...
4,4,@remove32 I have been diagnosed with depressio...


In [9]:
# Sanity check on the cleaned frame: expected shape is (6470, 2), i.e. 6470
# rows with the two columns `id` and `text`.
df.shape

(6470, 2)

# Basic NLTK

In [10]:
# Take the text stored under row label 50 as the running example and show it.
example = df.loc[50, 'text']
print(example)

I have been diagnosed with: POST CONCERT DEPRESSION   @5SOS   #ROWYSOTour 

(06-05-2015)


In [11]:
# Split the example text into word/punctuation tokens with NLTK and show the
# first ten.
tokens = nltk.word_tokenize(example)
tokens[:10]

['I',
 'have',
 'been',
 'diagnosed',
 'with',
 ':',
 'POST',
 'CONCERT',
 'DEPRESSION',
 '@']

In [12]:
# Attach a part-of-speech tag to every token; the result is a list of
# (token, tag) pairs, of which the first ten are displayed.
tagged = nltk.pos_tag(tokens)
tagged[:10]

[('I', 'PRP'),
 ('have', 'VBP'),
 ('been', 'VBN'),
 ('diagnosed', 'VBN'),
 ('with', 'IN'),
 (':', ':'),
 ('POST', 'NNP'),
 ('CONCERT', 'NNP'),
 ('DEPRESSION', 'NNP'),
 ('@', 'NNP')]

In [13]:
# Group the POS-tagged tokens into named-entity chunks and pretty-print the
# resulting tree.
entities = nltk.ne_chunk(tagged)
entities.pprint()

(S
  I/PRP
  have/VBP
  been/VBN
  diagnosed/VBN
  with/IN
  :/:
  (ORGANIZATION POST/NNP)
  CONCERT/NNP
  DEPRESSION/NNP
  @/NNP
  5SOS/CD
  #/#
  ROWYSOTour/NNP
  (/(
  06-05-2015/CD
  )/))


# RoBERTa

In [14]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Instantiate NLTK's SentimentIntensityAnalyzer.
# NOTE(review): `sia` is not referenced by any other visible cell — confirm it
# is still needed.
sia = SentimentIntensityAnalyzer()

In [16]:
# Checkpoint identifier shared by the tokenizer and the classifier so the two
# always come from the same pretrained model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [17]:
# Score the single example text with the RoBERTa model.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First element of the model output, first (only) sequence in the batch,
# converted to NumPy and normalised with softmax.
scores = softmax(output[0][0].detach().numpy())
# Express each class probability as a percentage.
scores_dict = {
    sentiment: probability * 100
    for sentiment, probability in zip(('Negative', 'Neutral', 'Positive'), scores)
}
print(scores_dict)

{'Negative': 62.8783643245697, 'Neutral': 34.785327315330505, 'Positive': 2.3363040760159492}


In [18]:
# Wraps the single-example RoBERTa scoring steps in a reusable function.
def polarity_scores_roberta(example, max_length=512):
    """Score one piece of text with the RoBERTa sentiment model.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, optional
        Upper bound on the number of tokens fed to the model; longer inputs
        are truncated instead of making the model raise ``RuntimeError``.
        NOTE(review): 512 is presumably this checkpoint's maximum sequence
        length — confirm against ``model.config``.

    Returns
    -------
    dict
        Keys ``'Negative'``, ``'Neutral'`` and ``'Positive'``, each mapped to
        the softmax probability of that class expressed as a percentage.
    """
    # Truncate explicitly: without it an over-long text fails inside the model
    # and the calling loop silently drops that row.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # First output element, first (only) sequence of the batch -> NumPy.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'Negative': scores[0] * 100,
        'Neutral': scores[1] * 100,
        'Positive': scores[2] * 100,
    }

In [19]:
# Score every row of `df` and collect the per-class percentages in `res`,
# keyed by the row's `id`.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['text']
    myid = row['id']
    try:
        # Run the (expensive) model exactly once per row and reuse the result.
        roberta_result = polarity_scores_roberta(text)
    except RuntimeError:
        # Best effort: report the failing row and continue with the next one,
        # so a single bad input does not abort the whole run.
        print(f'Broke for id {myid}')
        continue
    res[myid] = roberta_result

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/6470 [00:00<?, ?it/s]

In [33]:
# `res` maps id -> {class: percentage}; building a frame from it puts ids on
# the columns, so transpose to get one row per id.
results_df = pd.DataFrame(res).transpose()
results_df

Unnamed: 0,Negative,Neutral,Positive
0,9.199365,38.887912,51.912725
1,97.863883,1.968073,0.168050
2,21.727206,41.750872,36.521921
3,0.417567,1.248977,98.333460
4,50.494641,46.883336,2.622027
...,...,...,...
6488,45.552954,46.042645,8.404405
6489,95.850962,3.680779,0.468246
6490,4.824481,39.570728,55.604792
6491,88.312870,10.159244,1.527890


In [34]:
# Name the index 'id' and move it out into a regular column of that name.
results_df = results_df.rename_axis('id').reset_index()
results_df

Unnamed: 0,id,Negative,Neutral,Positive
0,0,9.199365,38.887912,51.912725
1,1,97.863883,1.968073,0.168050
2,2,21.727206,41.750872,36.521921
3,3,0.417567,1.248977,98.333460
4,4,50.494641,46.883336,2.622027
...,...,...,...,...
6465,6488,45.552954,46.042645,8.404405
6466,6489,95.850962,3.680779,0.468246
6467,6490,4.824481,39.570728,55.604792
6468,6491,88.312870,10.159244,1.527890


In [35]:
# Attach the original text to every scored row. The join key is spelled out so
# the merge cannot silently switch to a different key set if the two frames
# ever share another column name.
results_df = results_df.merge(df, how="left", on="id")
results_df.head()

Unnamed: 0,id,Negative,Neutral,Positive,text
0,0,9.199365,38.887912,51.912725,@hoemiceder also the same is with my depressio...
1,1,97.863883,1.968073,0.16805,who the fuck let me watch the story of my life...
2,2,21.727206,41.750872,36.521921,@Atypical_LDS I have been diagnosed with depre...
3,3,0.417567,1.248977,98.33346,@markiplier hey I love that you did this chari...
4,4,50.494641,46.883336,2.622027,@remove32 I have been diagnosed with depressio...


In [36]:
# Persist the scores together with the text, relative to the working directory.
# NOTE(review): the row index is written as well, which presumably shows up as
# an 'Unnamed: 0' column on reload — confirm whether index=False is wanted.
results_df.to_csv('d1_labled_6k_RoBERTa.csv')

In [37]:
# Remove the 'Neutral' score column (mutates `results_df` in place).
results_df.drop(columns='Neutral', inplace=True)

In [38]:
# Display the frame now that the 'Neutral' column is gone.
results_df

Unnamed: 0,id,Negative,Positive,text
0,0,9.199365,51.912725,@hoemiceder also the same is with my depressio...
1,1,97.863883,0.168050,who the fuck let me watch the story of my life...
2,2,21.727206,36.521921,@Atypical_LDS I have been diagnosed with depre...
3,3,0.417567,98.333460,@markiplier hey I love that you did this chari...
4,4,50.494641,2.622027,@remove32 I have been diagnosed with depressio...
...,...,...,...,...
6465,6488,45.552954,8.404405,Hey if any of you cut yourself or used too.. I...
6466,6489,95.850962,0.468246,I'm diagnosed with depression but I'm not cutt...
6467,6490,4.824481,55.604792,@TweetsEncourage @SoBlessedKyrie I'm diagnosed...
6468,6491,88.312870,1.527890,I'm diagnosed with depression... I don't tweet...


In [39]:
# Remove the 'Positive' score column (mutates `results_df` in place).
results_df.drop(columns='Positive', inplace=True)

In [40]:
# Display the frame now that the 'Positive' column is gone.
results_df

Unnamed: 0,id,Negative,text
0,0,9.199365,@hoemiceder also the same is with my depressio...
1,1,97.863883,who the fuck let me watch the story of my life...
2,2,21.727206,@Atypical_LDS I have been diagnosed with depre...
3,3,0.417567,@markiplier hey I love that you did this chari...
4,4,50.494641,@remove32 I have been diagnosed with depressio...
...,...,...,...
6465,6488,45.552954,Hey if any of you cut yourself or used too.. I...
6466,6489,95.850962,I'm diagnosed with depression but I'm not cutt...
6467,6490,4.824481,@TweetsEncourage @SoBlessedKyrie I'm diagnosed...
6468,6491,88.312870,I'm diagnosed with depression... I don't tweet...


In [41]:
# Persist the current state of `results_df` (after the 'Neutral' and
# 'Positive' columns were dropped) to a separate CSV.
# NOTE(review): the row index is written too — confirm whether index=False is
# wanted.
results_df.to_csv('d1_labled_6k_Neg_RoBERTa.csv')

In [42]:
# Create the 'level' column with an empty-string placeholder in every row; the
# threshold cell further down fills it in.
results_df['level'] = ''

In [43]:
# Display the frame with the still-empty 'level' column.
results_df

Unnamed: 0,id,Negative,text,level
0,0,9.199365,@hoemiceder also the same is with my depressio...,
1,1,97.863883,who the fuck let me watch the story of my life...,
2,2,21.727206,@Atypical_LDS I have been diagnosed with depre...,
3,3,0.417567,@markiplier hey I love that you did this chari...,
4,4,50.494641,@remove32 I have been diagnosed with depressio...,
...,...,...,...,...
6465,6488,45.552954,Hey if any of you cut yourself or used too.. I...,
6466,6489,95.850962,I'm diagnosed with depression but I'm not cutt...,
6467,6490,4.824481,@TweetsEncourage @SoBlessedKyrie I'm diagnosed...,
6468,6491,88.312870,I'm diagnosed with depression... I don't tweet...,


In [44]:
# Bucket each row by its 'Negative' percentage:
#   >= 80        -> 'Severe'
#   [30, 80)     -> 'Moderate'
#   < 30         -> 'Mild'
# Rows matching none of the rules keep whatever 'level' they already had.
negative_pct = results_df['Negative']
level_rules = (
    ('Severe', negative_pct >= 80),
    ('Moderate', (negative_pct >= 30) & (negative_pct < 80)),
    ('Mild', negative_pct < 30),
)
for level_name, level_mask in level_rules:
    results_df.loc[level_mask, 'level'] = level_name

In [45]:
# Display the frame with the 'level' labels filled in.
results_df

Unnamed: 0,id,Negative,text,level
0,0,9.199365,@hoemiceder also the same is with my depressio...,Mild
1,1,97.863883,who the fuck let me watch the story of my life...,Severe
2,2,21.727206,@Atypical_LDS I have been diagnosed with depre...,Mild
3,3,0.417567,@markiplier hey I love that you did this chari...,Mild
4,4,50.494641,@remove32 I have been diagnosed with depressio...,Moderate
...,...,...,...,...
6465,6488,45.552954,Hey if any of you cut yourself or used too.. I...,Moderate
6466,6489,95.850962,I'm diagnosed with depression but I'm not cutt...,Severe
6467,6490,4.824481,@TweetsEncourage @SoBlessedKyrie I'm diagnosed...,Mild
6468,6491,88.312870,I'm diagnosed with depression... I don't tweet...,Severe


In [46]:
# Persist the final frame, including the 'level' column, to CSV.
# NOTE(review): the row index is written too — confirm whether index=False is
# wanted.
results_df.to_csv('d1_labled_6k_Neg_level_RoBERTa.csv')