<a href="https://colab.research.google.com/github/richardOlson/nlp__tranformers/blob/main/reddit_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook tags Reddit posts: it extracts the organizations mentioned in each post with spaCy and scores each post's sentiment with flair.

In [1]:
from google.colab import files
import pandas as pd
import numpy as np
import spacy

In [None]:
# Open Colab's upload dialog so a local file can be placed in the runtime
# (presumably the Reddit CSV that is read further down — confirm).
files.upload()

In [3]:
# Download spaCy's small English model (en_core_web_sm) into this runtime so
# that it can be loaded with spacy.load afterwards.
! python -m spacy download en_core_web_sm -q

[K     |████████████████████████████████| 12.0 MB 21.8 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [4]:
# Load the small English pipeline; its named entities are what we use to find
# the organizations mentioned in the dataframe.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Read the pipe-delimited Reddit export into a dataframe, report its
# dimensions and preview the first rows.
df = pd.read_csv("/content/reddit .csv", sep="|")
print(f"The shape is {df.shape}")
df.head()

The shape is (4001, 9)


Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_p220ix,1628642000.0,investing,Is it a pragmatic decision to allocate a certa...,My total net worth across all of my accounts i...,0.91,9,0,9
1,t3_p2131v,1628638000.0,investing,"FGI, Steady Downtrend Since December (Peak 92 ...","Not making any statements, assumptions or pred...",0.5,0,0,0
2,t3_p2080p,1628636000.0,investing,Nintendo - hold or sell currently at a loss?,I am currently heavily invested in NTDOY and d...,0.82,11,0,11
3,t3_p1z7c2,1628632000.0,investing,Intellectual Property and Valuation: Some Thou...,**PREFACE: This is information not advice. I...,0.78,7,0,7
4,t3_p1x594,1628626000.0,investing,Coinbase beats earnings estimates for Q2,&amp;#x200B;\n\n* Eps $6.42 v $2.26 Est\n* Rev...,0.91,278,0,278


In [None]:
# Extract the organizations mentioned in a post: spaCy tags the text and only
# the entities labelled "ORG" are kept.
def get_entities(text:str):
  """
  Return the distinct organization names that spaCy finds in `text`.

  Parameters
  ----------
  text : str
    Body of a post. Anything that is not a non-blank string (for example the
    NaN that pandas produces for a post without a body) is treated as having
    no entities.

  Returns
  -------
  list of str
    Text of every entity labelled "ORG", de-duplicated and kept in order of
    first appearance.
  """
  # an empty selftext is read back by pandas as float NaN, which spaCy cannot
  # parse; blank text cannot contain entities either
  if not isinstance(text, str) or not text.strip():
    return []
  doc = nlp(text)
  # dict.fromkeys drops duplicates while keeping first-seen order, so the
  # result is the same on every run (the iteration order of a set is not)
  return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "ORG"))



In [None]:
# Tag every post: run get_entities over the selftext column and store the
# resulting list of organizations in a new "orgs" column.
df["orgs"] = df["selftext"].apply(get_entities)

In [None]:
# Preview the first rows to check the new "orgs" column.
df.head()

In [None]:
# Save the tagged dataframe as a pipe-delimited CSV.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row index is written as an extra leading column —
# confirm that this is intended.
df.to_csv("reddit_tagging.csv", sep="|")

In [None]:
# Hand the saved CSV to the browser as a download from the Colab runtime.
files.download("reddit_tagging.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# doing the count of what is the most common orgs
from collections import  Counter

In [11]:
def count_function(df, column):
  """
  Tally every organization found in `df[column]`.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame holding the tagged posts.
  column : str
    Name of the column whose cells hold the organizations of one row.

  Returns
  -------
  collections.Counter
    Maps each organization to the number of times it occurs across all cells.

  Notes
  -----
  Cells are normally lists. A cell that is a string is parsed when it is the
  text form of a list (e.g. "['Apple', 'MSFT']", which is what a list becomes
  after being written to CSV) and otherwise counted as one organization,
  instead of being split into single characters. Missing cells (None / NaN)
  are skipped instead of raising.
  """
  import ast  # local import: only needed to parse lists that were stored as text

  c = Counter()
  for cell in df[column]:
    # missing value (None or float NaN, where NaN != NaN): nothing to count
    if cell is None or (isinstance(cell, float) and cell != cell):
      continue
    if isinstance(cell, str):
      stripped = cell.strip()
      if not stripped:
        continue
      try:
        parsed = ast.literal_eval(stripped)
      except (ValueError, TypeError, SyntaxError):
        parsed = stripped
      # anything that is not a collection literal counts as a single name
      cell = list(parsed) if isinstance(parsed, (list, tuple, set)) else [stripped]
    elif not isinstance(cell, list):
      cell = list(cell)
    c.update(cell)
  return c


In [50]:
# Count how often each organization appears in the freshly tagged "orgs" column.
# The "org" column (blacklist-filtered) is only created further down, so asking
# for it here raises a KeyError when the notebook is run from top to bottom.
c = count_function(df, column="orgs")

In [None]:
# The 50 most frequent organizations, used to spot entries worth filtering out.
c.most_common(50)

[('FAQ', 3017),
 ('ETF', 2304),
 ('Fed', 1206),
 ('Amazon', 878),
 ('SEC', 878),
 ('Apple', 877),
 ('EV', 764),
 ('Microsoft', 715),
 ('Intel', 714),
 ('VOO', 713),
 ('Fidelity', 713),
 ('COVID', 660),
 ('TSLA', 658),
 ('QQQ', 658),
 ('MSFT', 605),
 ('DCF', 605),
 ('AMD', 604),
 ('Google', 603),
 ('fed', 603),
 ('Vanguard', 602),
 ('JPM', 495),
 ('EPS', 494),
 ('NASDAQ', 491),
 ('NYSE', 440),
 ('Alibaba', 440),
 ('etf', 440),
 ('CNBC', 439),
 ('treasury', 384),
 ('Treasury', 384),
 ('ROI', 384),
 ('ITM', 382),
 ('AAPL', 330),
 ('Robinhood', 329),
 ('INTC', 329),
 ('IBM', 329),
 ('FDA', 329),
 ('TD Ameritrade', 329),
 ('ARKK', 329),
 ('Yahoo', 329),
 ('FCF', 329),
 ('Nasdaq', 328),
 ('OTM', 327),
 ('SPY', 327),
 ('Reuters', 275),
 ('WSJ', 275),
 ('EU', 275),
 ('CFO', 275),
 ('USD', 275),
 ('GM', 274),
 ('EBITDA', 274)]

In [7]:
# Entity texts (lower-cased) that spaCy labels as ORG but that we do not want
# to keep; get_entities compares the lower-cased entity text against this list.
blackList = [
    "ev",
    "sec",
    "faq",
    "nasdaq",
    "treasury",
    "nyse",
    "fda",
    "etf",
    "vanguard",
    "fidelity",
    "roi",
    "fed",
    "td ameritrade",
    "robinhood",
    "cnbc",
    "eu",
    "reuters",
]

In [8]:
# Redefinition of get_entities that additionally drops every organization whose
# lower-cased text appears in blackList.
def get_entities(text:str):
  """
  Return the distinct, non-blacklisted organization names found in `text`.

  Parameters
  ----------
  text : str
    Body of a post. Anything that is not a non-blank string (for example the
    NaN that pandas produces for a post without a body) is treated as having
    no entities.

  Returns
  -------
  list of str
    Text of every entity labelled "ORG" whose lower-cased text is not in
    `blackList`, de-duplicated and kept in order of first appearance.
  """
  # an empty selftext is read back by pandas as float NaN, which spaCy cannot
  # parse; blank text cannot contain entities either
  if not isinstance(text, str) or not text.strip():
    return []
  doc = nlp(text)
  kept = (
      ent.text
      for ent in doc.ents
      if ent.label_ == "ORG" and ent.text.lower() not in blackList
  )
  # dict.fromkeys drops duplicates while keeping first-seen order, so the
  # result is the same on every run (the iteration order of a set is not)
  return list(dict.fromkeys(kept))


In [9]:
# Re-tag every post with the blacklist-aware get_entities and store the result
# in the "org" column.
df["org"] = df['selftext'].apply(get_entities)

In [12]:
# Count the organizations again, this time on the blacklist-filtered "org" column.
c = count_function(df, column="org")

In [13]:
# The 50 most frequent organizations that remain after blacklist filtering.
c.most_common(50)

[('Apple', 81),
 ('Amazon', 72),
 ('Microsoft', 63),
 ('MSFT', 62),
 ('Google', 61),
 ('DCF', 57),
 ('Intel', 57),
 ('COVID', 57),
 ('VOO', 54),
 ('AMD', 50),
 ('TSLA', 50),
 ('QQQ', 47),
 ('JPM', 40),
 ('AAPL', 39),
 ('EPS', 39),
 ('Alibaba', 39),
 ('Tesla', 29),
 ('EBITDA', 28),
 ('SCHD', 25),
 ('CFO', 25),
 ('ARKK', 25),
 ('AMC', 24),
 ('ADR', 24),
 ('Samsung', 24),
 ('YOY', 24),
 ('WSJ', 24),
 ('Yahoo', 24),
 ('ITM', 24),
 ('NVDA', 23),
 ('GM', 23),
 ('IBM', 23),
 ('USD', 23),
 ('FCF', 23),
 ('INTC', 22),
 ('Bank of America', 22),
 ('Facebook', 20),
 ('Congress', 20),
 ('ATH', 20),
 ('VXUS', 20),
 ('the New York Stock Exchange', 20),
 ('PEG', 20),
 ('Chase', 20),
 ('Summary', 20),
 ('DIDI', 20),
 ('Didi', 20),
 ('TSMC', 20),
 ('VYM', 20),
 ('the Federal Reserve', 20),
 ('IP', 19),
 ('GAAP', 19)]

In [14]:
# Install the flair library, which is used below to score the sentiment of each post.
# NOTE(review): pandas is already imported in the first cell, so this re-import
# is redundant but harmless.
import pandas as pd
! pip install flair -q

[K     |████████████████████████████████| 284 kB 27.1 MB/s 
[K     |████████████████████████████████| 981 kB 49.3 MB/s 
[K     |████████████████████████████████| 776.8 MB 15 kB/s 
[K     |████████████████████████████████| 19.7 MB 39 kB/s 
[K     |████████████████████████████████| 1.2 MB 53.0 MB/s 
[K     |████████████████████████████████| 64 kB 2.7 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 788 kB 66.4 MB/s 
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[K     |████████████████████████████████| 2.6 MB 44.8 MB/s 
[K     |████████████████████████████████| 62 kB 830 kB/s 
[K     |████████████████████████████████| 895 kB 53.9 MB/s 
[K     |████████████████████████████████| 636 kB 70.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 43.7 MB/s 
[?25h  Building wheel for gdown (PEP 517) ...

In [15]:
import flair

In [16]:
# Load flair's pre-trained "en-sentiment" text classifier; the log below shows
# the weights being downloaded to the local cache on first use.
model = flair.models.TextClassifier.load("en-sentiment")

2021-08-12 02:07:33,541 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpu5j9cnsd


100%|██████████| 265512723/265512723 [00:09<00:00, 27742935.51B/s]

2021-08-12 02:07:43,590 copying /tmp/tmpu5j9cnsd to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2021-08-12 02:07:44,078 removing temp file /tmp/tmpu5j9cnsd
2021-08-12 02:07:44,118 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




In [17]:
def get_sentiment(text):
  """
  Classify `text` with the notebook-level flair `model`.

  The text is wrapped in a flair Sentence, the classifier is run on that
  sentence, and the first label found on the sentence afterwards is returned.
  """
  tagged = flair.data.Sentence(text)
  model.predict(tagged)
  return tagged.labels[0]

In [18]:
# Create the sentiment column: one flair label per post, computed from its selftext.
df["sentiment"] = df["selftext"].apply(get_sentiment)

In [19]:
# Preview the first rows to check the new "org" and "sentiment" columns.
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,org,sentiment
0,t3_p220ix,1628642000.0,investing,Is it a pragmatic decision to allocate a certa...,My total net worth across all of my accounts i...,0.91,9,0,9,"[ETH, Target, BTC]",NEGATIVE (0.9982)
1,t3_p2131v,1628638000.0,investing,"FGI, Steady Downtrend Since December (Peak 92 ...","Not making any statements, assumptions or pred...",0.5,0,0,0,"[FGI, CNN Money]",NEGATIVE (0.9937)
2,t3_p2080p,1628636000.0,investing,Nintendo - hold or sell currently at a loss?,I am currently heavily invested in NTDOY and d...,0.82,11,0,11,[Nintendo World],NEGATIVE (1.0)
3,t3_p1z7c2,1628632000.0,investing,Intellectual Property and Valuation: Some Thou...,**PREFACE: This is information not advice. I...,0.78,7,0,7,"[AAPL, Facebook, the IP &amp, FAANG, Operation...",NEGATIVE (0.9998)
4,t3_p1x594,1628626000.0,investing,Coinbase beats earnings estimates for Q2,&amp;#x200B;\n\n* Eps $6.42 v $2.26 Est\n* Rev...,0.91,278,0,278,"[Verified Users, Transacting Users, Eps, &amp;...",NEGATIVE (0.9799)


In [25]:
# Check what the two new columns hold: per the output, "org" cells are Python
# lists and "sentiment" cells are flair.data.Label objects.
print(type(df['org'].iloc[0]))
type(df['sentiment'].iloc[0])

<class 'list'>


flair.data.Label

In [23]:
# Save the tagged dataframe as a pipe-delimited CSV (without the row index) so
# it can be downloaded.
# NOTE(review): the "org" lists and "sentiment" Label objects are presumably
# written as their text form, so they will not come back as objects when the
# file is read again — confirm that downstream code expects that.
df.to_csv("reddit.csv", sep="|", index=False)

In [24]:
# Hand the saved CSV to the browser as a download from the Colab runtime.
files.download("reddit.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
# A flair Label exposes its class name through .value (here 'NEGATIVE').
df["sentiment"].iloc[0].value

'NEGATIVE'

In [33]:
# Group the sentiment scores by organization: every organization tagged in a
# post receives that post's score, filed under the post's label
# ("POSITIVE" or "NEGATIVE").

sentiment = {}
# leftover row counter from debugging; nothing in this notebook reads it
myCounter = 0
# Select the two columns by name rather than by tuple position: positions shift
# as soon as another column (such as the earlier "orgs") is present in df, and
# the loop would then be handed the wrong column.
for org_list, sent in zip(df["org"], df["sentiment"]):
  pos_neg = sent.value  # label name, used as the key of the score bucket
  score = sent.score    # numeric score that flair attached to the label

  for org in org_list:
    # first time this organization is seen: start with two empty score lists
    bucket = sentiment.setdefault(org, {"POSITIVE": [], "NEGATIVE": []})
    bucket[pos_neg].append(score)
   
  

In [35]:
# Spot-check the scores collected for one organization.
# NOTE(review): the same three scores repeat in the output below, which
# suggests the dataframe contains duplicated posts — confirm and de-duplicate
# upstream if so.
sentiment["ARK"]

{'NEGATIVE': [],
 'POSITIVE': [0.89322829246521,
  0.9906964302062988,
  0.9274241328239441,
  0.89322829246521,
  0.9906964302062988,
  0.9274241328239441,
  0.89322829246521,
  0.9906964302062988,
  0.9274241328239441,
  0.89322829246521,
  0.9906964302062988,
  0.9274241328239441,
  0.89322829246521]}

In [37]:
# Sanity check: sum() of an empty list is 0, so summing an organization's empty
# POSITIVE or NEGATIVE score list is safe.
theList = []
sum(theList)

0

In [39]:
# Summarise the collected scores per organization. Every summary row holds:
#   org       - the organization text
#   frequency - how many scores (positive + negative) were recorded
#   avg       - (sum of positive scores - sum of negative scores) / frequency
#   negative  - mean of the negative scores, None when there are none
#   positive  - mean of the positive scores, None when there are none

def _summarise_org(org, scores):
  """Build the summary row for one organization from its two score lists."""
  pos_scores, neg_scores = scores["POSITIVE"], scores["NEGATIVE"]
  pos_n, neg_n = len(pos_scores), len(neg_scores)
  pos_sum, neg_sum = sum(pos_scores), sum(neg_scores)
  n = pos_n + neg_n
  # a zero count yields None instead of a division by zero
  return {
      "org": org,
      "frequency": n,
      "avg": None if n == 0 else (pos_sum - neg_sum) / n,
      "negative": None if neg_n == 0 else neg_sum / neg_n,
      "positive": None if pos_n == 0 else pos_sum / pos_n,
  }

# one summary dict per organization, in the insertion order of `sentiment`
avg_ent = [_summarise_org(org, scores) for org, scores in sentiment.items()]


In [40]:
# Number of distinct organizations that received at least one score.
len(avg_ent)

1758

In [41]:
# Turn the list of per-organization summary dicts into a dataframe
# (one row per organization, one column per dict key).
sentiment_df = pd.DataFrame(avg_ent)

In [42]:
# Preview the per-organization summary.
sentiment_df.head()

Unnamed: 0,org,frequency,avg,negative,positive
0,ETH,5,-0.998164,0.998164,
1,Target,5,-0.998164,0.998164,
2,BTC,14,-0.427893,0.995329,0.990696
3,FGI,5,-0.993744,0.993744,
4,CNN Money,5,-0.993744,0.993744,


In [43]:
# Save the full, unfiltered summary as a pipe-delimited CSV without the row index.
sentiment_df.to_csv("sentiment_full.csv", sep="|", index=False)

In [44]:
# Keep only the organizations with more than 3 recorded scores
# (the filter is `> 3`, so a frequency of exactly 3 is dropped as well).
sentiment_df = sentiment_df[sentiment_df['frequency'] > 3]

In [45]:
# Preview the summary after the frequency filter.
sentiment_df.head()

Unnamed: 0,org,frequency,avg,negative,positive
0,ETH,5,-0.998164,0.998164,
1,Target,5,-0.998164,0.998164,
2,BTC,14,-0.427893,0.995329,0.990696
3,FGI,5,-0.993744,0.993744,
4,CNN Money,5,-0.993744,0.993744,


In [46]:
# Organizations with the highest average score (most positive first).
sentiment_df.sort_values("avg", ascending=False).head()

Unnamed: 0,org,frequency,avg,negative,positive
129,"Roku,",5,0.999457,,0.999457
128,delivered](https://image.roku.com/c3VwcG9ydC1B...,5,0.999457,,0.999457
1094,Collective Mining,5,0.999451,,0.999451
1727,DIVO,4,0.99942,,0.99942
1726,RYLD,4,0.99942,,0.99942


In [47]:
# Organizations with the lowest average score (most negative first).
sentiment_df.sort_values("avg", ascending=True).head()

Unnamed: 0,org,frequency,avg,negative,positive
183,ABBV,5,-0.999998,0.999998,
280,EMB,5,-0.999996,0.999996,
130,Vanguard LifeStrategy Growth Fund Investor,5,-0.999995,0.999995,
81,MRNA,5,-0.999995,0.999995,
82,Pfizer,5,-0.999995,0.999995,


In [48]:
# Save the frequency-filtered summary as a pipe-delimited CSV without the row index.
sentiment_df.to_csv("sentiment.csv", sep="|", index=False)