In [49]:
import pandas as pd
pd.options.display.max_rows = 999

In [48]:
news_info = [
    ("Number of deaths in USA have decreased", True),
    ("Number of deaths in USA have increased", False),
    ("Floyd was killed last month", False),
    ("People protest for justice", None), #True?
    ("USA won the last game", True),
    ("The food at Radison is very good", True),
    ("George Floyd's loved ones say they hope his funeral is only the beginning of widespread change", None), #True?
    ("What to Know About Calls to Defund the Police in California", False),
    ("19 states see rising coronavirus cases and Arizona is asking its hospitals to activate emergency plans", False),
    ("‘Blood in the water’: Dems get unexpected opening against Trump in Iowa", False),
    ("Long lines in Georgia, Lindsey Graham wards off GOP rivals in SC and other takeaways from Tuesday's primaries", False),
    ("All Black Lives Matter march will take place June 14, without L.A. Pride’s involvement", None), #True?
    ("One blood type may provide some coronavirus protection, study suggests", True),
    ("Watch Tesla Model S P100D Race Ferrari F12: Laughable Results", None), #False
    ("Harvard researcher says the most emotionally intelligent people have these 12 traits. Which do you have?", None), #False
    ("Judge issues order halting Lee statue removal for 10 days", None), #True
    ("Northern Virginia to ease coronavirus restrictions Friday; Northam says students will return to school this fall", True),
    ("Trump was talked out of firing Esper last week: report", None), #True
    ("Human remains found at property tied to Chad Daybell, Lori Vallow and missing kids", False),
    ("North Korean leader's sister emerges as policymaker in spat with South Korea", False),
    ("Banksy proposes a new statue that will make 'everyone happy'", True),
    ("Russia will open nuclear disarmament talks with US", True),
    ("George Floyd: Twitter to make Juneteenth a company holiday", True),
    ("Google Meet’s background noise cancellation feature is pretty amazing", True),
    ("‘Vanderpump Rules’ Fires Stassi Schroeder and Kristen Doute For Racist Actions", False),
    ("NASA astronaut from historic spacewalk becomes first woman to reach deepest point in ocean", True),
    ("Americans misusing bleach to fight coronavirus, CDC says", False),
    ("WHO’s chief scientist says there’s a ‘very real risk’ of a second wave of coronavirus as economies reopen", False),
    ("Anthony Fauci warns that ‘nightmare’ pandemic isn’t close to over", False),
    ("Anthony Fauci says that pandemic is close to over", True),
    ("Governament reopens schools", True)
]

In [3]:
news = [cur_news_info[0] for cur_news_info in news_info]
news[0:2]

['Number of deaths in USA have decreased',
 'Number of deaths in USA have increased']

In [4]:
expected = [cur_news_info[1] for cur_news_info in news_info]
expected[0:2]

[True, False]

# Text Blob

In [5]:
from textblob import TextBlob

In [6]:
received_textblob = [TextBlob(text).sentiment for text in news]

In [7]:
df = pd.DataFrame(list(zip(news, expected, received_textblob)), columns=['News', 'Expected', 'TextBlob: Polarity / Subjectivity'])

In [10]:
df.head()

Unnamed: 0,News,Expected,TextBlob: Polarity / Subjectivity
0,Number of deaths in USA have decreased,True,"(-0.4, 0.7)"
1,Number of deaths in USA have increased,False,"(0.0, 0.0)"
2,Floyd was killed last month,False,"(-0.1, 0.03333333333333333)"
3,People protest for justice,,"(0.0, 0.0)"
4,USA won the last game,True,"(-0.2, 0.23333333333333334)"


In [11]:
df['blob_min_0'] = df.apply(lambda x: x['TextBlob: Polarity / Subjectivity'][0]>0 and x['TextBlob: Polarity / Subjectivity'][1]<0.8,axis=1)
df['blob_min_0.25'] = df.apply(lambda x: x['TextBlob: Polarity / Subjectivity'][0]>0.25 and x['TextBlob: Polarity / Subjectivity'][1]<0.8,axis=1)
df['blob_min_0.15'] = df.apply(lambda x: x['TextBlob: Polarity / Subjectivity'][0]>0.15 and x['TextBlob: Polarity / Subjectivity'][1]<0.8,axis=1)

In [12]:
df['all'] = 1
df.head()

Unnamed: 0,News,Expected,TextBlob: Polarity / Subjectivity,blob_min_0,blob_min_0.25,blob_min_0.15,all
0,Number of deaths in USA have decreased,True,"(-0.4, 0.7)",False,False,False,1
1,Number of deaths in USA have increased,False,"(0.0, 0.0)",False,False,False,1
2,Floyd was killed last month,False,"(-0.1, 0.03333333333333333)",False,False,False,1
3,People protest for justice,,"(0.0, 0.0)",False,False,False,1
4,USA won the last game,True,"(-0.2, 0.23333333333333334)",False,False,False,1


In [13]:
df.groupby(['Expected', 'blob_min_0'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,blob_min_0,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,9
True,True,3


In [14]:
df[(df['Expected']==False) & (df['blob_min_0']==True)]

Unnamed: 0,News,Expected,TextBlob: Polarity / Subjectivity,blob_min_0,blob_min_0.25,blob_min_0.15,all
27,WHO’s chief scientist says there’s a ‘very rea...,False,"(0.13, 0.19500000000000003)",True,False,False,1


In [15]:
df.groupby(['Expected', 'blob_min_0.25'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,blob_min_0.25,Unnamed: 2_level_1
False,False,12
True,False,10
True,True,2


In [16]:
df.groupby(['Expected', 'blob_min_0.15'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,blob_min_0.15,Unnamed: 2_level_1
False,False,12
True,False,10
True,True,2


# NLP

In [17]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [19]:
sia = SIA()
pol_score = [sia.polarity_scores(text) for text in news]
df['ntl'] = [elem for elem in pol_score]

In [22]:
pol_score[0:5]

[{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'compound': 0.0772},
 {'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'compound': 0.34},
 {'neg': 0.529, 'neu': 0.471, 'pos': 0.0, 'compound': -0.6705},
 {'neg': 0.27, 'neu': 0.27, 'pos': 0.459, 'compound': 0.34},
 {'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'compound': 0.5719}]

In [34]:
df['ntl_min_0'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.15'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.25'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.30'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.40'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.50'] = [elem['pos']>0 for elem in pol_score]
df['ntl_min_0.60'] = [elem['pos']>0 for elem in pol_score]
df.head(1)
# WHATS THAT? x.compound[0]

Unnamed: 0,News,Expected,TextBlob: Polarity / Subjectivity,blob_min_0,blob_min_0.25,blob_min_0.15,all,ntl_min_0,ntl,ntl_min_0.15,ntl_min_0.25,ntl_min_0.30,ntl_min_0.40,ntl_min_0.50,ntl_min_0.60
0,Number of deaths in USA have decreased,True,"(-0.4, 0.7)",False,False,False,1,True,"{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",True,True,True,True,True,True


In [35]:
df.groupby(['Expected', 'ntl_min_0'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [36]:
df.groupby(['Expected', 'ntl_min_0.15'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.15,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [37]:
df.groupby(['Expected', 'ntl_min_0.25'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.25,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [38]:
df.groupby(['Expected', 'ntl_min_0.30'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.30,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [39]:
df.groupby(['Expected', 'ntl_min_0.40'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.40,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [40]:
df.groupby(['Expected', 'ntl_min_0.50'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.50,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [41]:
df.groupby(['Expected', 'ntl_min_0.60'])[['all']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,all
Expected,ntl_min_0.60,Unnamed: 2_level_1
False,False,11
False,True,1
True,False,4
True,True,8


In [43]:
df[(df['Expected']==False) & (df['ntl_min_0']==True)]['News']

1    Number of deaths in USA have increased
Name: News, dtype: object

# spaCy

# Comparison
Pros and cons: https://www.softkraft.co/python-nlp-libraries-features-us-cases-pros-and-cons/
#### Summary
- NTLK: supports more languages and most famous NLP library
- TextBlob: great library for getting started
- spaCy: Lightning-fast and Gets Things Done!
- Pattern: data mining, scraping, NLP, ML, ("All-in-One")

ps: Gensim = to use NTLK in cluster (and very fast)

In [50]:
df

Unnamed: 0,News,Expected,TextBlob: Polarity / Subjectivity,blob_min_0,blob_min_0.25,blob_min_0.15,all,ntl_min_0,ntl,ntl_min_0.15,ntl_min_0.25,ntl_min_0.30,ntl_min_0.40,ntl_min_0.50,ntl_min_0.60
0,Number of deaths in USA have decreased,True,"(-0.4, 0.7)",False,False,False,1,True,"{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",True,True,True,True,True,True
1,Number of deaths in USA have increased,False,"(0.0, 0.0)",False,False,False,1,True,"{'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'comp...",True,True,True,True,True,True
2,Floyd was killed last month,False,"(-0.1, 0.03333333333333333)",False,False,False,1,False,"{'neg': 0.529, 'neu': 0.471, 'pos': 0.0, 'comp...",False,False,False,False,False,False
3,People protest for justice,,"(0.0, 0.0)",False,False,False,1,True,"{'neg': 0.27, 'neu': 0.27, 'pos': 0.459, 'comp...",True,True,True,True,True,True
4,USA won the last game,True,"(-0.2, 0.23333333333333334)",False,False,False,1,True,"{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'comp...",True,True,True,True,True,True
5,The food at Radison is very good,True,"(0.9099999999999999, 0.7800000000000001)",True,True,True,1,True,"{'neg': 0.0, 'neu': 0.653, 'pos': 0.347, 'comp...",True,True,True,True,True,True
6,George Floyd's loved ones say they hope his fu...,,"(0.35, 0.9)",False,False,False,1,True,"{'neg': 0.112, 'neu': 0.583, 'pos': 0.305, 'co...",True,True,True,True,True,True
7,What to Know About Calls to Defund the Police ...,False,"(0.0, 0.0)",False,False,False,1,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",False,False,False,False,False,False
8,19 states see rising coronavirus cases and Ari...,False,"(0.0, 0.0)",False,False,False,1,False,"{'neg': 0.148, 'neu': 0.852, 'pos': 0.0, 'comp...",False,False,False,False,False,False
9,‘Blood in the water’: Dems get unexpected open...,False,"(0.1, 1.0)",False,False,False,1,False,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",False,False,False,False,False,False


In [60]:
df.groupby('Expected')[['blob_min_0']].mean()

Unnamed: 0_level_0,blob_min_0
Expected,Unnamed: 1_level_1
False,0.083333
True,0.25


In [61]:
df.groupby('Expected')[['ntl_min_0.30']].mean()

Unnamed: 0_level_0,ntl_min_0.30
Expected,Unnamed: 1_level_1
False,0.083333
True,0.666667


# Training my Own Sentiment Analyser with the best library
Using the library with most false negatives and most true positives.