# Import packages

1) Install the following:
- tqdm
- spaCy
- vaderSentiment
- spacytextblob
- python -m spacy download en_core_web_sm

These should all be installed with the bash script that you attached when starting the app in UCloud.

In [3]:
# Data analysis
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm

# NLP: load spaCy's small English pipeline; `nlp` is reused by the cells below
import spacy
nlp = spacy.load("en_core_web_sm")

# sentiment analysis with VADER: one shared analyzer instance for all headlines
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# sentiment with spacyTextBlob: add the 'spacytextblob' component to the pipeline
# so that the TextBlob results can be read from `doc._.blob` later on
# NOTE(review): SpacyTextBlob is never referenced by name in this notebook; the
# import is presumably what registers the component with spaCy - confirm before removing
from spacytextblob.spacytextblob import SpacyTextBlob
nlp.add_pipe('spacytextblob')

# visualisations
import matplotlib.pyplot as plt

# Load the data

We're going to use the "fake news" dataset in the shared drive today.

In [12]:
# Relative location of the fake-news CSV on the shared drive:
# three directory levels up, then CDS-LANG/tabular_examples.
filepath = os.path.join(
    os.pardir, os.pardir, os.pardir,
    'CDS-LANG', 'tabular_examples', 'fake_or_real_news.csv',
)

In [17]:
# load the data: read the CSV at `filepath` into a pandas DataFrame
data = pd.read_csv(filepath)

In [18]:
data

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


# Sentiment Analysis with VADER

Let's look at the text at index 3 in our data - we'll just work with the headlines, rather than the full articles.

In [22]:
print(data['title'][3])

Bernie supporters on Twitter erupt in anger against the DNC: 'We tried to warn you!'


In [24]:
# keep the example headline in a variable and score it with VADER;
# the result is a dict with 'neg', 'neu', 'pos' and 'compound' entries
text_example = data['title'][3]
analyzer.polarity_scores(text_example)

{'neg': 0.266, 'neu': 0.591, 'pos': 0.143, 'compound': -0.3595}

In [23]:
# get sentiment scores with VADER
analyzer.polarity_scores(data['title'][3])

{'neg': 0.266, 'neu': 0.591, 'pos': 0.143, 'compound': -0.3595}

In [27]:
# get VADER scores for all headlines
# polarity_scores returns a dict with the keys 'neg', 'neu', 'pos' and
# 'compound', so this builds one dict per headline, in headline order.
vader_score = [analyzer.polarity_scores(headline) for headline in data['title']]

# Store the score dicts next to the headlines they belong to.
# The DataFrame in this notebook is called `data`; the previous `df` was never
# defined and raised a NameError on a fresh kernel.
data['vader_sentiment'] = vader_score

In [29]:
# create a dataframe with one row per headline and one column per VADER score
vader_df = pd.DataFrame(vader_score, columns = ['neg','neu','pos','compound'])

In [30]:
# display the dataframe
vader_df

Unnamed: 0,neg,neu,pos,compound
0,0.444,0.556,0.000,-0.4939
1,0.242,0.645,0.113,-0.5267
2,0.000,0.762,0.238,0.3612
3,0.266,0.591,0.143,-0.3595
4,0.243,0.654,0.103,-0.3612
...,...,...,...,...
6330,0.000,1.000,0.000,0.0000
6331,0.000,1.000,0.000,0.0000
6332,0.192,0.808,0.000,-0.2263
6333,0.000,0.479,0.521,0.8271


In [32]:
# summary statistics of the compound score across all headlines
vader_df['compound'].describe()

count    6335.000000
mean       -0.086533
std         0.384484
min        -0.943200
25%        -0.381800
50%         0.000000
75%         0.000000
max         0.977600
Name: compound, dtype: float64

# Sentiment analysis with ```spaCyTextBlob```

A slightly different approach uses ```TextBlob``` via ```spaCy``` to do the sentiment analysis. To do this, we first need to use our ```nlp()``` pipeline to create a ```Doc``` for each headline. The sentiment scores can then be found as attributes of each ```Doc```.

In [35]:
# get scores for one doc
# Run the example headline (`text_example`, i.e. data['title'][3]) through the
# spaCy pipeline; the TextBlob results are then read from `test_doc._.blob`.
# The earlier `nlp(df['title'][3])` line was removed: `df` is not defined in
# this notebook and its result was overwritten immediately anyway.
test_doc = nlp(text_example)

In [36]:
test_doc

Bernie supporters on Twitter erupt in anger against the DNC: 'We tried to warn you!'

In [37]:
test_doc._.blob.polarity

-0.875

In [38]:
test_doc._.blob.subjectivity # goes from 0 (objective) to 1 (subjective)

0.2

In [41]:
test_doc._.blob.sentiment_assessments.assessments

[(['anger', '!'], -0.875, 0.2, None)]

In [42]:
# get polarity scores for all headlines
polarity_score = []

for headline in data['title']:
    doc = nlp(headline) # slow, because the pipeline runs on one headline at a time
    # append to the list initialised above (the old code appended to the
    # undefined name `blob_score`, which raised a NameError)
    polarity_score.append(doc._.blob.polarity)
    

In [46]:
# subjectivity of every headline, collected in a single expression
blob_sub_score = [nlp(title)._.blob.subjectivity for title in data['title']]
    

##### A better way of doing this with spacy

In [47]:
# only do this nlp thing once: run every headline through the pipeline in a
# single nlp.pipe call and keep the resulting Docs in a list
docs = list(nlp.pipe(data.title))

# notes: according to the spaCy documentation, nlp.pipe yields the Docs in the same
# order as the input texts, so the i-th Doc in `docs` belongs to the i-th headline.

In [52]:
# Read both TextBlob scores from every pre-computed Doc in one pass.
# Each pair is stored as a (polarity, subjectivity) tuple - tuples are
# immutable, so a pair cannot be altered by accident later on.
pol_subj = [(doc._.blob.polarity, doc._.blob.subjectivity) for doc in docs]

# The two flat lists are derived from the pairs, in the same order.
polarity_score = [polarity for polarity, subjectivity in pol_subj]
subjectivity_score = [subjectivity for polarity, subjectivity in pol_subj]

In [53]:
# one row per headline: TextBlob polarity and subjectivity side by side
textblob_df = pd.DataFrame(pol_subj, columns = ['polarity','subjectivity'])

In [54]:
textblob_df

Unnamed: 0,polarity,subjectivity
0,0.000000,0.000000
1,0.125000,0.175000
2,0.000000,0.000000
3,-0.875000,0.200000
4,0.268182,0.477273
...,...,...
6330,0.000000,0.000000
6331,0.000000,0.000000
6332,0.000000,0.000000
6333,0.000000,0.000000


# Doing NER with ```spaCy```

We can then find every individual occurrence of some kind of named entity. Note that this returns all named entities, regardless of type.

We can also iterate through the full data set and get similar results for every headline.

In [77]:
test_string = 'hi, i am ross. i am 100 years old and come from SCotlAnd'

In [78]:
doc = nlp(test_string)

In [79]:
# print each named entity found in the test sentence together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

ross PERSON
100 years old DATE
SCotlAnd GPE


In [80]:
# Collect every entity labelled 'GPE' that spaCy finds in the headlines,
# stored as (text, label) tuples.
ents = []

# nlp.pipe returns a generator, so tqdm cannot work out its length by itself;
# passing `total` gives a real progress bar with an ETA instead of a bare counter.
headline_docs = nlp.pipe(data['title'], batch_size = 500)
for headline in tqdm(headline_docs, total = len(data['title'])): # use tqdm to monitor the time it takes
    for entity in headline.ents:
        if entity.label_ == 'GPE':
            ents.append((entity.text, entity.label_))
    
    

6335it [00:09, 702.66it/s] 


In [82]:
# one row per 'GPE' mention collected from the headlines
ents_df = pd.DataFrame(ents, columns = ['location', 'label'] )

In [84]:
ents_df['location'].value_counts()

US                 164
Obama              141
Russia             116
Iran               104
America             85
                  ... 
New York Debate      1
Guatemala            1
Iceland              1
Washington DC        1
East Africa          1
Name: location, Length: 338, dtype: int64

## We can also work with a longer text such as a novel, rather than a number of short texts.

In [None]:
# get filename
# NOTE(review): the folder is '100_novels' but the file name is still
# 'fake_or_real_news.csv' from the news-data cell - presumably a copy-paste
# placeholder; replace it with the novel's actual file name. TODO confirm
# This also reassigns `filepath`, which previously pointed at the news CSV.
filepath = os.path.join('..','..','..','CDS-LANG', '100_novels', 'fake_or_real_news.csv')

In [None]:
# load 


In [None]:
# get spacy Doc


In [None]:
# sentence tokenization


# Plotting sentiment over time

In [None]:
# plot polarity


In [None]:
# smooth with a rolling window


In [None]:
# plot the results
