In [1]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell changes into
# "/content/drive/My Drive" and reads/writes project files from there.
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import bz2
import json
import nltk

Mounted at /content/drive
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Work from the Drive root so the relative 'ada_project/...' paths used by
# later cells resolve; `!ls` then lists the directory contents.
%cd "/content/drive/My Drive"
!ls

/content/drive/My Drive
 ada_project	    Doomed-to-learn   Quotebank
'Colab Notebooks'   lego_project      Routine.xlsx


In [3]:
def create_dataframe_from_json_bz2(path_file):
    """Load a bz2-compressed JSON Lines file into a pandas DataFrame.

    Parameters
    ----------
    path_file : str or path-like
        Location of the bz2-compressed file; each line holds one JSON record.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line found in the file.
    """
    # Decompress on the fly in binary mode and let pandas parse the content
    # as JSON Lines (`lines=True`). The frame is fully built before the
    # context manager closes the handle.
    with bz2.open(path_file, 'rb') as compressed_stream:
        return pd.read_json(compressed_stream, lines=True)

In [4]:
# Years for which a filtered China-trade quote dump is loaded.
years = ['2018', '2019', '2020']
# Build each yearly file path from the template, load it, and collect the
# resulting frames in year order.
data = list(map(
    create_dataframe_from_json_bz2,
    map('ada_project/quotes-{}-china_trade.json.bz2'.format, years),
))
# Stack the yearly frames into one table with a fresh 0..n-1 index.
df = pd.concat(data, ignore_index=True)
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2018-04-13-001441,A Digital Free Trade Zone between China and Ma...,Peter Wong,"[Q7177768, Q7177774, Q7177776, Q9456225]",2018-04-13 19:30:55,1,"[[Peter Wong, 0.782], [None, 0.218]]",[http://nst.com.my/business/2018/04/356893/chi...,E
1,2018-03-23-003097,A rough week for the markets... as fears of a ...,,[],2018-03-23 10:28:51,2,"[[None, 0.9112], [President Donald Trump, 0.08...",[http://www.breitbart.com/news/world-stock-mar...,E
2,2018-04-08-011525,"Every day of the week China, comes into our ho...",Peter Navarro,[Q7176052],2018-04-08 04:00:00,17,"[[Peter Navarro, 0.6696], [None, 0.208], [LARR...",[http://dailylocal.com/general-news/20180408/a...,E
3,2018-05-14-023366,For the President to become suddenly concerned...,Jonathan Fenby,[Q15072639],2018-05-14 20:43:32,2,"[[Jonathan Fenby, 0.8834], [None, 0.1166]]",[https://www.fxstreet.com/news/wall-street-dow...,E
4,2018-11-29-030995,"he would be able to strike a chord with Xi, wh...",A. Khan,[Q54946635],2018-11-29 07:11:20,1,"[[A. Khan, 0.6289], [Narendra Modi, 0.204], [N...",[http://asia.nikkei.com/Politics/International...,E


In [6]:
# Remove the qids/probas/urls/phase columns. Rebinding `df` (instead of
# `inplace=True`) together with `errors='ignore'` lets this cell be re-run
# safely after the columns have already been removed.
df = df.drop(columns=['qids', 'probas', 'urls', 'phase'], errors='ignore')

# Render the timestamp as a day/month/year string. The dtype guard skips the
# conversion on a re-run, when `date` already holds strings and the `.dt`
# accessor would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = df['date'].dt.strftime('%d/%m/%Y')
df.head()

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences
0,2018-04-13-001441,A Digital Free Trade Zone between China and Ma...,Peter Wong,13/04/2018,1
1,2018-03-23-003097,A rough week for the markets... as fears of a ...,,23/03/2018,2
2,2018-04-08-011525,"Every day of the week China, comes into our ho...",Peter Navarro,08/04/2018,17
3,2018-05-14-023366,For the President to become suddenly concerned...,Jonathan Fenby,14/05/2018,2
4,2018-11-29-030995,"he would be able to strike a chord with Xi, wh...",A. Khan,29/11/2018,1


In [7]:
# Download the "vader_lexicon" NLTK resource before the analyzer is created.
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level analyzer instance; compute_vader_sentiment reads this global.
analyzer = SentimentIntensityAnalyzer()

def compute_vader_sentiment(quotation):
    """Return a binary polarity label for a quotation from its VADER scores.

    Parameters
    ----------
    quotation : str
        Text passed to the module-level ``analyzer.polarity_scores``.

    Returns
    -------
    int
        -1 when the "neg" score is strictly greater than the "pos" score,
        otherwise 1. Ties (including both scores being equal) yield 1.
    """
    polarity = analyzer.polarity_scores(quotation)
    leans_negative = polarity["neg"] > polarity["pos"]
    return -1 if leans_negative else 1

# Label every quotation with its VADER polarity (-1 or 1) in a new column;
# the function is passed directly, no lambda wrapper needed.
df['vader_sentiment'] = df['quotation'].apply(compute_vader_sentiment)
df.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,vader_sentiment
0,2018-04-13-001441,A Digital Free Trade Zone between China and Ma...,Peter Wong,13/04/2018,1,1
1,2018-03-23-003097,A rough week for the markets... as fears of a ...,,23/03/2018,2,-1
2,2018-04-08-011525,"Every day of the week China, comes into our ho...",Peter Navarro,08/04/2018,17,1
3,2018-05-14-023366,For the President to become suddenly concerned...,Jonathan Fenby,14/05/2018,2,1
4,2018-11-29-030995,"he would be able to strike a chord with Xi, wh...",A. Khan,29/11/2018,1,-1


In [8]:
# Write the labelled quotes as JSON Lines: one record (row) per line.
df.to_json(
    'ada_project/china_trade_vader_sentiment.json',
    orient='records',
    lines=True,
)