In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/Colab Notebooks/LIPNLP

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/LIPNLP


In [2]:
import pandas as pd
import numpy as np
import os
import string

import matplotlib.pyplot as plt
plt.style.use('seaborn')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




Loading news dataset,

In [4]:
news = pd.read_csv('news/apple_news.csv')

#replace NAN with 'Hello World'
news = news.replace(np.nan, 'HELLO WORLD', regex=True)

#convert to datetime
news['date'] = pd.to_datetime(news['date'])
news.head()

Unnamed: 0,date,news
0,2006-12-01,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...
1,2006-12-04,More on Housing Prices : The broadest governme...
2,2006-12-05,HELLO WORLD
3,2006-12-06,Honoring R.W. Apple in Words and Food : About ...
4,2006-12-07,"Homebuilders, and Worries Over Jobs, Lead a De..."


In [5]:
#nltk sentiment analyser
sia = SentimentIntensityAnalyzer()

#perform sentiment analysis and get it into a list
sentiment = [sia.polarity_scores(line) for line in news['news']]

#filter the compound values only
news['polarity'] = [sentiment[r]['compound'] for r in range(len(sentiment))]
news.head()

Unnamed: 0,date,news,polarity
0,2006-12-01,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...,0.7707
1,2006-12-04,More on Housing Prices : The broadest governme...,0.872
2,2006-12-05,HELLO WORLD,0.0
3,2006-12-06,Honoring R.W. Apple in Words and Food : About ...,0.6858
4,2006-12-07,"Homebuilders, and Worries Over Jobs, Lead a De...",-0.6712


In [6]:
news['news'][6]

"Sales of iPods and iTunes Not Much in Sync : The numbers suggest that iPods are not driving iTunes sales as much as early supporters may have expected.Name That Source : Decades on, a Vietnam War-era mystery has been revealed.In City’s Trans Fat Ban, a Challenge Fit for a Chef : Many in the restaurant industry say they fear that they will not be able to replicate dishes that now exceed the limit on trans fats.After Nasdaq Chief's Irish Bash, a Long Hangover : Even as the Nasdaq Stock Market battles to acquire a recalcitrant London Stock Exchange, Nasdaq's chief executive is in a trans-Atlantic feud of his own over a week of medieval revelry in Ireland. According to the New York Daily News, Robert Greif...Lights! Camera! Advertising! : Squeezing Money From the Music : Major labels have begun demanding a cut of concert earnings or T-shirt, ring tone and merchandise revenue from new artists seeking record contracts. "

Remove stopwords and punctuations

In [7]:

stopwords_list = set(stopwords.words('english'))
sentences = news['news']

stop_removed = []
punc_removed = []
        
for data in sentences:
  words = word_tokenize(data)
  filtered = []
  
  filtered.append([w if w not in stopwords_list else '' for w in words])
  stop_removed.append(filtered)
  

punc_removed.extend([str(row).lower().translate(str.maketrans('', '', string.punctuation)) for row in stop_removed])
news['news_cleaned'] = punc_removed
news.head()

Unnamed: 0,date,news,polarity,news_cleaned
0,2006-12-01,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...,0.7707,what s on tonight 8 pm tlc ashley judd and ...
1,2006-12-04,More on Housing Prices : The broadest governme...,0.872,more housing prices the broadest government ...
2,2006-12-05,HELLO WORLD,0.0,hello world
3,2006-12-06,Honoring R.W. Apple in Words and Food : About ...,0.6858,honoring rw apple words food about 1000 r...
4,2006-12-07,"Homebuilders, and Worries Over Jobs, Lead a De...",-0.6712,homebuilders worries over jobs lead declin...


In [8]:
sentences = punc_removed

print('max sequence length: ', max(len(s) for s in sentences))
print('min sequence length: ', min(len(s) for s in sentences))

s = sorted(len(s) for s in sentences)
print('median sequence length: ', s[len(s)//2])

max sequence length:  4089
min sequence length:  11
median sequence length:  1439


In [0]:
MAX_SEQUENCE_LENGTH = max(len(s) for s in sentences)
MAX_VOCAB_SIZE = 40000
EMBEDDING_DIM = 200

Loading Glove word vectors

In [10]:
word2vec = {}

with open(os.path.join ('glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
  
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.array(values[1:], dtype='float32')
    word2vec[word] = vec
    
print('Found %s word vectors.' % len(word2vec))

loading word vectors..
Found 400000 word vectors.


In [0]:
#tokenisation

tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [12]:
word2idx = tokenizer.word_index
print('found %d unique tokens' %len(word2idx))

found 43101 unique tokens


In [13]:
#padding the sequences

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('shape of the tensor: ', data.shape)

shape of the tensor:  (2517, 4089)


Filling pretrained embeddings

In [14]:
num_words = min(MAX_VOCAB_SIZE, len(word2idx)+1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2idx.items():
  if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

filling pretrained embeddings


In [15]:
embedding_matrix.shape

(40000, 200)

In [16]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21752   , -0.11792   , -0.088778  , ...,  0.27641001,
         0.14048   ,  0.99224001],
       [-0.11359   ,  0.20144001, -0.47073999, ..., -0.42846   ,
         0.37869   , -0.52864999],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [17]:
news_vec = []
news_vec.extend([data[x] for x in range(len(data))])

news['news_vector'] = news_vec

news.head()

Unnamed: 0,date,news,polarity,news_cleaned,news_vector
0,2006-12-01,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...,0.7707,what s on tonight 8 pm tlc ashley judd and ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2006-12-04,More on Housing Prices : The broadest governme...,0.872,more housing prices the broadest government ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2006-12-05,HELLO WORLD,0.0,hello world,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2006-12-06,Honoring R.W. Apple in Words and Food : About ...,0.6858,honoring rw apple words food about 1000 r...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2006-12-07,"Homebuilders, and Worries Over Jobs, Lead a De...",-0.6712,homebuilders worries over jobs lead declin...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [0]:
news_data = news[['date', 'news_cleaned', 'polarity']]

x = np.array(news_data)
y = np.array(news_data.iloc[10:2512,2:]).ravel()

In [22]:
y

array([-0.5228,  0.7059, -0.6705, ...,  0.7932, -0.6908, -0.8885])

In [19]:
data = data[10:2512,]
data.shape

(2502, 4089)

In [0]:
#saving all the processed data for future use, as numpy arrays

np.save('news_vector_data', data)

np.save('news_data', x)
np.save('news_sentiment', y)
np.save('embedding_matrix', embedding_matrix)