In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the headline dataset (path is relative to the notebook) and preview the first rows.
# Besides `headline`, the file carries two leftover index columns
# (`Unnamed: 0.1`, `Unnamed: 0`), `date`, `articleid`, and a stored
# `compound_vader_score`.
data = pd.read_csv('global_headlines_df.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,date,articleid,headline,compound_vader_score
0,0,1991-11-14 00:00:00+00:00,wsj_398284048,Banking Bill Negotiators Set Compromise --- Pl...,0.296
1,1,1986-06-16 00:00:00+00:00,wsj_397959018,Manager's Journal: Sniffing Out Drug Abusers I...,-0.7003
2,2,2001-05-24 00:00:00+00:00,wsj_398739166,"Bank of Montreal, Royal Bank Profits Rose in 2...",0.4404
3,3,1986-10-22 00:00:00+00:00,wsj_397957465,Battle Over Medical Costs Isn't Over,-0.3818
4,4,2005-12-08 00:00:00+00:00,wsj_399004010,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019


In [3]:
# Keep only the headline text; every other column, including the stored
# compound_vader_score, is discarded here (the score is recomputed in the next cell).
# Selecting with a list inside `.loc` yields a single-column DataFrame, not a Series.
data = data.loc[:,['headline']]

In [4]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score every headline with VADER and keep only the 'compound' value.
# Iterating the column directly replaces the positional `.iloc` lookup per row.
cs = [analyzer.polarity_scores(headline)['compound'] for headline in data['headline']]

data['compound_vader_score'] = cs

# Remove headlines whose compound score is exactly 0 so that only rows with a
# non-zero (positive or negative) score remain, then renumber the index.
data = data[data['compound_vader_score'] != 0].reset_index(drop=True)

data

[nltk_data] Downloading package vader_lexicon to C:\Users\Saurabh
[nltk_data]     Kamal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,headline,compound_vader_score
0,Banking Bill Negotiators Set Compromise --- Pl...,0.2960
1,Manager's Journal: Sniffing Out Drug Abusers I...,-0.7003
2,"Bank of Montreal, Royal Bank Profits Rose in 2...",0.4404
3,Battle Over Medical Costs Isn't Over,-0.3818
4,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019
...,...,...
4841,Stocks Rise for Third Straight Session: Better...,0.7579
4842,"Sawyer Sees Strong Economy For 2 Years, Truce ...",0.5106
4843,Oil's losses are airlines' gains,-0.0772
4844,Full Senate to vote on Bernanke; PANEL ADVANCE...,-0.3612


In [5]:
# Binary label taken from the sign of the compound score:
# '0' for negative headlines, '1' for positive ones (kept as strings at this stage).
scores = data['compound_vader_score']
is_negative = scores < 0
is_positive = scores > 0
data.loc[is_negative, 'sentiment_class'] = '0'
data.loc[is_positive, 'sentiment_class'] = '1'

In [6]:
# Check column dtypes: sentiment_class holds the string labels '0'/'1', so it is reported as object.
data.dtypes

headline                 object
compound_vader_score    float64
sentiment_class          object
dtype: object

In [7]:
# Cast the string labels '0'/'1' to 64-bit integers; all other columns keep their dtype.
data = data.astype({'sentiment_class': 'int64'})

In [8]:
# The raw score is no longer needed once the class label exists.
# Rebinding the result (instead of `inplace=True`) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['compound_vader_score'], errors='ignore')

In [9]:
# Display the frame as it now stands: headline text plus the integer sentiment_class label.
data

Unnamed: 0,headline,sentiment_class
0,Banking Bill Negotiators Set Compromise --- Pl...,1
1,Manager's Journal: Sniffing Out Drug Abusers I...,0
2,"Bank of Montreal, Royal Bank Profits Rose in 2...",1
3,Battle Over Medical Costs Isn't Over,0
4,"Dow Falls 45.95, Late GM Surge Stanches Losses",0
...,...,...
4841,Stocks Rise for Third Straight Session: Better...,1
4842,"Sawyer Sees Strong Economy For 2 Years, Truce ...",1
4843,Oil's losses are airlines' gains,0
4844,Full Senate to vote on Bernanke; PANEL ADVANCE...,0


In [10]:
# Pull the model inputs (headline text) and targets (sentiment class) out of the
# DataFrame as NumPy arrays; they are split into train and test sets further down.
# Note: the labels come from the sign of the VADER score computed on these same headlines.

from sklearn.model_selection import train_test_split
review = data['headline'].to_numpy()
label = data['sentiment_class'].to_numpy()

In [11]:
# Inspect the array of headline strings (model inputs).
review

array(["Banking Bill Negotiators Set Compromise --- Plan to Widen Banks' Entry To Securities Business Is Dropped as Vote Nears",
       "Manager's Journal: Sniffing Out Drug Abusers Is No Quick Fix",
       'Bank of Montreal, Royal Bank Profits Rose in 2nd Period', ...,
       "Oil's losses are airlines' gains",
       'Full Senate to vote on Bernanke; PANEL ADVANCES RENOMINATION Sharp debate hints at difficult confirmation',
       'Reinventing Opportunities'], dtype=object)

In [12]:
# Inspect the array of integer class labels (model targets).
label

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [13]:
# Hold out 25% of the headlines for testing; the fixed random_state keeps the split repeatable.
review_train, review_test, label_train, label_test = train_test_split(
    review,
    label,
    test_size=0.25,
    random_state=1000,
)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF bag-of-words features: lowercase the text, drop English stop words,
# and use the unsmoothed IDF weighting.
review_vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training headlines only, then
# reuse the fitted vectorizer unchanged on the test headlines.
Xlr_train = review_vectorizer.fit_transform(review_train)
Xlr_test = review_vectorizer.transform(review_test)
Xlr_train
                                    
                                    

<3634x6659 sparse matrix of type '<class 'numpy.float64'>'
	with 30697 stored elements in Compressed Sparse Row format>

### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features, using the estimator's default settings.
# `fit` returns the estimator itself, so construction and training can be chained.
LRmodel = LogisticRegression().fit(Xlr_train, label_train)

# Mean accuracy on the held-out test split.
score = LRmodel.score(Xlr_test, label_test)
print("Accuracy:", score)

Accuracy: 0.7978547854785478


### CNN

In [16]:
from keras.preprocessing.text import Tokenizer
# Build the word -> integer index from the training headlines only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(review_train)
# Encode each headline as a list of integer word indices.
Xcnn_train = tokenizer.texts_to_sequences(review_train)
Xcnn_test = tokenizer.texts_to_sequences(review_test)
# The +1 presumably reserves index 0 for padding (sequences are zero-padded later) — confirm.
# NOTE(review): the model summary further down reports 1,470,000 embedding
# parameters, i.e. 7,350 rows at embedding_dim=200, so vocab_size is larger than
# the num_words=5000 cap given to the Tokenizer — confirm whether the embedding
# should be sized from num_words instead.
vocab_size = len(tokenizer.word_index) + 1
# Sanity check: show one raw headline next to its encoded form.
print(review_train[1])
print(Xcnn_train[1])

Dollar Ends Mixed As Action by Fed Eagerly Awaited ---- Special to The Wall Street Journal
[25, 440, 142, 11, 418, 21, 14, 3502, 1471, 116, 1, 3, 70, 69, 240]


In [17]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded headline to a fixed length of `maxlen` entries,
# appending zeros at the end ('post') of shorter sequences.
maxlen = 100
pad_options = {'padding': 'post', 'maxlen': maxlen}
Xcnn_train = pad_sequences(Xcnn_train, **pad_options)
Xcnn_test = pad_sequences(Xcnn_test, **pad_options)

# First padded training sequence, as a quick visual check.
print(Xcnn_train[0])

[   3 2352 2353 3501  853    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [18]:
from keras.models import Sequential
from keras import layers 

In [19]:
# Build the text CNN as a stack of layers:
# embedding -> 1D convolution -> global max pooling -> small dense head -> sigmoid output.

embedding_dim = 200
textcnnmodel = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Single sigmoid output for the 0/1 label, trained with binary cross-entropy.
textcnnmodel.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
textcnnmodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          1470000   
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 128)           128128    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 1,599,429
Trainable params: 1,599,429
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train the CNN, then report accuracy on the training and test splits.
# NOTE(review): the test split is also passed as validation_data; nothing is tuned
# on it here, but a separate validation split would be needed before adding
# early stopping or model selection.

textcnnmodel.fit(
    Xcnn_train,
    label_train,
    validation_data=(Xcnn_test, label_test),
    epochs=10,
    batch_size=10,
    verbose=False,
)

# Accuracy on the data the model was fitted on.
loss, accuracy = textcnnmodel.evaluate(Xcnn_train, label_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the held-out headlines.
loss, accuracy = textcnnmodel.evaluate(Xcnn_test, label_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))


Training Accuracy: 1.0000
Testing Accuracy: 0.8053
