In [1]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

In [2]:
# Import pandas
import pandas as pd

# Import warning
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read news sentiment data
news_sentiment_data = pd.read_csv('global_headlines_df.csv')
news_sentiment_data

Unnamed: 0.1,Unnamed: 0,date,articleid,headline,compound_vader_score
0,0,1991-11-14 00:00:00+00:00,wsj_398284048,Banking Bill Negotiators Set Compromise --- Plan to Widen Banks' Entry To Securities Business Is...,0.2960
1,1,1986-06-16 00:00:00+00:00,wsj_397959018,Manager's Journal: Sniffing Out Drug Abusers Is No Quick Fix,-0.7003
2,2,2001-05-24 00:00:00+00:00,wsj_398739166,"Bank of Montreal, Royal Bank Profits Rose in 2nd Period",0.4404
3,3,1986-10-22 00:00:00+00:00,wsj_397957465,Battle Over Medical Costs Isn't Over,-0.3818
4,4,2005-12-08 00:00:00+00:00,wsj_399004010,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019
...,...,...,...,...,...
4841,4841,2060-10-08 00:00:00+00:00,wapo_141184715,Stocks Rise for Third Straight Session: Better Sales Help Autos,0.7579
4842,4842,2052-02-11 00:00:00+00:00,wapo_152454634,"Sawyer Sees Strong Economy For 2 Years, Truce or Not",0.5106
4843,4843,2009-12-13 00:00:00+00:00,wapo_410349039,Oil's losses are airlines' gains,-0.0772
4844,4844,2009-12-18 00:00:00+00:00,wapo_410346237,Full Senate to vote on Bernanke; PANEL ADVANCES RENOMINATION Sharp debate hints at difficult con...,-0.3612


In [4]:
news_sentiment_data.loc[news_sentiment_data['compound_vader_score'] < 0, 'sentiment_class'] = '0'
news_sentiment_data.loc[news_sentiment_data['compound_vader_score'] > 0, 'sentiment_class'] = '1'

In [5]:
news_sentiment_data.head()

Unnamed: 0.1,Unnamed: 0,date,articleid,headline,compound_vader_score,sentiment_class
0,0,1991-11-14 00:00:00+00:00,wsj_398284048,Banking Bill Negotiators Set Compromise --- Plan to Widen Banks' Entry To Securities Business Is...,0.296,1
1,1,1986-06-16 00:00:00+00:00,wsj_397959018,Manager's Journal: Sniffing Out Drug Abusers Is No Quick Fix,-0.7003,0
2,2,2001-05-24 00:00:00+00:00,wsj_398739166,"Bank of Montreal, Royal Bank Profits Rose in 2nd Period",0.4404,1
3,3,1986-10-22 00:00:00+00:00,wsj_397957465,Battle Over Medical Costs Isn't Over,-0.3818,0
4,4,2005-12-08 00:00:00+00:00,wsj_399004010,"Dow Falls 45.95, Late GM Surge Stanches Losses",-0.4019,0


In [6]:
news_sentiment_data.isnull().sum()

Unnamed: 0              0
date                    0
articleid               0
headline                0
compound_vader_score    0
sentiment_class         0
dtype: int64

In [7]:
# store sentiment_class in y
y = news_sentiment_data.sentiment_class
y.head()

0    1
1    0
2    1
3    0
4    0
Name: sentiment_class, dtype: object

In [8]:
# Store news headlines in X
X = news_sentiment_data.headline

# Convert X in string if the value of x is not string
X = [str(x) if type(x) != str else x for x in X]

In [9]:
test_ratio = 0.2
train_ratio = 1.0 - test_ratio

num_train = int(train_ratio * len(X))

In [10]:
num_train

3876

In [11]:
# so far nothing concrete about it
num_test = int(test_ratio * len(X))

In [12]:
num_test

969

In [13]:
# X_train and Y_train are training dataset. X_test and Y_test are testing dataset.
X_train = X[:num_train]
y_train = y[:num_train]
X_test = X[num_train:]
y_test = y[num_train:]

In [14]:
X_train

["Banking Bill Negotiators Set Compromise --- Plan to Widen Banks' Entry To Securities Business Is Dropped as Vote Nears",
 "Manager's Journal: Sniffing Out Drug Abusers Is No Quick Fix",
 'Bank of Montreal, Royal Bank Profits Rose in 2nd Period',
 "Battle Over Medical Costs Isn't Over",
 'Dow Falls 45.95, Late GM Surge Stanches Losses',
 'U.S., week ahead of economic summit, signals it wants lower interest rates',
 'Net Worth, Not Income, Is Right Measure of Inequality',
 'The Americas: Ecuador Struggles Toward a Dollar Economy',
 'Dollar Declines as Players Take Profits From Rally and After Fed Boosts Rates',
 'In Europe, Job Protections for Older Generation Are Barriers for Younger Workers; Earnings Gap Looms for Younger Generation Dependent on Short-Term Contracts',
 'Outlook for Business Investment Improves, but Softness May Persist',
 'Tech Sector in Hiring Drive; Google, Intel Add Workers as Profits Snap Back; Start-Ups Also Fight for Talent',
 "Fed's Greenspan Refuses to Accept

In [15]:
y_train

0       1
1       0
2       1
3       0
4       0
       ..
3871    1
3872    1
3873    1
3874    1
3875    1
Name: sentiment_class, Length: 3876, dtype: object

In [16]:
X_test

["Films Going 'On Location' Helping Some U.S. Cities",
 'Iraq to Pay Damages For Attack; U.S. Senate Votes To Require Briefing On Dangers in Gulf',
 'Greenspan Notes Weak Recovery; Prospect Is Raised For Interest Rate Cut',
 'Steel Shares Weak, General List Irregular: Trading at 3-Month Low',
 '31 Large Cities Have Substantial Unemployment',
 'Recession Unheard of In Daytona',
 'Slow Job Growth Said to Foreshadow Rate Cut: Many Analysts Expect...',
 'TRENDLINES: Fighting Inflation Looking More Closely at Current-Dollar Growth',
 "Mobil Reports Flat Earnings in '91, With 1992 'Not Off to a Good Start",
 'Flirting With Deficit Disasters',
 'Most Argentines Back Their President, Not Debt',
 'Berlin Crisis Blamed',
 'Weapon Overruns Are Laid to Haste, Inflation, Errors',
 "Clinton's Push for Paid Parental Leave Falls Flat in States",
 "Suburbs 'Sit Up and Take Notice' of Hispanics' Grievances",
 'Praise, Wariness Greet Controls Shift',
 "Battle for Nixon's Mind",
 'Steels, Rails Spark Rene

In [17]:
y_test

3876    1
3877    0
3878    1
3879    0
3880    0
       ..
4841    1
4842    1
4843    0
4844    0
4845    1
Name: sentiment_class, Length: 970, dtype: object

#### This step is converting train and text dataset into vectors using TF-IDF method.

In [18]:
# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer and required arguments to process the data
tfidf_vectorizer = TfidfVectorizer(
    smooth_idf=False, use_idf=True, stop_words='english', lowercase=True)

# Fit and transform the model on train dataset
X_new_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test dataset
X_new_test = tfidf_vectorizer.transform(X_test)

### Random Forest

In [20]:
from sklearn.metrics import accuracy_score

# Random Forest
from sklearn.ensemble import RandomForestClassifier

In [21]:
rfm = RandomForestClassifier(n_estimators=70, oob_score=True, n_jobs=-1, random_state=101, max_features=None, min_samples_leaf=30)

In [22]:
rfm.fit(X_new_train, y_train)

RandomForestClassifier(max_features=None, min_samples_leaf=30, n_estimators=70,
                       n_jobs=-1, oob_score=True, random_state=101)

In [23]:
prediction = rfm.predict(X_new_test)

In [24]:
print(accuracy_score(y_test,prediction))

0.6793814432989691


In [25]:
from sklearn.metrics import confusion_matrix 
print(confusion_matrix(y_test, prediction))

[[269 213]
 [ 98 390]]


In [26]:
from sklearn.metrics import classification_report 
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.73      0.56      0.63       482
           1       0.65      0.80      0.71       488

    accuracy                           0.68       970
   macro avg       0.69      0.68      0.67       970
weighted avg       0.69      0.68      0.67       970



### SVM

In [30]:
from sklearn.svm import SVC
svm = SVC(kernel="linear", C=0.025, random_state=101)


In [31]:
svm.fit(X_new_train, y_train)

SVC(C=0.025, kernel='linear', random_state=101)

In [32]:
prediction = svm.predict(X_new_test)

In [33]:
print(accuracy_score(y_test,prediction))

0.5030927835051546


In [34]:
from sklearn.metrics import confusion_matrix 
print(confusion_matrix(y_test, prediction))

[[  0 482]
 [  0 488]]


In [35]:
from sklearn.metrics import classification_report 
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       482
           1       0.50      1.00      0.67       488

    accuracy                           0.50       970
   macro avg       0.25      0.50      0.33       970
weighted avg       0.25      0.50      0.34       970

