In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory (in os.walk order) and print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Data Pre-Processing

In [None]:
# Load the stock-news headlines CSV from the Kaggle input directory and
# preview its first rows.
df = pd.read_csv('/kaggle/input/stock-news-headlines/Data.csv')
df.head()

In [None]:
# (rows, columns) of the full dataset
df.shape

In [None]:
# Separate training and test data chronologically: rows dated before
# 2015-01-01 train the models, rows dated 2015-01-01 or later are held out.
# The dates are parsed before comparing because a raw string comparison is
# only correct when the column uses exactly the same layout as the cutoff
# literal; with a delimited layout such as 'YYYY-MM-DD' the two string filters
# overlap and held-out rows leak into the training set.
# NOTE(review): assumes every value in `Date` is parseable by pd.to_datetime — confirm.
split_dates = pd.to_datetime(df['Date'])
split_cutoff = pd.Timestamp('2015-01-01')
train = df[split_dates < split_cutoff]
test = df[split_dates >= split_cutoff]

In [None]:
# (rows, columns) of the training split
train.shape

In [None]:
# Set up training data: the columns at positions 2..26 of the training split.
# NOTE(review): presumably these 25 columns hold the headline text — confirm
# against the CSV layout.
# An explicit copy is taken because later cells rename columns and overwrite
# values on `data`; working on an independent frame avoids pandas'
# SettingWithCopyWarning and any ambiguity about whether `train` is modified.
data = train.iloc[:, 2:27].copy()
data.head()

In [None]:
# Remove special characters: every character that is not an ASCII letter is
# replaced by a space.
# regex=True is required — without it replace() only matches cells whose whole
# value equals the literal string '[^a-zA-Z]', so nothing would be cleaned.
# The result is reassigned rather than using inplace=True, so `data` is never
# mutated through a slice of another frame.
data = data.replace('[^a-zA-Z]', ' ', regex=True)
data.head()

In [None]:
# current column labels of the training frame
data.columns

In [None]:
# Relabel the columns with the strings '0' .. '24' so each one can be
# addressed by a short, predictable key.
new_index = list(map(str, range(25)))
data.columns = new_index
data.head()

In [None]:
# row index of the training frame
data.index

In [None]:
# first relabelled column
data['0']

In [None]:
# Lower-case the text of every relabelled column in a single column-wise pass.
data[new_index] = data[new_index].apply(lambda column: column.str.lower())

data.head()

## Data Wrangling and Merging

In [None]:
# show the first 25 columns of one particular row (the 2nd row, position 1)
data.iloc[1, 0:25]

In [None]:
# Build the list of headlines for one particular date (row), e.g. the 2nd row.
# Each cell goes through str() — as the all-rows cell does — so a missing
# headline (NaN) becomes the text 'nan' instead of making ' '.join raise
# TypeError.
headlines = [str(headline) for headline in data.iloc[1, 0:25]]

# Convert the list into one space-separated string (displayed as cell output).
' '.join(headlines)

In [None]:
# Apply the same joining to every row: one space-separated string per row,
# built from the string form of its first 25 columns.
headlines = [
    ' '.join(str(cell) for cell in data.iloc[row_idx, 0:25])
    for row_idx in range(len(data))
]

In [None]:
# first three joined headline strings
headlines[0:3]

## Bag-Of-Words

In [None]:
# Bag-of-words restricted to two-word n-grams (ngram_range=(2, 2)): learn the
# vocabulary from the training headlines and turn them into a count matrix.
cv = CountVectorizer(ngram_range=(2,2))
train_data_x = cv.fit_transform(headlines)

In [None]:
# (training rows, vocabulary size) of the count matrix
train_data_x.shape

## Random Forest Classifier

In [None]:
# Random forest of 200 trees using the entropy split criterion, trained on the
# bag-of-words matrix against the training labels.
# random_state is fixed so a fresh Restart & Run All rebuilds the same forest
# and therefore the same predictions and metrics; without it every run differs.
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
rf.fit(train_data_x, train['Label'])

In [None]:
# Test data: one space-separated string per test row, built from the string
# form of the columns at positions 2..26.
# NOTE(review): unlike the training headlines, these rows are taken straight
# from `test` without the replace / lower-casing steps — confirm the two
# preprocessing paths are meant to differ.
test_headlines = [
    ' '.join(str(cell) for cell in test.iloc[row_idx, 2:27])
    for row_idx in range(len(test))
]

In [None]:
# bag-of-words: vectorize the test headlines with the vocabulary already
# fitted on the training headlines (transform only, no refit)
test_data = cv.transform(test_headlines)

In [None]:
# (test rows, vocabulary size) of the test count matrix
test_data.shape

In [None]:
# random-forest predictions for the test rows
pred = rf.predict(test_data)
pred

### Evaluation

In [None]:
# confusion matrix of the random-forest predictions
# (called as confusion_matrix(y_true, y_pred): rows are true classes,
# columns are predicted classes)
cm = confusion_matrix(test['Label'], pred)
cm

In [None]:
# Heat-map of the random-forest confusion matrix.
plt.imshow(cm, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

# NOTE(review): confusion_matrix orders classes by sorted label value, so the
# first tick belongs to the smallest label — confirm that class really is the
# "positive" one.
labels = ['positive', 'negative']
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels, rotation=90)
plt.yticks(tick_marks, labels)

# confusion_matrix(y_true, y_pred) puts true classes on rows and predicted
# classes on columns; imshow draws rows along the y-axis and columns along the
# x-axis, so x is "predicted" and y is "true".
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
# Fit the layout last so the axis labels are taken into account.
plt.tight_layout()

In [None]:
# accuracy score of the random-forest predictions on the test set
accuracy_score(test['Label'], pred)

In [None]:
# classification report (per-class precision / recall / F1) for the
# random-forest predictions on the test set
report = classification_report(test['Label'], pred)
print(report)

## Naive Bayes Classifier

In [None]:
# multinomial Naive Bayes with default hyper-parameters
nb = MultinomialNB()

In [None]:
# train on the same bag-of-words matrix and labels used for the random forest
nb.fit(train_data_x, train['Label'])

In [None]:
# Naive Bayes predictions for the test rows
prediction = nb.predict(test_data)
prediction

In [None]:
# confusion matrix of the Naive Bayes predictions
# (called as confusion_matrix(y_true, y_pred): rows are true classes,
# columns are predicted classes)
cm2 = confusion_matrix(test['Label'], prediction)
cm2

In [None]:
def plot_confusion_matrix(cm, title='Confusion Matrix', labels=('positive', 'negative')):
    """Draw a confusion matrix as a blue heat-map on the current pyplot figure.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Matrix laid out the way ``sklearn.metrics.confusion_matrix`` returns
        it: rows are true classes, columns are predicted classes.
    title : str, default 'Confusion Matrix'
        Title placed above the plot.
    labels : sequence of str, default ('positive', 'negative')
        Tick labels, given in the same class order as the rows/columns of
        ``cm``. NOTE(review): ``confusion_matrix`` orders classes by sorted
        label value — confirm the default order matches the label encoding.

    Returns
    -------
    None
        The figure is drawn through the pyplot state machine.
    """
    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=90)
    plt.yticks(tick_marks, labels)

    # imshow draws matrix rows along the y-axis and columns along the x-axis,
    # so with rows = true and columns = predicted the x-axis is "predicted".
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    # Fit the layout last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# plot the Naive Bayes confusion matrix with the helper
plot_confusion_matrix(cm2)

In [None]:
# classification report (per-class precision / recall / F1) for the
# Naive Bayes predictions on the test set
report2 = classification_report(test['Label'], prediction)
print(report2)

In [None]:
# accuracy score of the Naive Bayes predictions on the test set
accuracy_score(test['Label'], prediction)