In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Perform necessary imports

In [None]:
import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns

# Read data

In [None]:
# Load the CSV at this relative path into the working frame `df`.
df = pd.read_csv(r'../input/stocknews/Combined_News_DJIA.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

## Check for null values

In [None]:
# Per-column count of missing values.
df.isna().sum()

We have null values in the columns 'Top23', 'Top24', and 'Top25'.

We will remove the rows that contain null values

In [None]:
# Keep only the rows that have no missing values.
df = df.dropna()

Check for null values again.

In [None]:
# Per-column count of missing values after dropping incomplete rows.
df.isna().sum()

# Visualize the distribution of target variable

In [None]:
# Bar chart of how many rows carry each value of the 'Label' column.
sns.countplot(data=df, x='Label')

The dataset is balanced in terms of the number of samples per target class.

# Split the dataset into training and test data

In [None]:
# Chronological split: rows dated before 2015-01-01 train, the rest test.
# Compare parsed dates, not raw strings. With dash-separated 'YYYY-MM-DD' text,
# '2015-03-02' < '20150101' is True (because '-' sorts before '0'), so a string
# comparison would put every 2015 row in BOTH the training and the test set.
split_date = pd.Timestamp('2015-01-01')
dates = pd.to_datetime(df['Date'])
df_train = df[dates < split_date]
df_test = df[dates >= split_date]

# Feature Engineering

## Here we will define the functions for performing feature engineering

In [None]:
# function for cleaning the data
def clean_data(dataset):
    """Return the headline columns with every non-letter character blanked out.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns at positions 2 through 26 hold the text to clean
        (the first two columns are skipped).

    Returns
    -------
    pandas.DataFrame
        A new frame containing only those columns, with each character outside
        ``a-z`` / ``A-Z`` replaced by a single space. ``dataset`` itself is
        left unmodified.
    """
    # Use the non-inplace form of replace: iloc hands back a slice of the
    # caller's frame, and mutating that slice in place triggers
    # SettingWithCopyWarning and can write through to ``dataset``.
    return dataset.iloc[:, 2:27].replace("[^a-zA-Z]", " ", regex=True)

# function for combining the headlines of all the columns into single column
def combine_data(data):
    """Collapse each row of ``data`` into one space-separated string.

    Every cell is converted with ``str`` before joining. The result is a list
    with one combined string per row, in positional row order.
    """
    n_rows = len(data.index)
    return [' '.join(map(str, data.iloc[row_pos, :])) for row_pos in range(n_rows)]

# function to perform lemmatization of the word
def lemmatize_data(data, lemmatizer, stop_words=None):
    """Lower-case, stop-word-filter and lemmatize each document.

    Parameters
    ----------
    data : iterable of str
        Documents to process; each one is lower-cased and split on whitespace.
    lemmatizer : object
        Anything exposing ``lemmatize(word) -> str``.
    stop_words : iterable of str, optional
        Tokens to drop (compared after lower-casing). When omitted, NLTK's
        English stop-word list is used.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input document.
    """
    # Resolve the stop words once, as a set. The earlier form called
    # stopwords.words('english') for every single token and did a linear
    # list lookup each time.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    cleaned_dataset = []
    for text in data:
        tokens = text.lower().split()
        kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        cleaned_dataset.append(' '.join(kept))
    return cleaned_dataset

# function to vectorize the data
def vectorize_data(data, cv):
    """Fit ``cv`` on ``data`` and return what ``cv.fit_transform`` produces."""
    return cv.fit_transform(data)

# Clean train and test data

In [None]:
# Run the same cleaning step over both partitions (train first, then test).
clean_train_data, clean_test_data = [clean_data(part) for part in (df_train, df_test)]

# Combine headlines

In [None]:
# Merge each row's headline columns into one string, for train and then test.
comb_train_data, comb_test_data = [combine_data(part) for part in (clean_train_data, clean_test_data)]

# Create Lemmatizer object

In [None]:
# One WordNet lemmatizer instance, reused for both the train and the test pass.
lemmatizer = WordNetLemmatizer()

## Lemmatize the data

In [None]:
# Lemmatize the combined headlines of each partition (train first, then test).
train_data, test_data = [
    lemmatize_data(docs, lemmatizer) for docs in (comb_train_data, comb_test_data)
]

# Create CountVectorizer object

In [None]:
# ngram_range=(2,2): count two-word sequences (bigrams) only, no single words.
cv = CountVectorizer(ngram_range=(2,2))

# Vectorize the data

In [None]:
# vectorize data
# Order matters: vectorize_data() calls cv.fit_transform, so the vocabulary is
# learned from the training text only. The test text is then encoded with that
# already-fitted vocabulary via cv.transform (no refit on test data).
vec_train_data = vectorize_data(train_data, cv)
vec_test_data = cv.transform(test_data)

# Create Random Forest Classifier

In [None]:
# create classifier
# random_state pins the forest's randomness so that Restart & Run All
# reproduces the same model and therefore the same metrics below.
rf_clf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
rf_clf.fit(vec_train_data, df_train['Label'])

# Predictions

In [None]:
# run predictions on test data
y_pred = rf_clf.predict(vec_test_data)

# Check Accuracy

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Confusion matrix of the true test labels (first argument) against the predictions.
confusion_matrix(df_test['Label'], y_pred)

In [None]:
# classification_report returns formatted text, so print() is used to render it.
print(classification_report(df_test['Label'], y_pred))

In [None]:
# Overall test accuracy, rounded to two decimals. The builtin round() works
# whether accuracy_score returns a NumPy scalar or a plain Python float; the
# latter has no .round() method, so the method-call form can raise AttributeError.
print(round(accuracy_score(df_test['Label'], y_pred), 2))

## If you found this notebook helpful, please consider upvoting it. Thank you! All kinds of suggestions are welcome. :)