In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing required libraries

In [None]:
import re
import nltk
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Download NLTK packages
- stopwords: for stopwords
- punkt: tokenizer models used by NLTK's `sent_tokenize`/`word_tokenize` (the Porter stemmer itself needs no download)
- wordnet: for lemmatization

In [None]:
# Corpora/models fetched up front so the later NLTK calls find their data.
# NOTE(review): no visible cell calls an NLTK tokenizer (text is split with
# str.split), so 'punkt' may be unnecessary — confirm before removing.
nltk_resources=['stopwords','punkt','wordnet']
nltk.download(nltk_resources)

Read data

In [None]:
# Load the headline dataset; the file is read as ISO-8859-1 (Latin-1) text.
data_path='../input/predict-stock-price-based-on-news-headline/Data.csv'
df=pd.read_csv(data_path, encoding="ISO-8859-1")
df.head()

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

# Data pre-processing
1. Join headline columns Top1 through Top25 into a single column called 'Headlines'
2. Keep only required columns (Date, Label, Headlines)

In [None]:
# Every column other than Date/Label holds one headline. Selecting them by name
# (and skipping an already-built 'Headlines') removes the dependence on column
# position and keeps this cell safe to re-run.
headline_cols=df.columns.drop(['Date','Label','Headlines'],errors='ignore')
if len(headline_cols):
    # Missing headlines become empty strings. The previous
    # `(df + ' ').astype(str)` form turned NaN into the literal text 'nan',
    # glued to the next headline with no separator, which leaked bogus tokens
    # into the vocabulary.
    df['Headlines']=df[headline_cols].fillna('').astype(str).agg(' '.join,axis=1)
# Keep only the columns used downstream.
df=df[['Date','Label','Headlines']]
df.head()

Get date range and split data based on dates.

In [None]:
# Earliest and latest value in the Date column, shown as a (min, max) pair.
date_col=df['Date']
(date_col.min(), date_col.max())

### Train test split

In [None]:
# Chronological hold-out: rows before the split date train the model, rows on or
# after it are kept for testing.
# NOTE(review): Date is compared as text, which is only chronological if the
# column holds ISO 'YYYY-MM-DD' strings — confirm against the data.
split_date='2015-01-01'
# .copy() gives each split its own frame, so adding columns to train/test later
# does not raise pandas' SettingWithCopyWarning (or silently write to a
# temporary slice of df).
train=df[df['Date']<split_date].copy()
test=df[df['Date']>=split_date].copy()
train.shape,test.shape

# NLP using NLTK

### Text preprocessing
- text preprocessing or clean-up
- stemming
- lemmatization

In [None]:
# Shared NLTK helpers used by the text-cleaning functions below.
# The stopword list is held as a set for fast membership tests.
stopwords_en=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()
stemmer=PorterStemmer()

In [None]:
# Function of text preprocessing or clean-up
def preprocess(msg):
  """Return `msg` reduced to its lower-case, non-stopword words.

  The text is lower-cased, every character that is not an ASCII letter is
  replaced by a space, the result is split on whitespace, and tokens found in
  `stopwords_en` are dropped. The surviving tokens are joined with single
  spaces.
  """
  letters_only=re.sub('[^a-zA-Z]',' ',msg.lower())
  kept=filter(lambda token: token not in stopwords_en, letters_only.split())
  return ' '.join(kept)

In [None]:
# Function: preprocessing with stemming
def stemmatize(msg):
  """Clean `msg` like `preprocess`, then stem every remaining word.

  The text is lower-cased, non-letters are replaced by spaces, tokens found in
  `stopwords_en` are discarded, and each surviving token is passed through
  `stemmer.stem`. The stems are returned joined by single spaces.
  """
  cleaned=re.sub('[^a-zA-Z]',' ',msg.lower())
  content_words=(word for word in cleaned.split() if word not in stopwords_en)
  return ' '.join(map(stemmer.stem, content_words))

In [None]:
# Function: preprocessing with lemmatization
def lemmatize(msg):
  """Clean `msg` like `preprocess`, then lemmatize every remaining word.

  The text is lower-cased, non-letters are replaced by spaces, tokens found in
  `stopwords_en` are discarded, and each surviving token is passed through
  `lemmatizer.lemmatize`. The lemmas are returned joined by single spaces.
  """
  tokens=re.sub('[^a-zA-Z]',' ',msg.lower()).split()
  lemmas=[]
  for tok in tokens:
    if tok in stopwords_en:
      continue
    lemmas.append(lemmatizer.lemmatize(tok))
  return ' '.join(lemmas)

In [None]:
 # Stemmatize, lemmatize or just pre-process: swap `preprocess` for
 # `stemmatize` or `lemmatize` to change how the headlines are cleaned.
 # assign() returns a new frame instead of writing a column into what may be a
 # slice of df, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
 train=train.assign(Wordlist=train['Headlines'].apply(preprocess))
 train.head()

In [None]:
# Features are the cleaned headline text; the target is the Label column.
X_train, y_train = train['Wordlist'], train['Label']

### Vectorization
- Bag of words
- TF-IDF

In [None]:
# Bag of words over bigrams only (ngram_range=(2,2)).
bow=CountVectorizer(ngram_range=(2,2))
# Fit on the text column itself rather than on X_train: the old
# `X_train=bow.fit_transform(X_train)` fed the already-vectorized matrix back
# into the vectorizer whenever this cell was run a second time.
X_train=bow.fit_transform(train['Wordlist'])

# Training the model (sklearn)
- Random Forest Classifier

In [None]:
# 200 entropy-split trees. random_state pins the forest's bootstrap sampling and
# feature selection so the reported accuracy is reproducible across
# Restart & Run All.
rf_clf=RandomForestClassifier(n_estimators=200,criterion='entropy',random_state=42)
rf_clf.fit(X_train,y_train)

# Testing the model

In [None]:
# Pre-processing test data
# assign() returns a new frame rather than writing a column into what may be a
# slice of df (avoids SettingWithCopyWarning and keeps the cell re-runnable).
test=test.assign(Wordlist=test['Headlines'].apply(preprocess))
# transform() only — the vocabulary was fitted on the training text, so the test
# matrix shares the training feature columns.
X_test=bow.transform(test['Wordlist'])
y_test=test['Label']

In [None]:
# Predictions: one label per row of the vectorized test set, from the fitted forest
y_pred=rf_clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test,y_pred)

In [None]:
# Show the confusion matrix as a labelled table instead of a bare printed array:
# rows are the true labels, columns the predicted ones, so false positives and
# false negatives can be read off directly.
labels=rf_clf.classes_
pd.DataFrame(
    confusion_matrix(y_test,y_pred,labels=labels),
    index=[f'actual {c}' for c in labels],
    columns=[f'predicted {c}' for c in labels],
)

An accuracy of about 50% has no practical utility, and the confusion matrix shows too many false positives.