## Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

## Reading the input

In [2]:
stock_data = pd.read_csv("../input/stock-sentiment-analysis/Stock_Dataa.csv", encoding = 'unicode_escape')

### Renaming the column names

In [3]:
stock_data.columns = stock_data.columns.str.replace("Top","")

In [4]:
# Getting only features
features = stock_data[stock_data.columns.drop(['Date', 'Label'])]

## Preprocessing the features

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
stock_data["combined_features"] = features.apply(lambda x:" ".join(x.values.astype('str')), axis=1)
stock_data["combined_features"] = stock_data["combined_features"].apply(lambda x:" ".join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(re.sub("[^a-zA-Z]", " ", x).lower()) if word not in stopwords.words("english")))
train_data = stock_data[stock_data['Date'] < '20150101']
test_data = stock_data[stock_data['Date'] > '20141231']

In [7]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

## Training the data

In [8]:
train = tfidf_vectorizer.fit_transform(train_data["combined_features"].values)

In [9]:
rfc = RandomForestClassifier()
gv = GridSearchCV(rfc, {}, cv=3)
gv.fit(train, list(train_data["Label"].values))
model = gv.best_estimator_




## Predicting the model

In [10]:
test = tfidf_vectorizer.transform(test_data["combined_features"].values)

In [11]:
y_out = model.predict(test)

## Model Evaluation

In [12]:
confusion_matrix(list(test_data["Label"].values), y_out)

array([[143,  43,   0, ...,   0,   0,   0],
       [ 12, 180,   0, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   1,   0, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,  43]])

In [13]:
accuracy_score(list(test_data["Label"].values), y_out) * 100

74.54175152749491