# Importing Necessary Libraries

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import seaborn as sns

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Understanding the Data

In [None]:
# Read the stock-market sentiment CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/stockmarket-sentiment-dataset/stock_data.csv'
df = pd.read_csv(csv_path)
df.head()

### Shape of the dataset

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print a per-column summary (dtype and non-null count) plus memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Check for duplicate entries

In [None]:
# Select the rows that DataFrame.duplicated() flags, i.e. exact repeats of an
# earlier row, and display them.
is_repeat = df.duplicated()
duplicate = df[is_repeat]
duplicate

There are no duplicate entries in the data.

In [None]:
# Class balance: how many rows carry each distinct Sentiment value.
df['Sentiment'].value_counts()

In [None]:
# Bar chart of the number of rows per Sentiment value.
sns.countplot(x='Sentiment',data=df)

In [None]:
# Accumulator for the cleaned documents; the preprocessing cell below fills it
# with one string per row of df.
corpus=[]

### Data Preprocessing

In [None]:
# Build the cleaned corpus: one normalised, space-joined string per row of df.
#
# The stop-word set, the stemmer and the compiled regex are loop invariants and
# are created once here. Previously the stop-word list was re-read into a new
# set and a fresh PorterStemmer was constructed for every single token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
non_letters = re.compile(r'[^a-zA-Z]')

# `corpus` is rebuilt rather than appended to, so re-running this cell cannot
# duplicate documents and leave the corpus longer than df.
# Per document: replace every non-letter with a space, lowercase, split on
# whitespace, drop stop words (compared before stemming), then stem.
corpus = [
    ' '.join(
        stemmer.stem(token)
        for token in non_letters.sub(' ', text).lower().split()
        if token not in stop_words
    )
    for text in df.iloc[:, 0]  # column 0 is treated as the text field
]

In [None]:
# Sanity check: number of cleaned documents collected in corpus.
len(corpus)

In [None]:
# Bag-of-words encoding of the cleaned corpus, with the vocabulary capped at
# 5000 terms, materialised as a dense array of token counts.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split further down, so test documents influence the feature set.
# Confirm this is acceptable for the evaluation.
vec = CountVectorizer(max_features=5000)
term_counts = vec.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Preview the first 20 vocabulary terms learned by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. The old name is kept as a
# fallback so the cell still runs on older installations.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# Convert to plain str so the preview displays as an ordinary list of strings.
list(map(str, feature_names[:20]))

In [None]:
# Shape of the dense count matrix produced by the vectorizer.
X.shape

In [None]:
# Target labels: the Sentiment column of the loaded frame.
y=df['Sentiment']

# Model Creation

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# Shapes of the four split pieces, to confirm features and labels line up.
X_train.shape,y_train.shape,X_test.shape,y_test.shape

In [None]:
# Fit a 250-tree random forest on the training features and predict labels for
# the held-out split.
# random_state is pinned, as it already is for train_test_split, so the
# accuracy, confusion matrix and classification report are reproducible.
# Without it every run grows a different forest and reports different numbers.
rfc = RandomForestClassifier(n_estimators=250, max_depth=None, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [None]:
# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(y_test,y_pred)

In [None]:
# Confusion matrix of true vs. predicted labels drawn as an annotated heatmap;
# fmt='.0f' prints each cell count without decimals.
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.0f')

In [None]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test,y_pred))