<h1>Stocks Data Sentiment Analysis.</h1>

<h3>Importing Data</h3>

In [1]:
import warnings

warnings.simplefilter("ignore")

In [2]:
import numpy as np
import pandas as pd

<h3>Importing Stocks Comments Model.</h3>
<p>Using the pandas load stocks_data csv file.<br>
    Looking Head
    <table>
        <tr>
            <th>Text</th>
            <th>Sentiment</th>
        </tr>
        <tr>
            <td>Stocks Comments Data. Need to purify this comments. This Comment will be very helpful to predict the stock to invest.</td>
            <td>Positive/Negative</td>
        </tr>
        </table>
</p>

In [3]:
stocks_data  = pd.read_csv('stock_data.csv')
stocks_data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


<h3>Importing Libraries</h3>

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ethender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
len(stocks_data)

5791

<h3>Cleaning The Data.</h3>
<p>Cleaning english stopwords. changing words to past, future to present words.</p>

In [6]:
corpus = []
for i in range(0,5791):
    review = re.sub('[^a-zA-Z]',' ',stocks_data['Text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
    corpus.append(review)

<p>
    <ul>
        <li>Above checking every word.</li>
        <li>Every word checks removing vocabulary word.</li>
        <li>Changing Past/Future to Present word.</li>
    </ul>
</p>

<p>
    <ol>
        <li>X : Cleaned words is ready to send machine learning to predict.</li>
        <li>Y : Positive words / Negative words</li>
    </ol>
</p>

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000)
x = cv.fit_transform(corpus).toarray()
y = stocks_data.iloc[:,-1].values

In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [9]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(x_train,y_train)

GaussianNB()

In [10]:
y_pred = nb.predict(x_test)

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix(y_test,y_pred)

array([[ 619,    0],
       [1109,   10]])

In [12]:
accuracy_score(y_test,y_pred)

0.3619102416570771

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
y_knn_pred = knn.predict(x_test)
confusion_matrix(y_test,y_knn_pred)

array([[ 614,    5],
       [1110,    9]])

<p>
    <ul>
        <li>After converting words &amp; further processing data.</li>
        <li>Feeding into the machine learning models.</li>
        <li>But accuracy still not high. Need further changes.</li>
    </ul>
</p>

<h2>Alternate way changing.</h2>

In [14]:
x_train,x_test,y_train,y_test = train_test_split(stocks_data['Text'],stocks_data.iloc[:,-1].values,test_size=0.2,random_state=42)

<p>
    <ul>
        <li>Written new class.</li>
        <li>Taking Every word.</li>
        <li>Converting every word to lowercase.</li>
        <li>Converting every number to number literal.</li>
        <li>Removing every punctuation on the word.</li>
        <li>Now appliying past/future to present.</li>
        <li>Returning changing words.</li>
    </ul>
</p>

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter

stemmer = nltk.PorterStemmer()

class WordCounterTranformer(BaseEstimator, TransformerMixin):
    
    def __init__(self,strip_headers=True,lower_case=True,remove_punctuation=True,
                replace_urls=True,replace_numbers=True,stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    
    def fit(self,x,y=None):
        return self
    
    def transform(self,x,y=None):
        x_transformed = []
        for email in x:
            text = email or ''
            if self.lower_case:
                text = text.lower()
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?','NUMBER',text)
            if self.remove_punctuation:
                text = re.sub(r'\W+',' ',text,flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            x_transformed.append(word_counts)
        return np.array(x_transformed)

In [16]:
x_few_wordcounts = WordCounterTranformer().fit_transform(x_train)
x_few_wordcounts

array([Counter({'number': 2, 'ove': 1, 'gnw': 1, 'call': 1, 'are': 1, 'make': 1, 'me': 1, 'feel': 1, 'better': 1, 'about': 1, 'myself': 1, 'for': 1, 'not': 1, 'dump': 1, 'aap': 1, 'sooner': 1}),
       Counter({'number': 6, 'jan': 2, 'csn': 1, 'option': 1, 'trader': 1, 'buy': 1, 'of': 1, 'the': 1, 'call': 1, 'spread': 1, 'against': 1, 'low': 1, 'oi': 1, 'indic': 1, 'enter': 1, 'a': 1, 'posit': 1, 'for': 1, 'bet': 1, 'on': 1, 'data': 1, 'bef': 1}),
       Counter({'and': 2, 'peopl': 1, 'slag': 1, 'aap': 1, 'for': 1, 'cannib': 1, 'but': 1, 'samsung': 1, 'ha': 1, 'number': 1, 'phone': 1, 'no': 1, 'cult': 1, 'or': 1, 'itun': 1, 'it': 1, 'is': 1, 'the': 1, 'new': 1, 'cool': 1, 'idontthinkso': 1}),
       ...,
       Counter({'have': 2, 'rt': 1, 'jchengwsj': 1, 'in': 1, 'hindsight': 1, 'wall': 1, 'street': 1, 'probabl': 1, 'shouldn': 1, 't': 1, 'let': 1, 'luckin': 1, 'coffe': 1, 's': 1, 'chairman': 1, 'more': 1, 'or': 1, 'less': 1, 'unfett': 1, 'access': 1, 'to': 1, 'half': 1, 'aâ': 1}),
   

<p>
        <ul>
            <li>Below class written for checking every word.</li>
            <li>Checking every word and count.</li>
            <li>Final, returning csr_matrix means <b>sparse matrix.</b></li>
        </ul>
</p>

<p>
    <b>What is sparse matrix?</b><br>
    In numerical analysis and scientific computing, a sparse matrix or sparse array is a matrix in which most of the elements are zero. By contrast, if most of the elements are nonzero, then the matrix is considered dense.
    <table>
        <tr>
            <td>5</td>
            <td>0</td>
            <td>0</td>
        </tr>
        <tr>
            <td>0</td>
            <td>12</td>
            <td>0</td>
        </tr>
         <tr>
            <td>0</td>
            <td>0</td>
            <td>47</td>
        </tr>
        </table>
</p>

In [17]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self,vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, x, y=None):
        total_count = Counter()
        for word_count in x:
            for word, count in word_count.items():
                total_count[word] +=  min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common = most_common
        self.vocabulary_ = {word: index +1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self,x,y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(x):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word,0))
                data.append(count)
        return csr_matrix((data,(rows,cols)),shape=(len(x),self.vocabulary_size+1))

In [18]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
x_few_vectors = vocab_transformer.fit_transform(x_few_wordcounts)
x_few_vectors

<4632x11 sparse matrix of type '<class 'numpy.longlong'>'
	with 14593 stored elements in Compressed Sparse Row format>

In [19]:
x_few_vectors.toarray()

array([[13,  2,  0, ...,  1,  1,  0],
       [17,  6,  1, ...,  0,  1,  0],
       [18,  1,  1, ...,  1,  1,  0],
       ...,
       [21,  0,  0, ...,  0,  0,  1],
       [16,  0,  2, ...,  0,  0,  1],
       [11,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [20]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ('text_wordcount', WordCounterTranformer()),
    ('wordcount_to_vector', WordCounterToVectorTransformer()),
])
x_train_transformed = preprocess_pipeline.fit_transform(x_train)

<h2>Machine Learning Models.</h2>
<p>
    <ul>
        <li>Checking Logistic Regression.</li>
        <li>Cross-Validation &amp; Making 3 epochs.</li>
        <li>These shows 77% accuracy.</li>
        <li>Precision: 80.46%.  Means 80.46% correctly Stocks 1 predicted.</li>
        <li>Recall: 86.07%. Means 86.07% correctly Stocks 0 predicted.</li>
    </ul>
</p>

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver='lbfgs',random_state=42)
score = cross_val_score(log_clf,x_train_transformed,y_train,cv=3,verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.760, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.765, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.775, total=   0.1s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished


0.7666234887737479

In [22]:
from sklearn.metrics import precision_score, recall_score

x_test_transformed = preprocess_pipeline.transform(x_test)

log_clf = LogisticRegression(solver='lbfgs',random_state=42)
log_clf.fit(x_train_transformed,y_train)

y_pred = log_clf.predict(x_test_transformed)
print('Precisions: {:.2f}%'.format(100*precision_score(y_test,y_pred)))
print('Recall: {:.2f}%'.format(100*recall_score(y_test,y_pred)))

Precisions: 80.46%
Recall: 86.07%


<h2>Bagging Classifier.</h2>
<p>
    Bagging Classifier will be helpful to train multiple set of samples and estimating.
</p>

In [23]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,
    max_samples=100,bootstrap=True,n_jobs=-1,oob_score=True)
bag_clf.fit(x_train_transformed,y_train)
y_pred = bag_clf.predict(x_test_transformed)

In [24]:
accuracy_score(y_test,y_pred)

0.7083692838654012

In [25]:
bag_clf.oob_score_

0.6996977547495682

<h2>Ensemble Method</h2>
<p>
    <ul>
        <li>Ensemble method will be combine of multiple machine learning algorithms.</li>
        <li>These will be helpful. Combining all results show final better result.</li>
    </ul>
</p>

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [30]:
log_clf = LogisticRegression(solver='lbfgs',random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100,random_state=42)
svm_clf = SVC(gamma='scale',probability=True,random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
decision_clf = DecisionTreeClassifier()

voting_clf = VotingClassifier(estimators=[('log',log_clf),
                                         ('rnd',rnd_clf),
                                         ('svm',svm_clf),
                                         ('knn',knn),
                                         ('tree',decision_clf)],
                             voting='soft')
voting_clf.fit(x_train_transformed,y_train)

VotingClassifier(estimators=[('log', LogisticRegression(random_state=42)),
                             ('rnd', RandomForestClassifier(random_state=42)),
                             ('svm', SVC(probability=True, random_state=42)),
                             ('knn', KNeighborsClassifier()),
                             ('tree', DecisionTreeClassifier())],
                 voting='soft')

In [31]:
for clf in (log_clf,rnd_clf,svm_clf,knn,decision_clf,voting_clf):
    clf.fit(x_train_transformed,y_train)
    y_pred = clf.predict(x_test_transformed)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))
    print('Precisions: {:.2f}%'.format(100*precision_score(y_test,y_pred)))
    print('Recall: {:.2f}%'.format(100*recall_score(y_test,y_pred)))

LogisticRegression 0.7799827437446074
Precisions: 80.46%
Recall: 86.07%
RandomForestClassifier 0.7972389991371872
Precisions: 82.48%
Recall: 86.20%
SVC 0.7791199309749784
Precisions: 77.36%
Recall: 91.94%
KNeighborsClassifier 0.6937014667817084
Precisions: 73.36%
Recall: 80.87%
DecisionTreeClassifier 0.724762726488352
Precisions: 78.88%
Recall: 77.05%
VotingClassifier 0.7937877480586712
Precisions: 81.72%
Recall: 86.75%
