In [11]:
import pandas as pd
import spacy as sp
from spacy.util import minibatch, compounding
import os
import random
from spacy.lang.en.stop_words import STOP_WORDS
pd.set_option('display.max_colwidth', None)

In [26]:
#read from stock kaggle dataset
stock_df = pd.read_csv('stock_data.csv')
stock_df.head()

Unnamed: 0,Text,Sentiment
0,"Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ trade method 1 or method 2, see prev posts",1
1,user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year. AWESOME.,1
2,user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [24]:
def load_training_data(
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    tweets = []
    
    stop_words = sp.lang.en.stop_words.STOP_WORDS
    
    for ind, row in stock_df.iterrows():
        tweet = row['Text']
        label = row['Sentiment']
        spacy_label = {
            "cats": {
                "pos": 1 == label,
                "neg": -1 == label}
        }
        
        #remove stopwords from the tweet before appending it
        tweet_fixed = [word for word in tweet if word not in stop_words]
        
        tweets.append((tweet_fixed, spacy_label))
        
    random.shuffle(tweets)

    if limit:
        tweets = tweets[:limit]
    split = int(len(tweets) * split)
    return tweets[:split], tweets[split:]

In [20]:
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    tweets, labels = zip(*test_data)
    tweets = (tokenizer(tweet) for tweet in tweets)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, tweet in enumerate(textcat.pipe(tweets)):
        true_label = labels[i]
        print(true_label)
        for predicted_label, score in tweet.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [21]:
def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = sp.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "ensemble"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
         textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")
    
    print(nlp.pipe_names)
    
    #only trains textcat pipe
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    
    print(training_excluded_pipes)
    
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        
        for i in range(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )
                
            '''with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )'''
    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

In [22]:
TEST = "Food Sector as always doing extremely well: PZZA DPZ JACK MCD CAKE"

def test_model(input_data: str = TEST):
    #  Load saved trained model
    loaded_model = sp.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    prediction = 0
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = 1
        score = parsed_text.cats["pos"]
    else:
        prediction = -1
        score = parsed_text.cats["neg"]
    #print(
        #f"Predicted sentiment: {prediction}"
        
        #f"\tScore: {score}"
    #)
    return prediction
    


In [25]:
train, test = load_training_data()
train_model(train, test)

['tagger', 'parser', 'ner', 'textcat']
['tagger', 'parser', 'ner']


ValueError: [E001] No component '['tagger', 'parser', 'ner']' found in pipeline. Available names: ['tagger', 'parser', 'ner', 'textcat']

In [None]:
y_true = []
predicted = []
for i in range(len(test)):
    y_true.append(test[i][1]['cats']['pos'] == True)
    predicted.append(test_model(test[i][0]))

In [None]:
from sklearn.metrics import precision_recall_fscore_support

#(precision, recall, F1, support = None)
precision_recall_fscore_support(y_true, predicted, average='micro')

# Random stuff below

In [31]:
#resource/tutorial used:  https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = sp.load('en_core_web_sm')
stop_words = sp.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

X = stock_df['Text'] # the features we want to analyze
ylabels = stock_df['Sentiment'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)


In [33]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x0000021D41B10DA0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [37]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))
print('Logistic Regression F1:', metrics.f1_score(y_test, predicted))

Logistic Regression Accuracy: 0.7825086306098964
Logistic Regression Precision: 0.7993366500829188
Logistic Regression Recall: 0.8763636363636363
Logistic Regression F1: 0.8360797918473547
