### Import statements 

In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
from sklearn import metrics
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.corpus import stopwords
# NOTE(review): wildcard import; it is what brings BigramCollocationFinder
# (used by generate_bigram) into scope. Prefer an explicit import.
from nltk.collocations import *
# Association measures; generate_bigram passes bigram_measures.raw_freq to the finder.
bigram_measures = nltk.collocations.BigramAssocMeasures()
import calendar
import datetime
import string

# Single shared sentiment analyzer, reused for every headline in extract_features.
sia = SentimentIntensityAnalyzer()


### Import and clean data 

In [2]:
# Load Reddit data and standardize it.
reddit = pd.read_csv("reddit_clean_1.csv")
# Rename the text column to "Headlines" and drop rows with no text.
# NOTE: reset_index() is called without drop=True, so the previous index is kept
# as a new leading "index" column; later cells slice columns with iloc[:, 1:-1],
# which skips whatever column is first.
reddit = reddit.rename(columns = {"title": "Headlines"}).dropna(subset=["Headlines"]).reset_index()
# First 50 rows only (presumably the labelled subset -- TODO confirm).
reddit_50 = reddit.iloc[0:50,:]

# Load Seeking Alpha analyst headlines (used as-is; no renaming or NaN filtering here).
sa = pd.read_csv("SeekingAlpha_Coded.csv")
# First 100 rows only.
sa_100 = sa.iloc[0:100,:]

# Load Seeking Alpha news headlines (used as-is; no renaming or NaN filtering here).
san = pd.read_csv("SeekingAlpha_News_Coded.csv")
# First 75 rows only.
san_75 = san.iloc[0:75,:]

# Load Twitter data and standardize it the same way as the Reddit data:
# "Text" becomes "Headlines", rows without text are dropped, and the old index
# is kept as a leading "index" column by reset_index().
tw = pd.read_csv("Twitter_clean_1.csv")
tw = tw.rename(columns = {"Text": "Headlines"}).dropna(subset=["Headlines"]).reset_index()
# First 50 rows only.
tw_50 = tw.iloc[0:50,:]

In [3]:
# Function to get top 100 positive, negative and neutral words

def top_100 (df, n=100):
    """Return the most frequent class-exclusive words from the training split.

    The frame is split with ``train_test_split(..., random_state=1)`` and only
    the training rows are used. Headlines are tokenized with
    ``nltk.word_tokenize`` and lower-cased, then counted per class. Any word
    that occurs in more than one class is discarded, so each returned list
    contains words seen in that class only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Headlines" text column among columns ``1:-1`` and have
        the label as its last column, named "Classification", with the values
        "pos", "neg" and "neu".
    n : int or None, default 100
        Maximum number of words to return per class (``None`` returns all).

    Returns
    -------
    tuple of (list, list, list)
        Most common positive, negative and neutral words, in that order, each
        sorted by descending frequency.
    """
    # Column 0 and the label column are excluded from X; the label is joined
    # back so rows can be filtered by class. The test half is not needed here.
    X = df.iloc[:, 1:-1]
    y = df.iloc[:, -1]
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)
    train = X_train.join(y_train).reset_index()

    # One lower-cased word frequency distribution per class.
    freq_by_label = {}
    for label in ("pos", "neg", "neu"):
        headlines = train[train["Classification"] == label]["Headlines"]
        tokenized = headlines.apply(nltk.word_tokenize)
        freq_by_label[label] = nltk.FreqDist(
            token.lower() for sentence in tokenized for token in sentence
        )

    positive_fd = freq_by_label["pos"]
    negative_fd = freq_by_label["neg"]
    neutral_fd = freq_by_label["neu"]

    # Words present in at least two classes do not discriminate between them.
    pos_vocab, neg_vocab, neu_vocab = set(positive_fd), set(negative_fd), set(neutral_fd)
    common_set = (neu_vocab & neg_vocab) | (pos_vocab & neg_vocab) | (pos_vocab & neu_vocab)

    for fd in (positive_fd, negative_fd, neutral_fd):
        for word in common_set:
            if word in fd:
                del fd[word]

    top_positive = [word for word, _ in positive_fd.most_common(n)]
    top_negative = [word for word, _ in negative_fd.most_common(n)]
    top_neutral = [word for word, _ in neutral_fd.most_common(n)]

    return (top_positive, top_negative, top_neutral)


In [22]:
# Function to generate bigram 

def generate_bigram(sentence):
    """Return the sorted bigrams of a sentence as a list of 2-tuples.

    ASCII punctuation is stripped and the text is lower-cased before it is
    tokenized with ``nltk.wordpunct_tokenize``. The bigrams are the n-grams
    scored by a ``BigramCollocationFinder`` (raw frequency); only the bigrams
    themselves are kept, in sorted order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(strip_punctuation)
    words = nltk.wordpunct_tokenize(cleaned.lower())

    scored_pairs = BigramCollocationFinder.from_words(words).score_ngrams(bigram_measures.raw_freq)

    # The scores are only a by-product of the finder; discard them.
    pairs = [pair for pair, _ in scored_pairs]
    pairs.sort()
    return pairs

def bigram_top_100(df, n=100):
    """Return the most frequent class-exclusive bigrams from the training split.

    The frame is split with ``train_test_split(..., random_state=1)`` and only
    the training rows are used. Each headline is converted to bigrams with
    ``generate_bigram`` and the bigrams are counted per class. Any bigram that
    occurs in more than one class is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Headlines" text column among columns ``1:-1`` and have
        the label as its last column, named "Classification", with the values
        "pos", "neg" and "neu".
    n : int or None, default 100
        Maximum number of bigrams to return per class (``None`` returns all).

    Returns
    -------
    tuple of (set, set, set)
        Most common positive, negative and neutral bigrams, in that order.
        Each bigram is a 2-tuple of strings; the sets are unordered.
    """
    # Column 0 and the label column are excluded from X; the label is joined
    # back so rows can be filtered by class. The test half is not needed here.
    X = df.iloc[:, 1:-1]
    y = df.iloc[:, -1]
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)
    train = X_train.join(y_train).reset_index()

    # One bigram frequency distribution per class.
    freq_by_label = {}
    for label in ("pos", "neg", "neu"):
        headlines = train[train["Classification"] == label]["Headlines"]
        per_headline = headlines.apply(generate_bigram)
        freq_by_label[label] = nltk.FreqDist(
            bigram for bigrams in per_headline for bigram in bigrams
        )

    positive_fd = freq_by_label["pos"]
    negative_fd = freq_by_label["neg"]
    neutral_fd = freq_by_label["neu"]

    # Bigrams present in at least two classes do not discriminate between them.
    pos_vocab, neg_vocab, neu_vocab = set(positive_fd), set(negative_fd), set(neutral_fd)
    common_set = (neu_vocab & neg_vocab) | (pos_vocab & neg_vocab) | (pos_vocab & neu_vocab)

    for fd in (positive_fd, negative_fd, neutral_fd):
        for bigram in common_set:
            if bigram in fd:
                del fd[bigram]

    top_positive = {bigram for bigram, _ in positive_fd.most_common(n)}
    top_negative = {bigram for bigram, _ in negative_fd.most_common(n)}
    top_neutral = {bigram for bigram, _ in neutral_fd.most_common(n)}
    return (top_positive, top_negative, top_neutral)


In [25]:
# Inspect the neutral-class bigram set (index 2 of the returned tuple) for the Reddit subset.
bigram_top_100(reddit_50)[2]

{('2021', 'or'),
 ('30', '2020'),
 ('40k', 'on'),
 ('7', 'billion'),
 ('800c', 'exp'),
 ('a', 'cookie'),
 ('a', 'model'),
 ('a', 'picture'),
 ('actually', 'from'),
 ('and', 'more'),
 ('berlin', 'one'),
 ('billion', 'dollars'),
 ('bought', 'a'),
 ('build', 'progress'),
 ('cant', 'delete'),
 ('cookie', 'on'),
 ('cost', 'investors'),
 ('covered', '800c'),
 ('daily', 'investor'),
 ('dateprice', 'conjecture'),
 ('december', '30'),
 ('delete', 'this'),
 ('discussion', 'december'),
 ('do', 'we'),
 ('dopfner', 'tesla'),
 ('drip', 'feed'),
 ('ellison', 'in'),
 ('elon', 'actually'),
 ('elon', 'musk'),
 ('exp', '0108'),
 ('feed', 'over'),
 ('for', 'days'),
 ('for', 'q4'),
 ('from', 'plantlabs'),
 ('from', 'the'),
 ('front', 'page'),
 ('fsd', 'subscription'),
 ('funds', 'cost'),
 ('gang', 'wya'),
 ('get', 'the'),
 ('gigafactory', 'berlin'),
 ('had', 'existed'),
 ('hawaii', 'honolulu'),
 ('here', 'for'),
 ('honolulu', 'staradvertiser'),
 ('if', 'it'),
 ('in', '2021'),
 ('in', 'hawaii'),
 ('index', 

In [5]:
# function to filter stop words 
# for word vect

def remove_stop_words(func):
    """Drop English stop words from three word collections.

    ``func`` is indexed as ``func[0]``, ``func[1]``, ``func[2]`` and those
    entries are treated as the positive, negative and neutral word iterables.
    Returns a tuple of three lists (positive, negative, neutral) holding the
    items that are not in NLTK's English stop-word list, in their original
    order.
    """
    english_stops = set(stopwords.words("english"))

    positives, negatives, neutrals = func[0], func[1], func[2]

    def _without_stops(words):
        return [word for word in words if word not in english_stops]

    kept_pos = _without_stops(positives)
    kept_neu = _without_stops(neutrals)
    kept_neg = _without_stops(negatives)

    return (kept_pos, kept_neg, kept_neu)



In [6]:
#reddit_50
#top_100(reddit_50)[1]

### Generate Dataset Features

In [7]:
def extract_features(df, df1):
    """Build per-headline sentiment and vocabulary-count features.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to featurize; must have a "Headlines" column of strings.
    df1 : pandas.DataFrame
        Labelled frame passed to ``top_100`` and ``bigram_top_100`` to learn
        the class-exclusive word and bigram vocabularies.

    Returns
    -------
    dict
        Column name -> values, suitable for ``pd.DataFrame(...)``:
        "headlines" (the original "Headlines" Series), "compound" (the
        analyzer's compound score plus 1), "positive" / "negative" / "neutral"
        (the analyzer's pos / neg / neu scores), "pos_count" / "neg_count" /
        "neu_count" (tokens found in each word vocabulary) and
        "pos_count_bigram" / "neg_count_bigram" / "neu_count_bigram" (bigrams
        found in each bigram vocabulary).
    """
    # Learn each vocabulary once (every call re-splits and re-tokenizes df1),
    # and keep them as sets so the membership tests below are O(1).
    pos_words, neg_words, neu_words = (set(words) for words in top_100(df1))
    pos_bigrams, neg_bigrams, neu_bigrams = (set(grams) for grams in bigram_top_100(df1))

    features = dict()
    combined_pos_count = list()
    combined_neg_count = list()
    combined_neu_count = list()
    combined_pos_count_bigram = list()
    combined_neg_count_bigram = list()
    combined_neu_count_bigram = list()
    combined_compound_scores = list()
    combined_positive_scores = list()
    combined_negative_scores = list()
    combined_neutral_scores = list()

    # Iterate positionally so the function does not depend on df having a
    # 0..n-1 integer index.
    for sentence in df["Headlines"]:
        tokens = [token.lower() for token in nltk.word_tokenize(sentence)]
        bigrams = generate_bigram(sentence)
        # Score each sentence once and reuse the result for all four fields.
        scores = sia.polarity_scores(sentence)

        # "compound" is shifted by +1 (presumably to keep the feature
        # non-negative for MultinomialNB -- confirm).
        combined_compound_scores.append(scores["compound"] + 1)
        combined_positive_scores.append(scores["pos"])
        combined_negative_scores.append(scores["neg"])
        combined_neutral_scores.append(scores["neu"])
        combined_pos_count.append(sum(token in pos_words for token in tokens))
        combined_neg_count.append(sum(token in neg_words for token in tokens))
        combined_neu_count.append(sum(token in neu_words for token in tokens))
        combined_pos_count_bigram.append(sum(bigram in pos_bigrams for bigram in bigrams))
        combined_neg_count_bigram.append(sum(bigram in neg_bigrams for bigram in bigrams))
        combined_neu_count_bigram.append(sum(bigram in neu_bigrams for bigram in bigrams))

    features["headlines"] = df["Headlines"]
    features["compound"] = combined_compound_scores
    features["positive"] = combined_positive_scores
    features["negative"] = combined_negative_scores
    features["neutral"] = combined_neutral_scores
    features["pos_count"] = combined_pos_count
    features["neg_count"] = combined_neg_count
    features["neu_count"] = combined_neu_count
    features["pos_count_bigram"] = combined_pos_count_bigram
    features["neg_count_bigram"] = combined_neg_count_bigram
    features["neu_count_bigram"] = combined_neu_count_bigram

    return features

In [26]:
# Recreate the split that top_100 / bigram_top_100 perform internally
# (same column slices, same random_state=1), so the vocabularies learned from
# reddit_50 come from exactly these training rows.
X = reddit_50.iloc[:, 1:-1]
y = reddit_50.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Re-attach the labels and renumber rows 0..n-1; extract_features looks up
# headlines by integer position label.
X_y_train = X_train.join(y_train).reset_index()
X_y_test = X_test.join(y_test).reset_index()


# Featurize each half (vocabularies are learned from reddit_50), then append the label column.
X_y_train_extract = pd.DataFrame(extract_features(X_y_train, reddit_50))
X_y_train_extract["Classification"] = X_y_train["Classification"]
X_y_test_extract = pd.DataFrame(extract_features(X_y_test, reddit_50))
X_y_test_extract["Classification"] = X_y_test["Classification"]

# Display the training feature table.
X_y_train_extract

Unnamed: 0,headlines,compound,positive,negative,neutral,pos_count,neg_count,neu_count,pos_count_bigram,neg_count_bigram,neu_count_bigram,Classification
0,"papa Elon spoke and we listened, the prophecy ...",1.4215,0.219,0.0,0.781,7,0,0,10,0,0,pos
1,Tesla's exponential growth visualized. As Elon...,1.7003,0.244,0.0,0.756,16,0,0,20,0,0,pos
2,"Tesla could reach 500,000 deliveries in 2020 t...",1.7351,0.31,0.0,0.69,12,0,0,18,0,0,pos
3,FSD Subscription Start Date/Price Conjecture,1.0,0.0,0.0,1.0,0,0,4,0,0,4,neu
4,Captured a glorious moment today,1.6369,0.583,0.0,0.417,4,0,0,4,0,0,pos
5,'Big Short' investor Michael Burry has already...,0.3631,0.0,0.215,0.785,19,0,0,20,0,0,pos
6,69420,1.0,0.0,0.0,1.0,0,0,1,0,0,0,neu
7,"Margin questions, specifically regarding TSLA",1.0,0.0,0.0,1.0,0,0,4,0,0,4,neu
8,Elon Musk meets with Oracles Larry Ellison in ...,1.0,0.0,0.0,1.0,0,0,9,0,0,10,neu
9,"$TSLA Daily Investor Discussion - December 30,...",1.0,0.0,0.0,1.0,0,0,3,0,0,6,neu


### Multinomial NB 

In [44]:
def _nb_holdout_accuracy(train_frame, test_frame, drop_columns=()):
    """Fit MultinomialNB on train_frame and return its accuracy on test_frame.

    Both frames must have the headline text as their first column and the
    label as their last column; everything in between, minus ``drop_columns``,
    is used as model input.
    """
    train_subset = train_frame.drop(list(drop_columns), axis=1)
    test_subset = test_frame.drop(list(drop_columns), axis=1)

    X_train = train_subset.iloc[:, 1:-1]
    y_train = train_subset.iloc[:, -1]
    X_test = test_subset.iloc[:, 1:-1]
    y_test = test_subset.iloc[:, -1]

    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    correct = sum(1 for predicted, actual in zip(y_pred, np.array(y_test)) if predicted == actual)
    return correct / len(y_pred)


def accuracy_nb (df):
    """Compare three feature sets for Multinomial Naive Bayes and print the results.

    The frame is split with ``train_test_split(..., random_state=1)``; features
    are built with ``extract_features`` (vocabularies learned from ``df``), and
    a ``MultinomialNB`` model is evaluated on the held-out rows for:

    * "Bigram": sentiment scores + bigram counts (word counts dropped),
    * "Vectorization": sentiment scores + word counts (bigram counts dropped),
    * "Combined": all features.

    Prints the best-performing feature set followed by all three accuracies.
    When several feature sets tie for the best accuracy, the one reported is
    chosen in the order Vectorization, Bigram, Combined. Returns ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Headlines" column among columns ``1:-1`` and have the
        label as its last column, named "Classification".
    """
    X = df.iloc[:, 1:-1]
    y = df.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    X_y_train = X_train.join(y_train).reset_index()
    X_y_test = X_test.join(y_test).reset_index()

    # Feature extraction is deterministic, so build the full feature tables
    # once and select column subsets for each variant.
    train_features = pd.DataFrame(extract_features(X_y_train, df))
    train_features["Classification"] = X_y_train["Classification"]
    test_features = pd.DataFrame(extract_features(X_y_test, df))
    test_features["Classification"] = X_y_test["Classification"]

    word_count_columns = ["pos_count", "neg_count", "neu_count"]
    bigram_count_columns = ["pos_count_bigram", "neg_count_bigram", "neu_count_bigram"]

    bi_accuracy = _nb_holdout_accuracy(train_features, test_features, word_count_columns)
    vect_accuracy = _nb_holdout_accuracy(train_features, test_features, bigram_count_columns)
    combine_accuracy = _nb_holdout_accuracy(train_features, test_features)

    # Report whichever variant actually reaches the maximum; a chain of strict
    # ">" comparisons would mislabel a Bigram/Combined tie as a Vectorization win.
    best_accuracy = max(bi_accuracy, vect_accuracy, combine_accuracy)
    if vect_accuracy == best_accuracy:
        print("Vectorization performs the best with accuracy of:", vect_accuracy)
    elif bi_accuracy == best_accuracy:
        print ("Bigram performs the best with accuracy of:", bi_accuracy)
    else:
        print("Combined performs the best with accuracy of:", combine_accuracy)
    print("\n")    
    print("Bigram accuracy: ", bi_accuracy)
    print("Combined accuracy: ", combine_accuracy)
    print("Vectorization accuracy: ", vect_accuracy)

In [45]:
# Compare the bigram / word-count / combined feature sets on the first 50 Reddit rows.
accuracy_nb(reddit_50)

Bigram performs the best with accuracy of: 0.7692307692307693


Bigram accuracy:  0.7692307692307693
Combined accuracy:  0.6923076923076923
Vectorization accuracy:  0.6153846153846154


In [46]:
# Compare the bigram / word-count / combined feature sets on the first 50 Twitter rows.
accuracy_nb(tw_50)

Vectorization performs the best with accuracy of: 0.6923076923076923


Bigram accuracy:  0.5384615384615384
Combined accuracy:  0.6923076923076923
Vectorization accuracy:  0.6923076923076923


In [47]:
# Compare the bigram / word-count / combined feature sets on the first 75 Seeking Alpha news rows.
accuracy_nb(san_75)

Vectorization performs the best with accuracy of: 0.631578947368421


Bigram accuracy:  0.631578947368421
Combined accuracy:  0.5789473684210527
Vectorization accuracy:  0.631578947368421


In [48]:
# Compare the bigram / word-count / combined feature sets on the first 100 Seeking Alpha analyst rows.
accuracy_nb(sa_100)

Vectorization performs the best with accuracy of: 0.64


Bigram accuracy:  0.52
Combined accuracy:  0.6
Vectorization accuracy:  0.64
