In [761]:
import pandas as pd
import numpy as np
import random
import io
import pickle
import twitter
import nltk
from nltk.classify import NaiveBayesClassifier
import string

Load lists of stock advancers/decliners into dataframes

In [86]:
# Read the pipe-delimited NASDAQ symbol listing into a dataframe

nasdaq_list_df = pd.read_csv('./data/nasdaqlisted.txt', delimiter='|')

nasdaq_list_df.head()

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
0,AACG,ATA Creativity Global - American Depositary Sh...,G,N,N,100.0,N,N
1,AAL,"American Airlines Group, Inc. - Common Stock",Q,N,N,100.0,N,N
2,AAME,Atlantic American Corporation - Common Stock,G,N,N,100.0,N,N
3,AAOI,"Applied Optoelectronics, Inc. - Common Stock",G,N,N,100.0,N,N
4,AAON,"AAON, Inc. - Common Stock",Q,N,N,100.0,N,N


In [887]:
# Drop ETFs: keep only the rows whose ETF flag is 'N'

nasdaq_list_df = nasdaq_list_df.loc[nasdaq_list_df['ETF'].eq('N')]

# Show any remaining rows that contain at least one missing value
nasdaq_list_df.loc[nasdaq_list_df.isna().any(axis=1)]

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares


In [888]:
# Preview the first rows of the listing as it stands after the ETF filter.
nasdaq_list_df.head()

Unnamed: 0,Symbol,Security Name,Market Category,Test Issue,Financial Status,Round Lot Size,ETF,NextShares
1,AAL,"American Airlines Group, Inc. - Common Stock",Q,N,N,100.0,N,N
2,AAME,Atlantic American Corporation - Common Stock,G,N,N,100.0,N,N
3,AAOI,"Applied Optoelectronics, Inc. - Common Stock",G,N,N,100.0,N,N
4,AAON,"AAON, Inc. - Common Stock",Q,N,N,100.0,N,N
5,AAPL,Apple Inc. - Common Stock,Q,N,N,100.0,N,N


In [476]:
# Restrict the listing to securities whose name mentions 'Common Stock'
nasdaq_list_df = nasdaq_list_df.loc[
    nasdaq_list_df['Security Name'].str.contains('Common Stock')
]

# Single-column frame of tickers, used as input for downloading stock closing prices

ticker_list_df = nasdaq_list_df['Symbol'].to_frame()
print(ticker_list_df.head())

# Write the tickers to csv, one per row, without index or header

ticker_list_df.to_csv('ticker_list.csv', header=False, index=False)


  Symbol
1    AAL
2   AAME
3   AAOI
4   AAON
5   AAPL


In [889]:
# Read the stock quotes file into a dataframe.
# NOTE(review): presumably the closing prices downloaded for the tickers in
# ticker_list.csv — confirm how ./data/quotes.csv is produced.
quotes_df = pd.read_csv('./data/quotes.csv')

In [890]:
# Pivot the quotes so each row is a ticker and each column is a date:
# index by 'Date', transpose, then turn the ticker index back into a
# regular column named 'Symbol'.
quotes_df = (
    quotes_df
    .set_index('Date')
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Symbol'})
)
quotes_df.head()

Date,Symbol,2019-11-15,2019-11-18,2019-11-19,2019-11-20,2019-11-21,2019-11-22
0,AAL,28.860001,28.6,29.290001,28.23,27.93,28.68
1,AAME,1.75,1.57,1.72,1.91,2.0,1.95
2,AAOI,10.865,10.54,10.25,10.28,10.2,10.24
3,AAON,50.34,50.25,50.689999,49.540001,48.369999,48.389999
4,AAPL,265.76001,267.100006,266.290009,263.190002,262.01001,261.779999


In [891]:
# Keep only stocks whose 2019-11-15 price is above $25

quotes_df = quotes_df.loc[quotes_df['2019-11-15'].gt(25)]

# Count the columns that contain at least one missing value

quotes_df.isna().any().sum()

0

In [892]:
# Calculate percentage change in price from 2019-11-15 to 2019-11-22.
# quotes_df may be a filtered slice of an earlier frame, so build a new frame
# with .assign() instead of writing a column into it in place; this avoids
# pandas' SettingWithCopyWarning and leaves the input frame unmodified.

quotes_df = quotes_df.assign(**{
    'percent change': (
        (quotes_df['2019-11-22'] - quotes_df['2019-11-15'])
        / quotes_df['2019-11-15'] * 100
    )
})

In [893]:
# Attach each ticker's security name; the inner join keeps only tickers
# present in both frames.
quotes_df = quotes_df.merge(
    nasdaq_list_df[['Symbol', 'Security Name']],
    how='inner',
    on='Symbol',
)

# Shorten the security name: keep the text before the first ' -', then the
# text before the first ',' of that.
quotes_df['Security Name'] = quotes_df['Security Name'].map(
    lambda name: name.partition(' -')[0].partition(',')[0]
)

quotes_df.head()

Unnamed: 0,Symbol,2019-11-15,2019-11-18,2019-11-19,2019-11-20,2019-11-21,2019-11-22,percent change,Security Name
0,AAL,28.860001,28.6,29.290001,28.23,27.93,28.68,-0.623704,American Airlines Group
1,AAON,50.34,50.25,50.689999,49.540001,48.369999,48.389999,-3.873661,AAON
2,AAPL,265.76001,267.100006,266.290009,263.190002,262.01001,261.779999,-1.497596,Apple Inc.
3,AAWW,25.25,25.049999,25.299999,24.389999,24.41,25.01,-0.950495,Atlas Air Worldwide Holdings
4,AAXN,67.370003,67.720001,68.480003,71.980003,71.900002,72.089996,7.006075,Axon Enterprise


In [978]:
# Number of stocks to take from each end of the percent-change ranking.
TOP_N = 30

# Sort once per direction and slice the rows, instead of re-sorting the whole
# frame for every column that is pulled out.
decliners = quotes_df.sort_values('percent change').head(TOP_N)
advancers = quotes_df.sort_values('percent change', ascending=False).head(TOP_N)

# Tickers and company names of the largest decliners / advancers.
negative_tickers = decliners['Symbol']
negative_companies = decliners['Security Name']
positive_tickers = advancers['Symbol']
positive_companies = advancers['Security Name']

type(negative_tickers)

pandas.core.series.Series

In [895]:
# Load the Twitter API credentials from a local pickle file and build an
# authenticated API client.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a credentials file that you created yourself.
# The `with` block guarantees the file handle is closed after loading.
with open('secret_twitter_credentials.pkl', 'rb') as credentials_file:
    Twitter = pickle.load(credentials_file)

auth = twitter.oauth.OAuth(Twitter['Access Token'],
                           Twitter['Access Token Secret'],
                           Twitter['Consumer Key'],
                           Twitter['Consumer Secret'])

twitter_api = twitter.Twitter(auth=auth)

In [934]:
# Collect unique tweet texts for the top-advancing stocks.
positive_text = []
# Tweets requested per query.
# NOTE(review): the search endpoint may cap `count` below this value — confirm.
number = 500
# Companion set for O(1) duplicate checks; positive_text keeps first-seen order.
seen_positive_text = set()

# Pass 1: search by ticker and keep tweets tagged with exactly that one symbol.
for stock in positive_tickers:
    search_results = twitter_api.search.tweets(q=stock, count=number)

    for s in search_results['statuses']:
        symbols = s['entities']['symbols']
        if len(symbols) != 1 or symbols[0]['text'] != stock:
            continue
        if s['text'] not in seen_positive_text:
            seen_positive_text.add(s['text'])
            positive_text.append(s['text'])

# Pass 2: search by company name and keep English-language tweets.
for company in positive_companies:
    search_results = twitter_api.search.tweets(q=company, count=number)

    for s in search_results['statuses']:
        if s['lang'] == 'en' and s['text'] not in seen_positive_text:
            seen_positive_text.add(s['text'])
            positive_text.append(s['text'])


In [936]:
# Collect unique tweet texts for the top-declining stocks.
negative_text = []
# Tweets requested per query.
# NOTE(review): the search endpoint may cap `count` below this value — confirm.
number = 500
# Companion set for O(1) duplicate checks; negative_text keeps first-seen order.
seen_negative_text = set()

# Pass 1: search by ticker and keep tweets tagged with exactly that one symbol.
for stock in negative_tickers:
    search_results = twitter_api.search.tweets(q=stock, count=number)

    for s in search_results['statuses']:
        symbols = s['entities']['symbols']
        if len(symbols) != 1 or symbols[0]['text'] != stock:
            continue
        if s['text'] not in seen_negative_text:
            seen_negative_text.add(s['text'])
            negative_text.append(s['text'])

# Pass 2: search by company name and keep English-language tweets.
for company in negative_companies:
    search_results = twitter_api.search.tweets(q=company, count=number)

    for s in search_results['statuses']:
        if s['lang'] == 'en' and s['text'] not in seen_negative_text:
            seen_negative_text.add(s['text'])
            negative_text.append(s['text'])

In [955]:
# with open('positive_text.txt', 'w', encoding="utf-8") as filehandle:
#     for listitem in positive_text:
#         filehandle.write(listitem)
        
# negative_text = []

# # open file and read the content in a list
# with open('negative_text.txt', 'r', encoding="utf-8") as filehandle:
#     for line in filehandle:
#         # remove linebreak which is the last character of the string
#         currentPlace = line[:-1]
#         # add item to the list
#         negative_text.append(currentPlace)
        

In [921]:
# Download the NLTK resources this notebook relies on. "stopwords" is read
# via nltk.corpus.stopwords further down; "punkt" is presumably what
# nltk.word_tokenize needs — confirm against the installed NLTK version.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qcsbo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qcsbo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [940]:
# Tokens excluded from the features: English stopwords, punctuation
# characters, and the tokens 'https' / 'http'.
useless_words = nltk.corpus.stopwords.words("english") + list(string.punctuation)
useless_words.extend(['https', 'http'])
# Frozen copy for O(1) membership tests inside the feature extractor.
useless_word_set = frozenset(useless_words)


def bag_of_words_features(text):
    """Build a bag-of-words feature dict from an iterable of tokens.

    Args:
        text: iterable of token strings, e.g. the result of
            nltk.word_tokenize on one tweet.

    Returns:
        dict mapping every kept token (original casing) to 1. A token is
        dropped when its lower-cased form is one of the useless words or
        when it contains '/'. The stopword list is lower-case, so the
        comparison is done on the lower-cased token to also filter
        capitalised stopwords such as 'The'.
    """
    return {
        word: 1
        for word in text
        if word.lower() not in useless_word_set and '/' not in word
    }


# One (features, label) pair per collected tweet text.
positive_features = [
    (bag_of_words_features(nltk.word_tokenize(text)), 'pos')
    for text in positive_text
]

negative_features = [
    (bag_of_words_features(nltk.word_tokenize(text)), 'neg')
    for text in negative_text
]

print(f'Number of positive features: {len(positive_features)}')
print(f'Number of negative features: {len(negative_features)}')

Number of positive features: 1486
Number of negative features: 1300


In [974]:
# Shuffle each class in place so the train / held-out slices are random.
# NOTE: no random seed is set, so the split differs from run to run.
random.shuffle(positive_features)
random.shuffle(negative_features)

# Train on the first 80% of the smaller class, taken from each class.
# The size is derived from the data rather than a hard-coded example count,
# so it stays correct when the number of collected tweets changes.
split = int(min(len(positive_features), len(negative_features)) * .8)
sentiment_classifier = NaiveBayesClassifier.train(positive_features[:split]+negative_features[:split])

In [975]:
# Accuracy (in percent) on the first `split` examples of each class — the
# same slices the classifier was trained on.
nltk.classify.util.accuracy(sentiment_classifier, positive_features[:split]+negative_features[:split])*100

98.9423076923077

In [976]:
# Accuracy (in percent) on the remaining examples of each class, i.e. those
# after the first `split` that were not used for training. The two classes
# contribute different numbers of examples here when their sizes differ.
nltk.classify.util.accuracy(sentiment_classifier, positive_features[split:]+negative_features[split:])*100

94.19263456090651

In [977]:
# Print the features the trained classifier finds most informative, with
# their pos/neg likelihood ratios.
sentiment_classifier.show_most_informative_features()

Most Informative Features
         Pharmaceuticals = 1                 pos : neg    =     43.7 : 1.0
                 changed = 1                 pos : neg    =     33.7 : 1.0
               Materials = 1                 neg : pos    =     32.3 : 1.0
                    UBSI = 1                 neg : pos    =     21.7 : 1.0
                 Applied = 1                 neg : pos    =     19.4 : 1.0
                    Care = 1                 pos : neg    =     18.3 : 1.0
                     Key = 1                 pos : neg    =     17.0 : 1.0
                  market = 1                 pos : neg    =     15.9 : 1.0
                  Raised = 1                 pos : neg    =     11.0 : 1.0
              Industries = 1                 neg : pos    =     11.0 : 1.0
