In [None]:
!pip install nltk==3.3
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import numpy as np
import re, string, random
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
# Fetch every NLTK resource used by the cells below: stop words, WordNet (lemmatizer),
# the perceptron POS tagger, the punkt tokenizer models and the sample tweet corpus.
for nltk_resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt', 'twitter_samples'):
    nltk.download(nltk_resource)
# Mount Google Drive at /content/drive; stock_analysis reads its CSV inputs from
# '/content/drive/My Drive/...', so this must run before that function is called.
from google.colab import drive
drive.mount('/content/drive')
import matplotlib.pyplot as plt
import math

Collecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 2.7MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-cp36-none-any.whl size=1394471 sha256=905e806c39d728ed13c43dff802bcd7354cf174f873ab0ba817c5f0b4fab47c2
  Stored in directory: /root/.cache/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.3
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/word

In [None]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Normalize one tokenized tweet into a list of lowercase, lemmatized tokens.

    Each token is POS-tagged, stripped of URLs and @mentions, lemmatized with a
    WordNet part of speech derived from its tag, and kept only if something is
    left that is neither punctuation nor a stop word.

    Args:
        tweet_tokens: sequence of string tokens for a single tweet.
        stop_words: iterable of lowercase words to discard (default: none).

    Returns:
        list of cleaned, lowercased tokens in their original order.
    """
    # Raw strings keep the regex backslashes literal (the pattern is unchanged).
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # Loop-invariant work is done once per tweet instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = set(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, "", token)

        # Map the tagger's tag onto the WordNet POS codes the lemmatizer expects;
        # anything that is not a noun or a verb is lemmatized as an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so it drops single
        # punctuation marks (and adjacent runs of them) but keeps e.g. ':)'.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    Args:
        cleaned_tokens_list: iterable of iterables of tokens (one per tweet).

    Yields:
        each token, tweet by tweet.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a {token: True} feature dictionary.

    Args:
        cleaned_tokens_list: iterable of token lists (one per tweet).

    Yields:
        one dict per tweet mapping every token in it to True.
    """
    for cleaned_tokens in cleaned_tokens_list:
        yield {word: True for word in cleaned_tokens}

if __name__ == "__main__":
    # Trains a Naive Bayes sentiment classifier on the NLTK sample tweets.
    # Every name assigned here becomes a notebook global; `classifier` in
    # particular is read later by stock_analysis.

    # Loaded from the twitter_samples corpus. These four names are not used
    # again in this cell; they are kept because other cells may rely on them.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tweet (URL/mention removal, lemmatizing, stop-word filtering).
    positive_cleaned_tokens_list = [remove_noise(tokens, stop_words)
                                    for tokens in positive_tweet_tokens]
    negative_cleaned_tokens_list = [remove_noise(tokens, stop_words)
                                    for tokens in negative_tweet_tokens]

    # Quick sanity check: the ten most frequent tokens in the positive tweets.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Label convention: 1 = positive tweet, 0 = negative tweet.
    positive_dataset = [(tweet_dict, 1)
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, 0)
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # NOTE(review): the shuffle is unseeded, so the split (and the reported
    # accuracy) changes between runs; seed `random` for reproducibility.
    random.shuffle(dataset)

    # First 7000 shuffled examples train the model, the remainder test it.
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    # show_most_informative_features prints its table itself and returns None,
    # so it must not be wrapped in print() (that emitted a stray "None").
    classifier.show_most_informative_features(10)

    

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.997
Most Informative Features
                      :) = True                1 : 0      =    999.1 : 1.0
                     sad = True                0 : 1      =     24.4 : 1.0
                     bam = True                1 : 0      =     21.9 : 1.0
                  arrive = True                1 : 0      =     20.9 : 1.0
                     x15 = True                0 : 1      =     20.1 : 1.0
                    sick = True                0 : 1      =     19.4 : 1.0
                    damn = True                0 : 1      =     15.5 : 1.0
                 welcome = True                1 : 0      =     12.0 : 1.0
                    glad = True                1 : 0      =     11.9 : 1.0
              appreciate = True                1 : 0      =     11.8 : 1.0
None


In [None]:
def stock_analysis(stock_tag, max_input_days, graph=False, pred=[]):
  """Relate daily tweet sentiment and stock data to the following open price.

  Reads '<stock_tag>.csv' from the 'Stocks Data' and 'Tweets Data' folders on
  the mounted Drive, scores every tweet with the global `classifier`
  (trained in an earlier cell), averages the scores per day and fits linear
  regressions that predict the "Open" price from the previous N days.

  Args:
    stock_tag: ticker symbol; used as the CSV file name and plot title.
    max_input_days: largest number of consecutive past days used as input.
    graph: when True, plot actual vs. predicted open price and the sentiment.
    pred: optional 2-D array-like of feature rows to predict from. Each row
      must hold `max_input_days` days of the feature columns, flattened in
      order. When empty (the default, which is never mutated) the function
      prints evaluation metrics for 1..max_input_days days of input instead.

  Returns:
    None; all results are printed and/or plotted.
  """
  # Imported here because `import sklearn` alone does not guarantee that the
  # `sklearn.linear_model` submodule is loaded as an attribute.
  from sklearn.linear_model import LinearRegression

  sentiment_col = "Sentiment"
  num_days_list = list(range(1, max_input_days+1))
  print(stock_tag + ":")

  #read csv files
  stocks_df = pd.read_csv('/content/drive/My Drive/Stocks Data/' + stock_tag + '.csv')
  tweets_df = pd.read_csv('/content/drive/My Drive/Tweets Data/' + stock_tag + '.csv', dtype="string")

  #keep only the tweet rows for dates present in the stock data (trading days).
  #A left join keeps one row per stock date in stock order, so the sentiment
  #stays aligned with stocks_df even if a trading day has no tweets
  #(assumes at most one tweets row per date -- TODO confirm).
  tweets_dropped = pd.merge(stocks_df["Date"], tweets_df, on="Date", how="left").drop(columns=["Date"])

  #process tweets into sentiment values for a given day
  def classify_tweet(tweet_text):
    #missing cells are passed through so they are ignored by the daily mean
    if pd.isnull(tweet_text):
      return tweet_text
    cleaned = remove_noise(word_tokenize(tweet_text))
    return classifier.classify(dict([token, True] for token in cleaned))

  tweets_classified = tweets_dropped.applymap(classify_tweet)
  sentiment = tweets_classified.mean(axis=1).rename(sentiment_col)

  #drop unnecessary stock price data and get preliminary features df and labels df
  prelim = pd.concat([stocks_df.drop(columns=["Date", "High", "Low", "Close"]), sentiment], axis=1).dropna()
  #the last day has no following day to predict, so it is never an input
  features_prelim = prelim.iloc[:-1]
  labels_prelim = prelim["Open"]

  #splits data and appends to give multiple days of input data:
  #row i of the result is rows i..i+stagger-1 of input_list, flattened
  def split_data(input_list, stagger):
    value_list_final = []
    for i in range(len(input_list) - stagger + 1):
      value_list = []
      for day_values in input_list[i:i+stagger]:
        value_list.extend(day_values)
      value_list_final.append(value_list)
    return value_list_final

  #trains model on a 90/10 split and returns either the test-set RMSE or,
  #with delta_percent=True, the fraction of days whose up/down move is right
  def model(features, labels, delta_percent=False):
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=78)
    reg = LinearRegression().fit(X_train, y_train)
    if not delta_percent:
      #root-mean-square error on the held-out rows (reg.score would be R^2)
      residuals = reg.predict(X_test) - np.asarray(y_test, dtype=float)
      return float(np.sqrt(np.mean(residuals ** 2)))
    actual = np.asarray(labels, dtype=float)
    predicted = np.asarray(reg.predict(features), dtype=float)
    #direction is measured against the previous open; the first label has no
    #previous value inside `labels`, so it is skipped
    previous_open = actual[:-1]
    if len(previous_open) == 0:
      return float("nan")
    predicted_direction = np.sign(predicted[1:] - previous_open)
    actual_direction = np.sign(actual[1:] - previous_open)
    return float(np.mean(predicted_direction == actual_direction))

  #fits on all available rows and predicts for the supplied feature rows
  def model_pred(features, labels, pred):
    reg = LinearRegression().fit(features, labels)
    return reg.predict(pred)

  #remembered for the optional graph below
  labels = None
  features_stock_and_sentiment = None

  #prints results
  if len(pred)==0:
    for i in num_days_list:
      print(str(i) + " days of input data:")
      #label k is the open price that follows the k-th window of i days
      labels = labels_prelim.iloc[i:]

      #taken from the cleaned frame so rows always line up with the labels
      features_sentiment = np.array(split_data(features_prelim[[sentiment_col]].to_numpy(dtype=float), i))
      print("RMSE with sentiment only: " + str(model(features_sentiment, labels)))

      features_stock = np.array(split_data(features_prelim.drop(columns=sentiment_col).to_numpy(dtype=float), i))
      print("RMSE with stock only: " + str(model(features_stock, labels)))

      features_stock_and_sentiment = np.array(split_data(features_prelim.to_numpy(dtype=float), i))
      print("RMSE with stock and sentiment: " + str(model(features_stock_and_sentiment, labels)))

      print("Predicts whether it goes up or down with " + str(model(features_stock_and_sentiment, labels, True)*100) + "% accuracy\n")

    print("\n\n\n\n")

  else:
    labels = labels_prelim.iloc[max(num_days_list):]
    features_stock_and_sentiment = np.array(split_data(features_prelim.to_numpy(dtype=float), max(num_days_list)))
    print("RMSE with stock and sentiment: " + str(model(features_stock_and_sentiment, labels)))
    print("Next days open price" + str(model_pred(features_stock_and_sentiment, labels, pred)))


  #graphs sentiment and stock price and predicted stock price
  if graph:
    if features_stock_and_sentiment is not None:
      #the regressions above are local to the helpers, so fit one here on all
      #rows of the largest window size and plot its in-sample predictions
      reg = LinearRegression().fit(features_stock_and_sentiment, labels)
      plot_df = pd.DataFrame({
        "Stock Price": np.asarray(labels, dtype=float),
        "Predicted Price": reg.predict(features_stock_and_sentiment),
      })
      _, price_ax = plt.subplots(figsize=(15,5))
      plot_df.plot(ax=price_ax)
      price_ax.set_title(stock_tag)
      price_ax.set_ylabel("$ USD")

    _, sentiment_ax = plt.subplots(figsize=(15,5))
    sentiment.plot(ax=sentiment_ax)
    sentiment_ax.set_title("Sentiment")
    sentiment_ax.set_ylabel("Arbitrary Units")
    sentiment_ax.set_xticks([])




In [None]:
# Input for the prediction: TSLA data for July 20, 21 and 22, one line per day in the
# order Open, Adj Close, Volume, Sentiment, flattened into a single feature row.
past_stock_data = [[
    1519.01, 1643.00, 17121400, 0.57,
    1639.93, 1568.36, 16107200, 0.47,
    1599.00, 1592.33, 14161080, 0.52,
]]
# Predict the next open price from those three days of data
stock_analysis("TSLA", 3, pred=past_stock_data)