In [1]:
import datetime as dt
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import sklearn
import ta
import tweepy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from ta.utils import dropna
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')

# Stock Selection

In [2]:
# Tickers advertised to the user; mainDf() later opens '<ticker>Tweets.xlsx',
# so the value typed here ends up inside a file name.
SUPPORTED_TICKERS = ('aapl', 'adbe', 'amzn', 'ctsh', 'fb', 'intc', 'msft', 'mu')
print('You can use: ' + ','.join(SUPPORTED_TICKERS))

# Strip stray whitespace so 'msft ' does not turn into a broken file name.
ticker = input('Ticker: ').strip()
if ticker.lower() not in SUPPORTED_TICKERS:
    # Fail here with a readable message instead of a later file-not-found error.
    raise ValueError(
        f'Unsupported ticker {ticker!r}; choose one of: ' + ','.join(SUPPORTED_TICKERS)
    )

You can use: aapl,adbe,amzn,ctsh,fb,intc,msft,mu
Ticker: msft


# Sentiment Analysis

In [3]:
def sentimentScore(Tweet):
    '''Score every text in ``Tweet`` with a VADER ``SentimentIntensityAnalyzer``.

    Parameters
    ----------
    Tweet : iterable
        Texts to score; each element is passed to ``polarity_scores``.

    Returns
    -------
    list
        One ``polarity_scores`` result per input element, in input order.
        Callers in this notebook read the 'neg', 'neu', 'pos' and 'compound'
        entries of each result.
    '''
    scorer = SentimentIntensityAnalyzer()
    return [scorer.polarity_scores(text) for text in Tweet]

# Stock Price and Technical Indicator Data

In [4]:
def dfStock(ticker,start,end):
    '''Download daily prices and derive indicator features plus an up/down label.

    Parameters
    ----------
    ticker : str
        Symbol to fetch; upper-cased before the request.
    start, end
        Date bounds forwarded unchanged to ``web.DataReader``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date', holding the five selected indicator columns, the
        price/volume columns, 'forecast', 'Daily Return' and the 0/1 'target'.
        Rows containing any NaN are removed.
    '''
    prices = web.DataReader(ticker.upper(), 'yahoo', start, end)
    enriched = ta.add_all_ta_features(
        prices, "Open", "High", "Low", "Close", "Volume", fillna=True
    ).reset_index()

    wanted = ['Date', 'momentum_stoch', 'momentum_rsi', 'momentum_kama',
              'trend_ema_fast', 'volume_vpt', 'Open', 'High', 'Low', 'Close',
              'Volume', 'Adj Close']
    # Work on an explicit copy so the column assignments below act on this frame.
    data = enriched[wanted].copy()

    # 'forecast' on a row is the adjusted close of the row before it.
    # NOTE(review): with shift(1) the return (and therefore 'target') on a row
    # describes the move that ended on the previous row, not the move that
    # follows it - confirm this is the intended label alignment.
    data['forecast'] = data['Adj Close'].shift(1)
    data['Daily Return'] = data['forecast'].pct_change() * 100

    # 1 where the return is strictly positive, 0 otherwise (NaN compares False).
    data['target'] = (data['Daily Return'] > 0).astype('int64')

    return data.dropna().set_index('Date')

In [5]:
def mainDf(ticker, start='2016-03-31', end='2016-06-15'):
    '''Build the modelling table: daily tweet sentiment joined to stock indicators.

    ticker inputs: aapl,adbe,amzn,ctsh,fb,intc,msft,mu

    Parameters
    ----------
    ticker : str
        Symbol; ``<ticker lower-cased>Tweets.xlsx`` is read for the tweets and
        the upper-cased symbol is passed to ``dfStock``.
    start, end : optional
        Price window forwarded to ``dfStock``. The defaults are the window the
        notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'negative', 'positive', 'compound', 'neutral',
        'influenced', the five indicator columns from ``dfStock`` and 'target'.
        Later cells address these columns by position, so the order matters.
    '''
    tweets_raw = pd.read_excel(ticker.lower() + 'Tweets.xlsx')

    # One row of sentiment scores per tweet, aligned to tweets_raw by position.
    scores = pd.DataFrame(sentimentScore(tweets_raw['Tweet content']))
    scored = pd.merge(tweets_raw, scores, left_index=True, right_index=True)
    scored = scored[['Date', 'Followers', 'Tweet content', 'neg', 'neu', 'pos', 'compound']].copy()

    # Weight each tweet's compound score by the author's follower count.
    scored['influenced'] = scored['compound'] * scored['Followers']

    # Single pass over the groups instead of one groupby per column.
    daily = (
        scored.groupby('Date')[['neg', 'pos', 'compound', 'neu', 'influenced']]
        .mean()
        .rename(columns={'neg': 'negative', 'pos': 'positive', 'neu': 'neutral'})
    )
    # Pin the column order explicitly; downstream cells index columns by position.
    daily = daily[['negative', 'positive', 'compound', 'neutral', 'influenced']]

    # Inner join on the date index.
    # NOTE(review): assumes the sheet's 'Date' values are comparable to the
    # 'Date' index returned by dfStock - confirm.
    main_df = pd.merge(daily, dfStock(ticker.upper(), start, end),
                       left_index=True, right_index=True)
    main_df = main_df.dropna().reset_index(drop=True)

    # Keep only the model inputs and the label.
    return main_df.drop(
        columns=['Daily Return', 'Adj Close', 'forecast', 'Open', 'Close', 'High', 'Low', 'Volume']
    )

# Readying Data For Sklearn

In [6]:
# Hold out 20% of the rows as the test set, then 20% of the remainder for validation.
# NOTE(review): no shuffle argument is passed, so train_test_split presumably shuffles
# these daily rows before splitting - confirm that is intended.
train, test = train_test_split(mainDf(ticker.upper()), test_size=0.2, random_state=222)
train, val = train_test_split(train, test_size=0.2, random_state=222)

# pop() returns the label column and removes it from the feature frame in one step.
trainy = train.pop('target')
valy = val.pop('target')
testy = test.pop('target')

In [7]:
# Support Vector Machine classifier with default settings
from sklearn import svm
model1 = svm.SVC().fit(train, trainy)
# Last expression: the cell displays the score on the validation split.
model1.score(val, valy)

0.2222222222222222

In [8]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the data splits) so its scores, and the model
# chosen later from those scores, are reproducible from run to run.
model2 = RandomForestClassifier(random_state=222)
model2.fit(train, trainy)
# Last expression: the cell displays the score on the validation split.
model2.score(val, valy)

0.2222222222222222

In [9]:
# Logistic Regression classifier with default settings
from sklearn.linear_model import LogisticRegression
model3 = LogisticRegression().fit(train, trainy)
# Last expression: the cell displays the score on the validation split.
model3.score(val, valy)

0.4444444444444444

In [10]:
#scores of each model (generally: higher the more accurate)
# Score each fitted model once on the held-out test split and reuse the stored
# values for display; the prediction cell reads mod1prob/mod2prob/mod3prob.
# NOTE(review): these test-split scores are later used to pick the model, so
# they double as a selection criterion - confirm that is acceptable.
mod1prob = model1.score(test, testy)
mod2prob = model2.score(test, testy)
mod3prob = model3.score(test, testy)
print(mod1prob)
print(mod2prob)
print(mod3prob)

0.5454545454545454
0.9090909090909091
0.9090909090909091


# Data Needed For Prediction

In [11]:
# Indicator part of the prediction features: the five technical-indicator columns
# from dfStock over a window of 365*15 days ending now. A later cell takes the
# last row of each column.
pred_end = dt.datetime.now()
pred_start = pred_end - dt.timedelta(days=365*15)
indicator_columns = ['momentum_stoch', 'momentum_rsi', 'momentum_kama', 'trend_ema_fast', 'volume_vpt']
pred = dfStock(ticker.upper(), pred_start, pred_end)[indicator_columns]

# Real Time Twitter Sentiment Data For Stock Cashtag

In [12]:
#these are unique to your created application on twitter developer
# Read the secrets from environment variables so they are never stored in the
# notebook file or its outputs. A missing variable raises KeyError here, which
# names the variable that still needs to be set.
api_key = os.environ['TWITTER_API_KEY']
api_secret = os.environ['TWITTER_API_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret = os.environ['TWITTER_ACCESS_SECRET']

In [13]:
# Build the OAuth handler from the application key/secret defined in the previous cell.
auth = tweepy.OAuthHandler( api_key,api_secret)
# Attach the user access token and secret to the handler.
auth.set_access_token(access_token,access_secret)
# Module-level client that tweets() uses for its search requests.
# wait_on_rate_limit=True presumably makes tweepy wait rather than fail when the
# rate limit is hit - confirm against the installed tweepy version.
api = tweepy.API(auth,wait_on_rate_limit = True)

In [14]:
def tweets(query):
    '''Fetch up to 1000 tweets matching ``query`` through the module-level ``api``.

    Parameters
    ----------
    query : str
        Search string (the notebook passes a cashtag such as '$MSFT').

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Followers' in descending order, with columns 'Tweet'
        (full text with non-ASCII characters and control characters escaped)
        and 'Date' (the tweet's ``created_at``). When the search returns
        nothing, the frame is empty but still has those columns.
    '''
    # tweepy 4 renamed API.search to API.search_tweets; use whichever this
    # installation provides.
    search = getattr(api, 'search_tweets', None) or api.search

    data = []
    # 'count' is the v1.1 page-size parameter (max 100); the old 'rpp' name is
    # not part of that API, so pages fell back to the small default size.
    for tweet in tweepy.Cursor(search, q=query, count=100, tweet_mode="extended").items(1000):
        data.append({
            # unicode-escape turns emoji, other non-ASCII characters and newlines
            # into plain ASCII escape sequences.
            'Tweet': tweet.full_text.encode('unicode-escape').decode('utf-8'),
            'Followers': tweet.user.followers_count,
            'Date': tweet.created_at,
        })

    # Naming the columns keeps set_index() working when no tweets came back.
    df = pd.DataFrame(data, columns=['Tweet', 'Followers', 'Date'])
    return df.set_index('Followers').sort_index(ascending=False)

In [15]:
# Search the ticker's cashtag and move 'Followers' from the index back to a column.
df = tweets(f'${ticker.upper()}').reset_index()
# Tweet texts to be scored in the next cell.
Tweet = df['Tweet']

In [16]:
# Score the fetched tweets and line the scores up with the tweet rows by position.
df_results = pd.DataFrame(sentimentScore(Tweet))
df_tweets = df.merge(df_results, left_index=True, right_index=True)
df_tweets = df_tweets[['Date', 'Followers', 'Tweet', 'neg', 'neu', 'pos', 'compound']]
# Compound score weighted by the author's follower count.
df_tweets = df_tweets.assign(influenced=df_tweets['compound'] * df_tweets['Followers'])

# Average every sentiment column over all fetched tweets.
sentiment_means = {
    column: df_tweets[column].mean()
    for column in ('neg', 'neu', 'pos', 'compound', 'influenced')
}
negative = sentiment_means['neg']
neutral = sentiment_means['neu']
positive = sentiment_means['pos']
compound = sentiment_means['compound']
influenced = sentiment_means['influenced']

# Rebind df to a one-row frame with the same sentiment column names that mainDf produces.
df = pd.DataFrame({
    'negative': [negative],
    'positive': [positive],
    'compound': [compound],
    'neutral': [neutral],
    'influenced': [influenced],
})

In [17]:
# Assemble the ten model inputs in the training column order (val.columns):
# positions 0-4 are read from the live sentiment frame `df`,
# positions 5-9 from the indicator frame `pred`; the last row of each is used.
p1, p2, p3, p4, p5 = [df[val.columns[i]].iloc[-1] for i in range(5)]
p6, p7, p8, p9, p10 = [pred[val.columns[i]].iloc[-1] for i in range(5, 10)]

# Prediction

In [18]:
# Predict with the model that has the highest test-split score.
# The previous if/elif chain only matched a *strictly* best model1 or model2,
# so a tie between model1 and model2 for first place fell through to model3
# even when model3 scored lowest. Taking the maximum fixes that.
features = [[p1, p2, p3, p4, p5, p6, p7, p8, p9, p10]]
candidates = [(mod1prob, model1), (mod2prob, model2), (mod3prob, model3)]

# max() keeps the first maximum it sees; scanning in reverse makes ties resolve
# to the later model, which matches the old behaviour whenever model3 shares the top score.
best_score, best_model = max(reversed(candidates), key=lambda pair: pair[0])

prediction = best_model.predict(features)

# The percentage shown is the chosen model's score on the test split, not a
# per-prediction probability. round() on the percentage avoids float artefacts
# and works for both numpy and built-in floats.
confidence = round(float(best_score) * 100, 2)

if prediction[0] == 1:
    print(f'{ticker.upper()} => Buy: There is a {confidence}% confidence of having a positive daily return.')
else:
    print(f'{ticker.upper()} => Sell: There is a {confidence}% confidence of having a negative daily return.')

MSFT => Buy: There is a 90.91% confidence of having a positive daily return.


In [None]:
#model score might be too high due to the small amount of training and testing data
#better results would be expected if a dataset with more than ~60 days of twitter data became available