In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

I have always been curious about how tweets can affect a company's share price. If you have the same question in mind, I hope to address it in this notebook.

*P.S. I am still fairly new to this, and I would welcome any feedback on how to improve my skills or my approach to this question.*

**Importing the essential libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats as stat

In [None]:
# Read the three dataset CSVs in one pass; each path is handed to pd.read_csv
# unchanged and the resulting frames are unpacked in the same order.
cpy, cpy_tweet, tweet = map(pd.read_csv, (
    '../input/tweets-about-the-top-companies-from-2015-to-2020/Company.csv',
    '../input/tweets-about-the-top-companies-from-2015-to-2020/Company_Tweet.csv',
    '../input/tweets-about-the-top-companies-from-2015-to-2020/Tweet.csv',
))

**Inspecting the dataframes**

In [None]:
cpy

In [None]:
cpy_tweet

In [None]:
tweet

After inspecting the dataframes, I decided to merge the tweet dataframe with the cpy_tweet dataframe, which allows me to identify the company that each tweet is about.

In [None]:
# Attach the ticker symbol to every tweet: inner join on the shared tweet_id key.
tweets = tweet.merge(cpy_tweet, on='tweet_id', how='inner')
tweets.head()

In [None]:
# post_date is interpreted as seconds since the Unix epoch (unit='s').
tweets = tweets.assign(post_date=pd.to_datetime(tweets['post_date'], unit='s'))

In [None]:
# Calendar day of each tweet as a midnight timestamp. `.dt.normalize()` is the
# vectorised equivalent of calling `.date()` on every row and re-parsing the
# result, and avoids a Python-level loop over every tweet.
tweets['date'] = tweets['post_date'].dt.normalize()

In [None]:
tweets['date'].head()

In [None]:
tweets.info()

In [None]:
# tweet_id was only needed as the join key; remove it from the merged frame.
tweets = tweets.drop(columns='tweet_id')

The "writer" column is the only column with missing data. Since we are not likely to use this column, I decided to ignore the missing values for now.

In [None]:
tweets.isna().sum()

In [None]:
tweets.ticker_symbol.value_counts()

In [None]:
# Split the merged tweets into one frame per ticker symbol.
symbol = tweets['ticker_symbol']
aapl = tweets[symbol.eq('AAPL')]
tsla = tweets[symbol.eq('TSLA')]
amzn = tweets[symbol.eq('AMZN')]
msft = tweets[symbol.eq('MSFT')]
goog = tweets[symbol.eq('GOOG')]
googl = tweets[symbol.eq('GOOGL')]  # share class with stockholder voting rights

In [None]:
!pip install yfinance

In [None]:
import yfinance as yf

In [None]:
# yfinance Ticker handle for Tesla. The name `tsla_stock` is rebound to the
# price-history DataFrame in the following cell.
tsla_stock = yf.Ticker('TSLA')

In [None]:
# Date span covered by the Tesla tweets. Series.min()/.max() are vectorised,
# whereas the built-in min()/max() iterate over every row in Python.
start = tsla['date'].min()
end = tsla['date'].max()

# Build the Ticker here instead of calling .history() on `tsla_stock`, so that
# re-running this cell does not call .history() on the DataFrame that a
# previous run already stored under the same name.
tsla_stock = yf.Ticker('TSLA').history(start=start.date(), end=end.date())

In [None]:
tsla_stock

In [None]:
def _price_history(symbol, company_tweets):
    """Fetch yfinance price history for `symbol` over the span of the tweets.

    Parameters
    ----------
    symbol : str
        Ticker symbol passed to ``yf.Ticker``.
    company_tweets : pandas.DataFrame
        Tweets for that company; the earliest and latest values of its
        ``date`` column are used as the ``start`` and ``end`` of the request.

    Returns
    -------
    The object returned by ``yf.Ticker(symbol).history(...)``.
    """
    dates = company_tweets['date']
    # Series.min()/.max() are vectorised; the built-in min()/max() would walk
    # every row in Python.
    return yf.Ticker(symbol).history(start=dates.min().date(), end=dates.max().date())


aapl_stock = _price_history("AAPL", aapl)
amzn_stock = _price_history("AMZN", amzn)
googl_stock = _price_history("GOOGL", googl)
msft_stock = _price_history("MSFT", msft)

Firstly, I would like to find out whether the number of tweets affects the traded volume of a particular company. To do that, I made use of the pandas `shift` method to shift the tweet time series by one period. By doing so, I can compare the volume of tweets on one day with the company's traded volume on the following day.

I decided to use the Spearman correlation test instead of the Pearson correlation because the Spearman test does not assume that the data are normally distributed.

* Null hypothesis: There is no correlation between the volume of tweets and the volume of stock traded
* Alternate hypothesis: There is a correlation between the volume of tweets and the volume of stock traded

If the p-value of the Spearman correlation falls below the predetermined threshold of 0.05, I will reject the null hypothesis, and we will have enough evidence to conclude that there is a positive or negative correlation between the volume of stocks traded and the number of tweets.

Besides the statistical test, I have also decided to plot the traded volume and the volume of tweets on the same graph to get an overview of the relationship between these two variables. To get a clearer picture of how the two variables move together, I used a rolling 30-day average to smooth out daily noise and show the overall trend.


In [None]:
def tweet_vol_affect(tweets,stocks,title):
    """Relate a company's daily tweet count to its traded share volume.

    Computes the Spearman rank correlation between the number of tweets
    posted on the previous calendar day and the volume traded on each trading
    day, then plots the 30-row rolling means of both series against time on
    twin y-axes, with the correlation and p-value in the title.

    Parameters
    ----------
    tweets : pandas.DataFrame
        One row per tweet; must have a ``date`` column of datetime values.
    stocks : pandas.DataFrame
        Price history indexed by trading date; must have a ``Volume`` column.
    title : str
        Text placed on the first line of the plot title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    daily_tweets = tweets.groupby('date').size()

    # The tweet dates are timezone-naive. yfinance may return a timezone-aware
    # index, which would never match them in reindex(), so drop the timezone
    # (keeping the local calendar date) on a private copy of the volume series.
    trade_days = pd.DatetimeIndex(stocks.index)
    if trade_days.tz is not None:
        trade_days = trade_days.tz_localize(None)
    volume = stocks['Volume'].copy()
    volume.index = trade_days

    # shift(1, freq='D') moves every count onto the following calendar day, so
    # the value aligned with trading day t is the tweet count of day t-1.
    # (shift(-1) would instead pair day t with the tweets of day t+1.)
    stock_data = volume.dropna()
    prev_day_tweets = daily_tweets.shift(1, freq='D').reindex(stock_data.index)
    corr, pval = stat.spearmanr(prev_day_tweets, stock_data, nan_policy='omit')

    # Smooth both series for plotting only; the test above uses the raw values.
    tweets_vol = daily_tweets.rolling(30).mean().dropna()
    stocks_data = volume.rolling(30).mean().dropna()

    # A single axes plus its twin; no extra, unused subplot is created.
    fig, ax1 = plt.subplots(figsize=(16,8))
    ax2 = ax1.twinx()
    ax1.plot(tweets_vol.index,tweets_vol,label='Tweet Volume')
    ax2.plot(stocks_data.index,stocks_data,color='orange',label='Trade Volume')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Tweet Volume (30-period rolling mean)')
    ax2.set_ylabel('Trade Volume (30-period rolling mean)')
    ax2.set_title(title+" \n Spearman correlation: corr={0:.5f} pval={1:.5f}".format(corr,pval))

    # Merge the legend entries of both axes into one legend box.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()
    
  

In [None]:
tweet_vol_affect(tsla,tsla_stock,"Tesla")    

In [None]:
tweet_vol_affect(aapl,aapl_stock,"Apple")

In [None]:
tweet_vol_affect(amzn,amzn_stock,"Amazon")


In [None]:
tweet_vol_affect(googl,googl_stock,"Google")

In [None]:
tweet_vol_affect(msft,msft_stock,"Microsoft")

Looking at the results, the sheer volume of tweets does seem to have a positive correlation with the trade volume. However, the strength of the correlation is questionable. Additionally, the volume of tweets is not always correlated with the trade volume, as demonstrated in Microsoft's case, where the p-value exceeded the predetermined threshold of 0.05.

# **Classifying positive and negative tweets**

The next hypotheses that I will be testing are as follows:
* Null Hypothesis: The sentiment of the tweets has no correlation with the share price of the company
* Alternate Hypothesis: The sentiment of the tweets has a correlation with the share price of the company

In order to identify the sentiment of the tweets, I will be using the Afinn library. 

More information can be found [here.](http://pypi.org/project/afinn/)

In general, the Afinn library provides a score where 0 is neutral, a negative value means that the tweet is negative, and a positive value means that the tweet is positive.

In [None]:
!pip install afinn

In [None]:
from afinn import Afinn
afinn = Afinn()

In [None]:
# Score every tweet body with AFINN and store the result in a `score` column.
# assign() returns a new frame instead of writing a column into a frame that
# was produced by filtering `tweets`, which avoids pandas'
# SettingWithCopyWarning; map(afinn.score) replaces the redundant lambda.
tsla = tsla.assign(score=tsla['body'].map(afinn.score))
aapl = aapl.assign(score=aapl['body'].map(afinn.score))
amzn = amzn.assign(score=amzn['body'].map(afinn.score))
msft = msft.assign(score=msft['body'].map(afinn.score))
googl = googl.assign(score=googl['body'].map(afinn.score))

In [None]:
tsla.score.plot(kind='hist',range=(-5,5),bins=40,edgecolor='black');

In [None]:
amzn.score.plot(kind='hist',range=(-5,5),bins=40,edgecolor='black');

In [None]:
googl.score.plot(kind='hist',range=(-5,5),bins=40,edgecolor='black');

In [None]:
msft.score.plot(kind='hist',range=(-5,5),bins=40,edgecolor='black');

Next, I will adopt the same method as before to test my hypothesis.

In [None]:
def sentiment_overtime(tweets,stock,title):
    """Relate a company's daily mean tweet sentiment to its share price.

    Computes the Spearman rank correlation between the mean sentiment score
    of the previous calendar day's tweets and the opening price on each
    trading day. It then plots the 30-row rolling mean of that lagged
    sentiment together with the closing price on twin y-axes, with the
    correlation and p-value in the title.

    Parameters
    ----------
    tweets : pandas.DataFrame
        One row per tweet; must have a datetime ``date`` column and a numeric
        ``score`` column.
    stock : pandas.DataFrame
        Price history indexed by trading date; must have ``Open`` and
        ``Close`` columns.
    title : str
        Company name inserted into the plot title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # shift(1, freq='D') moves every daily mean onto the following calendar
    # day, so the value aligned with trading day t is the sentiment of day t-1.
    # (shift(-1) would instead pair day t with the sentiment of day t+1.)
    lagged_score = tweets.groupby('date')['score'].mean().shift(1, freq='D')

    # The tweet dates are timezone-naive. yfinance may return a timezone-aware
    # index, which would never match them in reindex(), so drop the timezone
    # (keeping the local calendar date) on private copies of the price series.
    trade_days = pd.DatetimeIndex(stock.index)
    if trade_days.tz is not None:
        trade_days = trade_days.tz_localize(None)
    open_price = stock['Open'].copy()
    open_price.index = trade_days
    close_price = stock['Close'].copy()
    close_price.index = trade_days

    # The correlation is computed against the opening price; the plot below
    # shows the closing price.
    corr, pval = stat.spearmanr(lagged_score.reindex(trade_days), open_price, nan_policy='omit')

    # Smoothed sentiment for plotting only; the test above uses daily means.
    visual = lagged_score.rolling(30).mean().dropna()

    # A single axes plus its twin; no extra, unused subplot is created.
    fig, ax1 = plt.subplots(figsize=(16,8))
    ax2 = ax1.twinx()

    ax1.plot(visual.index,visual,label='Tweets Sentiment')
    ax2.plot(close_price.index,close_price,color='orange',label='share price')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Mean sentiment score (30-period rolling mean)')
    ax2.set_ylabel('Closing price')
    ax2.set_title("Effects of "+title+" tweets to shareprice" +"\n Spearman correlation: corr={0:.5f} pval={1:.5f}".format(corr,pval))

    # Merge the legend entries of both axes into one legend box.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()
    

In [None]:
sentiment_overtime(tsla,tsla_stock,"Tesla");

In [None]:
sentiment_overtime(amzn,amzn_stock,"Amazon");

In [None]:
# Title spelling corrected: it is rendered verbatim in the plot heading.
sentiment_overtime(msft,msft_stock,"Microsoft");

In [None]:
sentiment_overtime(googl,googl_stock,"Google");

In [None]:
sentiment_overtime(aapl,aapl_stock,"Apple");

There seems to be a stronger correlation between the sentiment of the tweets and the share price of the company than was found for the previous hypothesis. However, the Tesla dataset shows a different result from the rest, much like how Microsoft's result differed in the previous hypothesis.

In conclusion, although 4 of the 5 companies in this notebook have shown a positive result, the strength of the correlation differs from company to company. On top of that, these companies may not be representative of the other stocks in the market. Further research is needed to prove the viability of using tweets to determine the stock movement of a company.