# PrepareData.ipynb

## Read data from downloaded csv files and create dataframes.

NOTE: Company_Tweet.csv & Tweet.csv are very large data files and could not be uploaded to git, because the account limit is 100MB per file.

Below are the steps followed to read data and store dataframes:
* To run this code, download Company_Tweet.csv & Tweet.csv from the Kaggle link https://www.kaggle.com/code/saadusama/twitter-s-impact-on-stock-market-prices/data and copy them into the Resources folder
* Read tickers data from Resources/CompanyValues.csv, filter Tesla stock data and store it in a dataframe
* Read Twitter data from Company_Tweet.csv & Tweet.csv, filter tweets for Tesla and store in a dataframe

### Necessary imports

In [None]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import pandas_ta as ta

### Read Ticker data
* Read stock data from Resources/CompanyValues.csv
* Filter dataframe to store only TSLA data
* Drop ticker_symbol column as it is not required anymore
* Set index to day_date, sort by date and keep rows from 2015-01-01 onwards
* Review DataFrame

In [None]:
# Load the stock values csv, using day_date (parsed as dates) as the index.
# infer_datetime_format is intentionally not passed: it is deprecated since
# pandas 2.0 and only produces a warning there.
market_df = pd.read_csv(
    Path('./Resources/CompanyValues.csv'),
    index_col='day_date',
    parse_dates=True,
)

# Keep only complete TSLA rows, drop the ticker_symbol column (no longer
# needed once filtered), sort by date so the label slice below is valid,
# and keep rows from 2015-01-01 onwards.
tsla_stock_values_df = (
    market_df[market_df["ticker_symbol"] == "TSLA"]
    .dropna()
    .drop(columns=["ticker_symbol"])
    .sort_index()
    .loc['2015-01-01':]
)
# print("Display tsla_stock_values_df")
# display(tsla_stock_values_df.head())
# display(tsla_stock_values_df.tail())

### Read Twitter Data and prepare one DataFrame for TSLA tweets
* Read Tweets from Resources/Tweet.csv and review dataframe
* Read Resources/Company_Tweet.csv, to find tweets relevant for TSLA, and review dataframe
* Merge both dataframes on tweet_id to get the consolidated tweet data for TSLA
* Review merged dataframe
* Convert post_date to Datetime format

In [None]:
# Read tweets and company_tweet data from csv
tweets_df = pd.read_csv(Path("Resources/Tweet.csv"))
company_tweets_df = pd.read_csv(Path("Resources/Company_Tweet.csv"))

# Keep the rows tagged TSLA and attach the tweet details by joining on tweet_id
tsla_tweets_df = pd.merge(
    company_tweets_df[company_tweets_df["ticker_symbol"] == "TSLA"],
    tweets_df,
    on="tweet_id",
)

# Compute total_engagement for a tweet (comments + retweets + likes) and keep
# tweets that have total_engagement > 2.
# With a threshold of 3 or more we lose whole days of data, hence for better
# training models the threshold is kept at 2.
tsla_tweets_df["total_engagement"] = (
    tsla_tweets_df["comment_num"]
    + tsla_tweets_df["retweet_num"]
    + tsla_tweets_df["like_num"]
)
# .copy() makes the filtered frame independent, so the post_date assignment
# below does not raise SettingWithCopyWarning.
tsla_tweets_df = tsla_tweets_df[tsla_tweets_df["total_engagement"] > 2].copy()

# Convert post_date from epoch seconds (int64) to datetime.
# pd.to_datetime(unit="s") is vectorized and always yields UTC-based naive
# timestamps, whereas datetime.fromtimestamp depends on the local timezone of
# the machine running the notebook.
tsla_tweets_df["post_date"] = pd.to_datetime(tsla_tweets_df["post_date"], unit="s")
# print("Display tsla_tweets_df")
# display(tsla_tweets_df.head())
# display(tsla_tweets_df.tail())

In [None]:
# Optional export: uncomment the line below to write the filtered TSLA tweets to csv.
# tsla_tweets_df.to_csv("Resources/tsla_tweets.csv")

### Store dataframes in IPython's Database
This will help reuse the dataframes, without repeating the code.
* %store - stores variables, aliases and macros in IPython’s database.
* Store the TSLA tweets dataframe, the TSLA stock market dataframe and the technical-indicator dataframe (ta_df) in IPython's database

In [None]:
# Build the stock dataframe with technical indicators.
# Work on a copy so tsla_stock_values_df itself is left untouched.
ta_df = tsla_stock_values_df.copy()

# daily_return: percentage change of the close from the previous row
ta_df['daily_return'] = ta_df['close_value'].pct_change()

# price_direction: 1 when daily_return >= 0, otherwise 0
# (a NaN daily_return compares False and therefore maps to 0)
ta_df['price_direction'] = (ta_df['daily_return'] >= 0).astype('int64')

# MACD via pandas_ta; append=True is passed so the resulting columns are
# added to ta_df rather than only returned
ta_df.ta.macd(close='close_value', fast=12, slow=26, signal=9, append=True)

# High-Low percentage, computed directly: (high - low) / close * 100
intraday_range = ta_df['high_value'] - ta_df['low_value']
ta_df['HL_PCT'] = intraday_range / ta_df['close_value'] * 100.0

# RSI (length 14) via pandas_ta
ta_df['RSI'] = ta_df.ta.rsi(close='close_value', length=14, scalar=100)

# PVT via pandas_ta
ta_df['PVT'] = ta_df.ta.pvt(close='close_value', volume='volume', drift=1)

# Remove rows containing NaN, then drop the columns that are not kept in the
# final indicator frame
ta_df = ta_df.dropna().drop(
    columns=['daily_return', 'volume', 'open_value', 'high_value', 'low_value']
)

# View result
# print("Display ta_df")
# display(ta_df.head())
# display(ta_df.tail())
ta_df.head()

In [None]:
# Save the prepared dataframes in IPython's database with the %store magic,
# so they can be reused without repeating the preparation code.
# NOTE(review): presumably the downstream notebooks restore them with
# `%store -r` — confirm against those notebooks.
%store tsla_tweets_df
%store tsla_stock_values_df
%store ta_df

In [None]:
# Execute the SentimentAnalysis.ipynb notebook from this notebook.
# NOTE(review): presumably it relies on the dataframes stored above — confirm.
%run SentimentAnalysis.ipynb

In [None]:
# Execute the Textblob_SA.ipynb notebook from this notebook.
# NOTE(review): presumably it relies on the dataframes stored above — confirm.
%run Textblob_SA.ipynb