In [None]:
# Useful starting lines
import numpy as np
import seaborn as sns
from util.dataloader import *
from util.finance import *
from util.plots import *
from util.finance import stock, compare
from util.quotebankexploration import *
from util.wikipedia import *
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import numpy as np

from util.apple_stores import *

# Use plotly's "notebook_connected" renderer by default for figures shown in this notebook.
pio.renderers.default = "notebook_connected"
# Load IPython's autoreload extension and switch it to mode 2, so edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from task1 import *
from task2 import *
from task3 import *
from task4 import *

# Introduction
This notebook walks through the thought process behind this project, which looks for patterns and correlations between stock markets and media quotes. In particular, we focus on the Apple stock, as Apple is one of the most valuable companies on earth and is widely covered in the media.

------

# Loading the datasets
In this project we study three datasets to gain insight into the evolution of the stock price, the coverage of Apple in the media, and its relationship with the various speakers. This first section is dedicated to loading these datasets and preprocessing the relevant information.

## Apple stock : yFinance API
The yFinance library retrieves data from Yahoo Finance and provides easy access to various financial metrics for most stocks on the market. The _ticker_ of the Apple stock is _AAPL_, and we first focus on the date range from 2015 to 2020. We also add an indicator of daily volatility, the ``Liquidity`` field, which is the daily volume multiplied by the average daily stock price. This indicator, expressed in dollars, gives a quick overview of the quantity of Apple stock traded in a day; in this notebook, a day of high liquidity is treated as a day of high volatility.

In [None]:
# Load the Apple (AAPL) stock metrics via the yFinance-backed loader, called with
# the bounds 2015 and 2020, and flag the high-volatility days in the same step.
# A day of high volatility is a day among the highest 2% of liquidity in that
# year (quantile = 0.98).
# NOTE(review): this rebinds `stock`, a name that the first cell also imports
# explicitly from util.finance — confirm the imported object is not needed later.
stock = high_volatility(load_stock("AAPL", 2015, 2020), quantile=0.98)

display(stock)

## Quotebank Dataset
The Quotebank dataset is a large text corpus of more than 178 million quotations scraped from 337 websites. As we are focusing on Apple-related quotations, we applied several filters that reduce the number of quotes to 310,816. The main filtering techniques are:
1. White list of words that should be contained in quotes : _Apple_, _iPhone_, _Macbook_ etc.
2. Black list of words that should not be contained : _Mac n Cheese_, _apple_, _Big Apple_ etc.
3. White list of speakers, as we also included speakers related to Apple regardless of the aforementioned white words : _Steve Jobs_, _Tim Cook_, _Steve Wozniak_ etc.

This final dataset of Apple-related quotes has been saved in a `pkl` file for ease of manipulation, and can be accessed with the `get_filtered_quotes` function.

In [None]:
# Load the pre-filtered, Apple-related quotes (the filtering is described in the
# markdown preceding this cell).
quotes = get_filtered_quotes()

# Rich display of the loaded object for a quick visual check.
display(quotes)

## Wikipedia

**TODO:** describe the Wikipedia dataset used in this project and how it is loaded and preprocessed.

In [None]:
# FIXME(review): placeholder cell left over from drafting — the printed string is
# not part of the analysis. Replace it with the actual Wikipedia data loading /
# preview, or delete the cell.
print('caca yolo prout')

---

# 1. First look into the Apple stock market and related quotes 

In [None]:
# Weekly liquidity view of the stock frame, using the same 0.98 quantile that was
# passed to high_volatility when `stock` was built.
# NOTE(review): presumably renders a figure — confirm against the helper's definition.
weekly_liquidity(stock, quantile=0.98)

In [None]:
# Daily stock price view over the whole loaded `stock` frame.
# NOTE(review): the kind of output (figure vs. table) is decided by the helper — confirm.
daily_stock_price(stock)

In [None]:
# Exploratory check: show the `date` column of the quotes as the cell output.
quotes.date

In [None]:
# Keep only the quotes dated 2015 or later (2015 is also the first bound passed
# to load_stock), then run the daily-quotes analysis with a 0.98 quantile.
daily_quotes(quotes.loc[quotes["date"].dt.year >= 2015], quantile=0.98)

In [None]:
# Seasonal analysis of the "Liquidity" column, restricted to the stock rows
# whose date falls in 2018 or 2019.
seasonal_analysis(
    stock.loc[stock["Date"].dt.year.isin([2018, 2019])],
    column="Liquidity",
)

In [None]:
# Combine the full stock frame with the full set of filtered quotes.
# NOTE(review): presumably overlays quote activity on the price curve — confirm
# against the helper's definition.
stock_price_with_quotes(stock,quotes)

In [None]:
# Pearson analysis between stock and quotes, using only the stock rows dated
# 2015-2018 (range(2015, 2019) excludes 2019).
pearson_stock_quotes(
    stock.loc[stock["Date"].dt.year.isin(range(2015, 2019))],
    quotes,
)

----
# 4. Building a model for stock market prediction
Using the quotes related to Apple, speaker data and past stock performance, we can make a first attempt at predicting the daily stock price and the liquidity. For this section we use Facebook's Prophet library, which provides powerful and easy-to-use forecasting tools. At its core, the model is a modular linear regression model that can take into account past performance and additional factors.

In [None]:
# Quotes with sentiment scores, read from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
quotes_sentiment = pd.read_pickle("data/quotes_score.pkl")

# Build the training frame from the stock rows dated 2015-2017
# (range(2015, 2018) excludes 2018) together with the scored quotes.
prediction_frame = build_prediction_frame(
    stock.loc[stock["Date"].dt.year.isin(range(2015, 2018))],
    quotes_sentiment,
)

# Fit a Prophet model with 'Open' as the response and the 'positive',
# 'negative' and 'total' columns as features, then predict with the scored
# quotes supplied as the feature frame.
m = fit_prophet(
    Prophet(changepoint_prior_scale=0.05, seasonality_prior_scale=0.1),
    prediction_frame,
    features=["positive", "negative", "total"],
    response="Open",
)
pred = predict_future(m, prediction_frame, feature_frame=quotes_sentiment)

plot_plotly(m, pred)

In [None]:
# Plot the prediction `pred` (computed in the previous cell) together with the
# full stock frame and the sentiment-scored quotes.
# NOTE(review): exact figure content is defined by the helper — confirm.
plot_prediction(stock, quotes_sentiment,pred)

In [None]:
# Performance evaluation of the fitted model `m`: cross-validate with a
# 150-day initial window, a 30-day period and a 60-day horizon, in parallel processes.
df_cv = cross_validation(
    m,
    initial="150 days",
    period="30 days",
    horizon="60 days",
    parallel="processes",
)
df_p = performance_metrics(df_cv)

# Report the "mape" value found in the first row of the metrics table.
print("Mean absolute percentage error in a first week horizon", df_p["mape"].iloc[0])


In [None]:
# Run the prediction helper with no quote features (features = None) and 'Open'
# as the response.
# NOTE(review): presumably a baseline that uses the price history only — confirm
# against the helper's definition.
times_series_predict(stock, quotes_sentiment, features = None, response = 'Open')

In [None]:
# Hyper-parameter grid for the Prophet search: 10 linearly spaced changepoint
# prior scales in [0.2, 2] and 10 log-spaced seasonality prior scales in [1e-2, 1e1].
param_grid = dict(
    changepoint_prior_scale=np.linspace(0.2, 2, 10),
    seasonality_prior_scale=np.logspace(-2, 1, 10),
)

# Cross-validate over the grid with the same features/response as the model
# fitted earlier, scoring each combination by "mape".
tuning_results = prophet_cross_validation(
    param_grid,
    stock,
    quotes_sentiment,
    features=["positive", "negative", "total"],
    response="Open",
    metric="mape",
)

# Show the row(s) that reach the lowest "mape".
tuning_results.loc[tuning_results["mape"] == tuning_results["mape"].min()]