In [1]:
# Install libraries
!pip install yfinance prophet pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.1.85-py2.py3-none-any.whl (29 kB)
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 45 kB/s 
Collecting requests>=2.26
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.6 MB/s 
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845514 sha256=255666501a65f5781c58f6e5bc95187381decbdfdc2fe438f7118aaab876f851
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
In

In [2]:
# Data processing 
import pandas as pd
import numpy as no

# Get time series data
import yfinance as yf

#Prophet model for time series forecast
from prophet import Prophet

#visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Multi processing 
from multiprocessing import Pool, cpu_count

# Spark
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Process bar
from tqdm import tqdm

# Tracking time 
from time import time



In [25]:
import plotly.express as px

In [3]:
!gdown --id 1cijWpm7_S80zrRUJsBhTFk9btQYXuRHv

Downloading...
From: https://drive.google.com/uc?id=1cijWpm7_S80zrRUJsBhTFk9btQYXuRHv
To: /content/launchit.csv
100% 53.0M/53.0M [00:00<00:00, 292MB/s]


In [19]:
# read the csv file

data = pd.read_csv('launchit.csv')
data['Date'] = pd.to_datetime(data['Date'])

data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294310 entries, 0 to 294309
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Date                 294310 non-null  datetime64[ns]
 1   Open                 294310 non-null  float64       
 2   High                 294310 non-null  float64       
 3   Low                  294310 non-null  float64       
 4   Close                294310 non-null  float64       
 5   Adj Close            294310 non-null  float64       
 6   Volume               294310 non-null  float64       
 7   TCKR                 294310 non-null  object        
 8   sector               294310 non-null  object        
 9   industry             294310 non-null  object        
 10  country              294310 non-null  object        
 11  growth_rate          294310 non-null  float64       
 12  target_median_price  294310 non-null  float64       
 13  target_mean_pr

In [21]:
data.rename(columns = {'Date' : 'ds', 'growth_rate' : 'y'}, inplace = True)
data.head()

Unnamed: 0,ds,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,y,target_median_price,target_mean_price,target_high_price
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0


In [22]:
data['month'] = data['ds'].dt.month

In [24]:
data.head(2)

Unnamed: 0,ds,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,y,target_median_price,target_mean_price,target_high_price,month
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0,12
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0,12


In [26]:
fig = px.line(data, x='ds',y='y', title='Stock Viz')

fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1y", step="year",stepmode="backward"),
            dict(count=2, label="3y", step="year",stepmode="backward"),
            dict(count=3, label="5y", step="year",stepmode="backward"),
            
            ])
    )
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [6]:
# Group the data by ticker
groups_by_ticker = data.groupby('TCKR')

# Check the groups in the dataframe confirming # of tickers
groups_by_ticker.groups.keys()

dict_keys(['ABCL', 'ABCM', 'ABNB', 'ABSI', 'ABST', 'ACCD', 'ACI', 'ACON', 'ACT', 'ACVA', 'ADAG', 'ADPT', 'AFCG', 'AFIB', 'AFRM', 'AFYA', 'AGFY', 'AGL', 'AGTI', 'AHI', 'AI', 'AIH', 'AIP', 'AIRS', 'AKA', 'AKYA', 'ALEC', 'ALGM', 'ALGS', 'ALHC', 'ALKT', 'ALRS', 'AMAM', 'AMK', 'AMPL', 'AMWL', 'ANGN', 'API', 'APP', 'ARBK', 'AREC', 'ARHS', 'ARIS', 'ARRY', 'ASAN', 'ASO', 'AUVI', 'AVAH', 'AVDX', 'AVO', 'AVTR', 'AYLA', 'AZEK', 'AZYO', 'BASE', 'BBIO', 'BCYC', 'BDSX', 'BEAM', 'BEKE', 'BHG', 'BIGC', 'BILL', 'BIOR', 'BIRD', 'BLI', 'BLND', 'BLZE', 'BMBL', 'BNL', 'BNR', 'BNTX', 'BQ', 'BRBR', 'BRDG', 'BRLT', 'BROS', 'BRP', 'BRZE', 'BSY', 'BVS', 'BWAY', 'BWMN', 'BYND', 'BZ', 'CADL', 'CAN', 'CCCC', 'CD', 'CDAK', 'CDRE', 'CERT', 'CFB', 'CFLT', 'CHNG', 'CHWY', 'CIAN', 'CINT', 'CMBM', 'CMTG', 'CNM', 'CNTG', 'CNVY', 'COCO', 'COIN', 'COMP', 'COOK', 'COUR', 'CPNG', 'CRBU', 'CRCT', 'CRDO', 'CRSR', 'CRWD', 'CSTL', 'CTKB', 'CURV', 'CVAC', 'CVRX', 'CWAN', 'CXM', 'DADA', 'DAO', 'DASH', 'DCBO', 'DCT', 'DDI', 'DDL', 

In [15]:
unique_tickers = data.TCKR.unique()

In [None]:
ticker_list = unique_tickers.tolist()
ticker_list

In [7]:
# Define Prophet function

def train_and_forecast(group):
  # Initiate the model
  m = Prophet()

  # Fit the model
  m.fit(group)

  # Make predictions
  future = m.make_future_dataframe(periods=30)
  forecast = m.predict(future)[['ds','yhat','yhat_lower','yhat_upper']]
  forecast['TCKR'] = group['TCKR'].iloc[0]

  # Return the forecasted results
  return forecast[['Date','yhat','yhat_lower','yhat_upper']]


In [18]:
# Multiple time series forecast using for-loop
# Start time
start_time = time()

# Create an empty DF
for_loop_forecast = pd.DataFrame

# Loop through each ticker
for ticker in ticker_list:
  # Get the data for the ticker
  group = groups_by_ticker.get_group(ticker)
  # Make forecast
  forecast = train_and_forecast(group)
  # Add the forecast results to the DF
  for_loop_forecast = pd.concat((for_loop_forecast, forecast))

print('The time used for the for-loop forecast is ', time()-start_time)



ValueError: ignored