Install yfinance

In [None]:
! pip install yfinance

Download stock data

In [None]:
import yfinance as yf

# Daily prices for Apple and IBM covering the full years 2009-2019.
# yfinance treats `end` as exclusive, so 2020-01-01 is required to keep the
# last trading day of 2019 (2019-12-31) in the data set.
data = yf.download("AAPL IBM", start="2009-01-01", end="2020-01-01")

[*********************100%***********************]  2 of 2 completed


Check what is in the data

In [None]:
# Opening prices only: one column per ticker, indexed by trading date.
data.loc[:, 'Open']

Unnamed: 0_level_0,AAPL,IBM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-02,3.067143,83.889999
2009-01-05,3.327500,86.419998
2009-01-06,3.426786,87.110001
2009-01-07,3.278929,87.830002
2009-01-08,3.229643,87.809998
...,...,...
2019-12-23,70.132500,135.779999
2019-12-24,71.172501,135.610001
2019-12-26,71.205002,134.979996
2019-12-27,72.779999,135.000000


Start the Hadoop cluster

In [None]:
%%bash
# Bring up HDFS first, then YARN.
# Fail with a clear message if HADOOP_HOME is unset or empty instead of
# trying to execute /sbin/start-dfs.sh.
: "${HADOOP_HOME:?HADOOP_HOME must point to the Hadoop installation}"
# Quoted so that an installation path containing spaces still works.
"$HADOOP_HOME/sbin/start-dfs.sh"
"$HADOOP_HOME/sbin/start-yarn.sh"

Starting namenodes on [localhost]
Starting datanodes
Starting secondary namenodes [p-d412799a-8ad0-4b98-adca-7b7c79c259ea]
Starting resourcemanager
Starting nodemanagers


Upload the finance data set to HDFS

In [None]:
from hdfs3 import HDFileSystem

# Client for the HDFS NameNode listening on localhost:9000.
hdfs = HDFileSystem(port=9000, host='localhost')

# Stream the opening prices straight into a CSV file on HDFS.
# NOTE(review): the relative path presumably resolves to the user's HDFS home
# directory (the job is later pointed at /user/root/AAPL_IBM_open.csv) - confirm.
with hdfs.open('AAPL_IBM_open.csv', mode='wb') as remote_csv:
    data.loc[:, 'Open'].to_csv(path_or_buf=remote_csv, header=True)

Check that the data set arrived at its final location

In [None]:
# List the HDFS path '.' to confirm that AAPL_IBM_open.csv shows up there.
hdfs.ls('.')

Implementation of the MapReduce job
(Task: for each of the two companies, find the maximum opening price in the fourth quarter (October to December) of every year.)

In [None]:
%%file stock_analysis.py

from mrjob.job import MRJob

import re
import sys

class StockAnalysis(MRJob):
   """Maximum fourth-quarter opening price per company and year.

   Each input line is expected to be one CSV row of the form
   ``date,apple_open,ibm_open`` where ``date`` is an ISO date
   (``YYYY-MM-DD``). The job outputs keys such as ``apple_2015`` and
   ``ibm_2015`` together with the highest opening price found in
   October, November or December of that year.

   Rows or cells that cannot be interpreted are skipped and reported
   through job counters instead of aborting the whole job.
   """

   # Month part (characters 5-6 of an ISO date) of the fourth quarter.
   Q4_MONTHS = ('10', '11', '12')

   # Key prefixes, in the same order as the price columns of the input.
   COMPANIES = ('apple', 'ibm')

   def mapper(self, key, value):
      """Emit ``(company_year, opening_price)`` pairs for every Q4 row.

      Args:
         key: Unused.
         value: One line of the input CSV.

      Yields:
         Tuples ``('apple_<year>', float)`` and ``('ibm_<year>', float)``
         for each price that is present and numeric.
      """
      fields = value.split(',')
      if len(fields) != len(self.COMPANIES) + 1:
         # Not a "date + one price per company" row: count it and move on
         # rather than failing the whole job on a single bad line.
         self.increment_counter('StockAnalysis', 'malformed_rows', 1)
         return

      date = fields[0]
      # A header row (first field such as "Date") is too short to have
      # characters at [5:7], so it is filtered out here as well.
      if date[5:7] not in self.Q4_MONTHS:
         return

      year = date[:4]
      for company, raw_open in zip(self.COMPANIES, fields[1:]):
         try:
            price = float(raw_open)
         except ValueError:
            # Empty or non-numeric cell, e.g. no quote for that ticker
            # on that day.
            self.increment_counter('StockAnalysis', 'missing_prices', 1)
            continue
         if price != price:
            # NaN is the only float unequal to itself; skip it because
            # max() over NaN values depends on the order of its inputs.
            self.increment_counter('StockAnalysis', 'missing_prices', 1)
            continue
         yield ('%s_%s' % (company, year), price)

   def combiner(self, key, values):
      """Forward only the local maximum per key to cut shuffle traffic.

      Safe because max is associative and commutative: the maximum of
      partial maxima equals the maximum over all values.
      """
      yield (key, max(values))

   def reducer(self, key, values):
      """Yield the overall maximum opening price for ``key``."""
      yield (key, max(values))

if __name__ == '__main__':
   # Script entry point: hand control to mrjob's run(), which is how the job
   # gets started when this file is invoked as "python stock_analysis.py ...".
   StockAnalysis.run()


Overwriting stock_analysis.py


Run the MapReduce analysis job on the cluster

In [None]:
!python stock_analysis.py -r hadoop hdfs:///user/root/AAPL_IBM_open.csv

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /root/hadoop-3.3.1/bin...
Found hadoop binary: /root/hadoop-3.3.1/bin/hadoop
Using Hadoop version 3.3.1
Looking for Hadoop streaming jar in /root/hadoop-3.3.1...
Found Hadoop streaming jar: /root/hadoop-3.3.1/share/hadoop/tools/lib/hadoop-streaming-3.3.1.jar
Creating temp directory /tmp/stock_analysis.root.20211104.143802.189837
uploading working dir files to hdfs:///user/root/tmp/mrjob/stock_analysis.root.20211104.143802.189837/files/wd...
Copying other local files to hdfs:///user/root/tmp/mrjob/stock_analysis.root.20211104.143802.189837/files/
Running step 1 of 1...
  packageJobJar: [/tmp/hadoop-unjar14527474772028646990/] [] /tmp/streamjob7883098047006751013.jar tmpDir=null
  Connecting to ResourceManager at /0.0.0.0:8032
  Connecting to ResourceManager at /0.0.0.0:8032
  Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1636035523910_0

Shut down the Hadoop cluster

In [None]:
%%bash
# Stop YARN first, then HDFS.
# Fail with a clear message if HADOOP_HOME is unset or empty instead of
# trying to execute /sbin/stop-yarn.sh.
: "${HADOOP_HOME:?HADOOP_HOME must point to the Hadoop installation}"
# Quoted so that an installation path containing spaces still works.
"$HADOOP_HOME/sbin/stop-yarn.sh"
"$HADOOP_HOME/sbin/stop-dfs.sh"

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d412799a-8ad0-4b98-adca-7b7c79c259ea' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>