# Data Wrangling - Stock Market Data

**Author**:  Sathish Manthani  (smanthani@my365.bellevue.edu)  
**Date**  :  02/25/2020  
**Course**:  DSC540 - Data Preparation  

## Import required libraries

In [1]:
# Importing requests, json modules which helps to connect to REST APIs and parse the data
import requests, json
# Importing datetime method for date conversion
from datetime import datetime
import logging #for logging connectivity
import configparser #configure the logger
import pandas as pd #for dataframes

# Logging configuration: DEBUG-and-above records go to a log file in the
# current directory whose name carries the start timestamp of this run.
logging.basicConfig(
    filename='./Stocks_data_pull_%s.log' % datetime.now().strftime('%m%d%Y_%H%M%S'),
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m-%d %H:%M:%S',
)

## Method to get list of stock tickers from API

In [2]:
#Import contextlib and csv modules to parse csv data
from contextlib import closing
import csv

def getListofTickers(tickers_url):
    '''
    Fetch a CSV listing from a REST endpoint and return its first column.

    Parameters:
        tickers_url: URL that returns CSV text whose first row is a header
                     and whose first column holds the ticker symbol.

    Returns:
        A list with the first field of every non-empty data row, or, when the
        server answers with an HTTP error status, the error text as a string
        (callers that relied on this string return value keep working).
    '''
    logging.debug("[INFO]: Getting Tickers' data via API...")
    with closing(requests.get(tickers_url, stream=True)) as r:
        try:
            # Without this check an HTTP error page would be parsed as CSV
            # and the except branch below could never trigger.
            r.raise_for_status()
            lines = (line.decode('utf-8') for line in r.iter_lines())
            reader = csv.reader(lines, delimiter=',', quotechar='"')
            # Skip the header row (no-op if the response is empty)
            next(reader, None)
            # Blank lines come back from csv.reader as empty lists; skip them
            # instead of failing on row[0].
            tickers_list = [row[0] for row in reader if row]
        except requests.exceptions.HTTPError as exp:
            # The message needs a %s placeholder for the extra argument,
            # otherwise logging reports a formatting error and drops the text.
            logging.debug('[ERROR]: Error connecting to the API. %s', str(exp))
            return str(exp)
    logging.debug('[INFO]: List of tickers fetched.')
    return tickers_list

## Method to get Stocks' data from API

In [3]:
def getStocksList(url,tickers):
    '''
    Request quote data for a set of tickers and return the decoded JSON.

    Parameters:
        url:     base URL of the endpoint; the tickers are appended to it.
        tickers: iterable of ticker strings, sent as one comma-separated list.

    Returns:
        The JSON body decoded into Python objects, or, when the server answers
        with an HTTP error status, the error text as a string.
    '''
    tickers_list = ','.join(tickers)
    logging.debug("[INFO]: Connecting to the Stocks' data API...")
    # Connecting to the API
    response = requests.get(url+tickers_list)
    # Raise exception in case of an HTTP error status and return the error message.
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as exp:
        # The message needs a %s placeholder for the extra argument,
        # otherwise logging reports a formatting error and drops the text.
        logging.debug('[ERROR]: Error connecting to the API. %s', str(exp))
        return str(exp)

    # Only report success once the status check has actually passed
    logging.debug('[INFO]: Connected to the API successfully! Fetching data...')
    # Read the fetched JSON string
    json_resp = response.json()
    logging.debug('[INFO]: Stocks data fetch is complete.')
    return json_resp

## API variables

**I used two APIs for this exercise.  
One fetches the list of tickers.   
Another one gets the stocks data for these tickers.**

In [4]:
# Endpoint that supplies the list of tickers: a CSV file of NASDAQ-listed
# symbols hosted on datahub.io (fetched and parsed by getListofTickers).
tickers_url = "https://pkgstore.datahub.io/core/nasdaq-listings/nasdaq-listed-symbols_csv/data/595a1f263719c09a8a0b4a64f17112c6/nasdaq-listed-symbols_csv.csv"

In [5]:
# Base URL of the quote endpoint; getStocksList appends the comma-separated
# ticker list directly to this string, so the trailing slash matters.
stocks_url = "https://financialmodelingprep.com/api/v3/quote/"

## API calls and parsing the output

In [6]:
# Call the method to get list of tickers
tickers_list = getListofTickers(tickers_url)

In [45]:
tickers_list1 = ['AAPL','FB','OKTA', 'AMZN']

In [46]:
# Call the method to get stocks' data.
# Only the short hand-picked list (tickers_list1) is requested here, not the
# full tickers_list fetched above.
# Please note the API may not return data for every requested ticker.
json_resp = getStocksList(stocks_url,tickers_list1)

In [47]:
# Normalize the JSON output and store it as a dataframe.
# pd.json_normalize is the public entry point; the older
# `from pandas.io.json import json_normalize` path is deprecated and fails on
# recent pandas releases.
df = pd.json_normalize(json_resp)
# Keep the columns in the order of the keys of the first record returned by the API
df = df.reindex(columns=list(json_resp[0].keys()))

## Data preparation

### List of columns in the dataset

In [48]:
#list of columns
df.columns

Index(['symbol', 'name', 'price', 'changesPercentage', 'change', 'dayLow',
       'dayHigh', 'yearHigh', 'yearLow', 'marketCap', 'priceAvg50',
       'priceAvg200', 'volume', 'avgVolume', 'exhange', 'open',
       'previousClose', 'eps', 'pe', 'earningsAnnouncement',
       'sharesOutstanding', 'timestamp'],
      dtype='object')

### Rename the columns

In [49]:
# Map the API's field names to more readable column headers.
# Columns not listed here (e.g. "name") keep their original label.
df = df.rename(
    columns={
        # identity and latest quote
        "symbol": "Stock_Ticker",
        "price": "Price",
        "changesPercentage": "Change_%",
        "change": "Change_$",
        # trading ranges
        "dayLow": "24hrs_Low",
        "dayHigh": "24hrs_High",
        "yearHigh": "52weeks_High",
        "yearLow": "52weeks_Low",
        # size and moving averages
        "marketCap": "Market_Cap",
        "priceAvg50": "Avg_50days_Price",
        "priceAvg200": "Avg_200days_Price",
        "volume": "Volume",
        "avgVolume": "Avg_Volume",
        # "exhange" is the key exactly as it appears in the API response
        "exhange": "Exchange",
        "open": "Open",
        "previousClose": "Previous_Close",
        # fundamentals and metadata
        "eps": "EPS",
        "pe": "PE",
        "earningsAnnouncement": "Earnings_Date",
        "sharesOutstanding": "Outstanding_Shares",
        "timestamp": "Timestamp",
    }
)

### Check the datatypes of the columns

In [50]:
#data types
df.dtypes

Stock_Ticker           object
name                   object
Price                 float64
Change_%              float64
Change_$              float64
24hrs_Low             float64
24hrs_High            float64
52weeks_High          float64
52weeks_Low           float64
Market_Cap            float64
Avg_50days_Price      float64
Avg_200days_Price     float64
Volume                  int64
Avg_Volume              int64
Exchange               object
Open                  float64
Previous_Close        float64
EPS                   float64
PE                    float64
Earnings_Date          object
Outstanding_Shares      int64
Timestamp               int64
dtype: object

### Drop Null values

In [51]:
# Remove every row that has a null in any column.
# One reassignment is sufficient; repeating the drop in place on the already
# cleaned frame was redundant.
df = df.dropna(how='any')

### Convert Object to Date

In [52]:
def castAsDate(date_obj):
    '''
    Cast a Series of date-like values to calendar dates.

    Parameters:
        date_obj: pandas Series whose values pd.to_datetime can parse
                  (the `.dt` accessor used below requires a Series).

    Returns:
        A Series of datetime.date values; the time-of-day part is discarded.
    '''
    # `infer_datetime_format` is deprecated in pandas 2.x (format inference is
    # the default there) and only triggered a warning, so it is not passed.
    return pd.to_datetime(date_obj).dt.date

### Apply the date conversion to Earnings_Date

In [53]:
#Earnings date prior to conversion
df["Earnings_Date"].head()

0    2020-01-28T21:30:00.000+0000
1    2020-01-30T21:01:00.000+0000
2    2020-01-29T21:05:04.000+0000
Name: Earnings_Date, dtype: object

In [54]:
#Conversion
df["Earnings_Date"] = castAsDate(df["Earnings_Date"])
df["Earnings_Date"].head()

0    2020-01-28
1    2020-01-30
2    2020-01-29
Name: Earnings_Date, dtype: object

### Convert Market Cap to Millions 

In [55]:
# Scale to millions and render with two decimals (the column then holds strings)
df["Market_Cap"] = (df["Market_Cap"] / 1000000).map('{:.2f}'.format)

### Convert Outstanding shares to 1000s

In [56]:
# Scale to thousands and render without decimals (the column then holds strings)
df["Outstanding_Shares"] = (df["Outstanding_Shares"] / 1000).map('{:.0f}'.format)

## Display the data

In [57]:
#Display final output
df.head(50)

Unnamed: 0,Stock_Ticker,name,Price,Change_%,Change_$,24hrs_Low,24hrs_High,52weeks_High,52weeks_Low,Market_Cap,...,Volume,Avg_Volume,Exchange,Open,Previous_Close,EPS,PE,Earnings_Date,Outstanding_Shares,Timestamp
0,AAPL,Apple Inc.,288.95,-1.36,-3.97,281.23,290.79,327.85,170.27,1264294.95,...,56544246,37622985,NASDAQ,282.0,292.92,12.595,22.941645,2020-01-28,4375480,1583639824
1,AMZN,"Amazon.com, Inc.",1901.09,-1.19,-22.94,1869.5,1910.3167,2185.95,1626.01,946381.59,...,5273580,4238501,NASDAQ,1875.0,1924.03,23.01,82.62016,2020-01-30,498000,1583639824
2,FB,"Facebook, Inc.",181.02,-2.24,-4.15,176.27,183.78,224.2,159.28,515988.46,...,24559550,16515742,NASDAQ,178.33,185.17,6.43,28.152412,2020-01-29,2405750,1583639824


In [63]:
df[['Stock_Ticker','Price']]

Unnamed: 0,Stock_Ticker,Price
0,AAPL,288.95
1,AMZN,1901.09
2,FB,181.02
