# Scraping stock data

## Import the required packages

In [5]:
import numpy as np
import pandas as pd

import requests # for http requests
from bs4 import BeautifulSoup # for html parsing and scraping
import bs4

from fastnumbers import isfloat
from fastnumbers import fast_float
from multiprocessing.dummy import Pool as ThreadPool

import matplotlib.pyplot as plt
import seaborn as sns
import json
from tidylib import tidy_document # for tidying incorrect html

sns.set_style('whitegrid')
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## String to float conversion

In [6]:
def ffloat(string):
    """Convert a scraped value to a number, yielding NaN when it cannot be parsed.

    Parameters
    ----------
    string : str, int, float, NumPy numeric scalar, or None
        The value to convert. Text such as ``"1,234.5 Cr"`` or ``"12.5%"`` is
        reduced to its first whitespace-separated token, stripped of commas
        and percent signs, and then parsed.

    Returns
    -------
    number
        * ``np.nan`` if ``string`` is ``None``, blank, or not parseable.
        * ``string`` itself, unchanged, if it is already numeric (ints are
          returned as ints, not cast to float).
        * The parsed float otherwise.
    """
    if string is None:
        return np.nan
    # Already numeric: hand it back untouched. ``np.number`` covers every
    # NumPy scalar width (int32, float32, ...), not only int64/float64.
    if isinstance(string, (int, float, np.number)):
        return string
    # ``split()`` with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so " 12.5" is parsed instead of being lost
    # as NaN (``split(' ')[0]`` would return '' for it).
    tokens = string.split()
    if not tokens:
        # Empty or whitespace-only text carries no number.
        return np.nan
    return fast_float(tokens[0].replace(',', '').replace('%', ''), default=np.nan)

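The function above returns the input unchanged if it is already a float or an int, and returns NaN for `None`. Otherwise it keeps only the first token of the string (for example, `'1,234.5 Cr'` becomes `'1,234.5'`), removes commas and `%` signs, and converts the result to a float, falling back to NaN if the conversion fails.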

Next, we define a helper that applies the same conversion to every element of a list.

In [7]:
def ffloat_list(string_list):
    """Apply ``ffloat`` to every element of ``string_list``.

    Returns a new list holding the converted values in the same order as
    the input iterable.
    """
    converted = [ffloat(entry) for entry in string_list]
    return converted

We also want to collapse runs of whitespace within a string into single spaces.

In [8]:
def remove_multiple_spaces(string):
    """Collapse every run of whitespace in ``string`` into a single space.

    Leading and trailing whitespace is removed as well, because the text is
    split on whitespace and re-joined with single spaces.

    Parameters
    ----------
    string : object
        The value to normalise. Anything that is not a ``str`` (or a ``str``
        subclass) is returned unchanged.

    Returns
    -------
    object
        A plain ``str`` with normalised spacing, or the original object when
        it is not text.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be returned un-normalised.
    if isinstance(string, str):
        return ' '.join(string.split())
    return string

## Make HTTP request

In [9]:
# Fetch a simple demo page; give up if the server takes longer than 240 seconds.
response = requests.get("http://www.example.com/", timeout=240)
# Each bare expression below is displayed, because ast_node_interactivity
# was set to "all" in the setup cell.
response.status_code
# Raw response body as bytes (not decoded text).
response.content

200

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 50px;\n        background-color: #fff;\n        border-radius: 1em;\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        body {\n            background-color: #fff;\n        }\n        div {\n            width: auto;\n            margin: 0 auto;\n            border-radius: 0;\n            padding: 1em;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\

Next, we request JSON content and parse it.

In [16]:
# Request a sample JSON document from a public placeholder API.
url = "https://jsonplaceholder.typicode.com/posts/1"
response = requests.get(url, timeout=240)
response.status_code
# Display the body parsed as JSON.
response.json()

# Parse the body again, this time keeping the result, and list its keys.
page_response = response.json()
page_response.keys()

200

{'userId': 1,
 'id': 1,
 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit',
 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}

dict_keys(['userId', 'id', 'title', 'body'])

In [19]:
# HTML wraps a markup string so the notebook renders it instead of showing
# the raw text. It is imported from IPython.display, the public import path
# for the display classes, rather than from the internal IPython.core module.
from IPython.display import HTML
HTML("<b>Rendered HTML</b>")

In [20]:
# Download the stock quote page (240-second timeout).
response = requests.get("https://www.moneycontrol.com/india/stockpricequote/auto-2-3-wheelers/heromotocorp/HHM", timeout=240)
# Parse the raw bytes with the "html.parser" backend.
page_content = BeautifulSoup(response.content, "html.parser")
# Render the result of the <h1> lookup as HTML in the notebook.
# NOTE(review): if no <h1> is found, find() presumably returns None and the
# cell would render the text "None" - confirm against the live page.
HTML(str(page_content.find("h1")))

In [22]:
# Download and parse the stock quote page (240-second timeout).
# NOTE(review): this is the same URL requested by the previous cell, so the
# page is fetched over the network a second time here.
response = requests.get("https://www.moneycontrol.com/india/stockpricequote/auto-2-3-wheelers/heromotocorp/HHM", timeout=240)
content = BeautifulSoup(response.content, "html.parser")

# Look up the <div> whose id attribute is "b_changetext".
# NOTE(review): presumably this div holds the price-change text - verify
# against the page markup.
price_div = content.find("div",attrs={"id":'b_changetext'})
# Render the located element (or the text "None" if the lookup found nothing).
HTML(str(price_div))