# fetch_tools

> Utilities that facilitate retrieval of data from the web

In [None]:
#| default_exp fetch_tools

In [None]:
#| export
from __future__ import annotations
import requests
from io import StringIO

import pandas as pd

In [None]:
# URL of the sample data file read by the usage examples in this notebook.
TEST_URL = 'https://www.dropbox.com/s/96xo9f1twlu3525/firmquarter_2022q1.csv?raw=1'

In [None]:
#| export
def get_text_file_from_url (url, #Data at this url must be readable with pandas.read_csv
             nrows: int=None, #Get only the first `nrows` lines from the file (the header line counts as one). If None, gets the entire file
             delimiter: str=',', #Field separator passed to pandas.read_csv
             headers: dict=None, #Headers to pass to the request
             skiprows: int=None, #Skip the first `skiprows` rows
             encoding: str='utf-8', #Encoding to use when reading the file
             **pd_read_csv_kwargs, #Extra keyword arguments forwarded to pandas.read_csv
    ) -> pd.DataFrame:
    """Download delimited text from `url` and parse it with pandas.read_csv.

    If `nrows` is None the whole body is downloaded and `skiprows` is handed to
    pandas.read_csv. Otherwise the response is streamed and only the `nrows`
    raw lines that follow the first `skiprows` lines are kept and parsed, so the
    rest of the file is never downloaded. `nrows` counts lines of text, not
    DataFrame rows: when the kept lines start with a header line, the result
    has `nrows - 1` data rows.

    Raises `requests.HTTPError` (via `raise_for_status`) on an HTTP error status
    and `UnicodeDecodeError` if the payload is not valid `encoding`.
    """

    if nrows is None:
        # Whole-file download: the body is fully read, so pandas can apply `skiprows` itself.
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        data = response.content.decode(encoding)
        return pd.read_csv(StringIO(data), delimiter=delimiter, skiprows=skiprows, **pd_read_csv_kwargs)

    # Index window [first_line, stop_line) of raw lines to keep.
    first_line = 0 if skiprows is None else skiprows
    stop_line = first_line + nrows
    lines = []
    # A streamed response keeps its connection checked out until the body is fully
    # consumed or the response is closed. We stop reading early, so close it explicitly
    # through the context manager (also runs if raise_for_status or decoding fails).
    with requests.get(url, stream=True, headers=headers) as response:
        response.raise_for_status()
        for i, line in enumerate(response.iter_lines()):
            if i >= stop_line: break
            if i >= first_line: lines.append(line.decode(encoding))
    data = '\n'.join(lines)
    # The skipped lines were already dropped above, so `skiprows` is not forwarded here.
    return pd.read_csv(StringIO(data), delimiter=delimiter, **pd_read_csv_kwargs)


In [None]:
# Example: fetch only the first 5 lines of the file; the header line counts as one,
# so 4 data rows come back.
df = get_text_file_from_url(TEST_URL, nrows=5, delimiter='\t')
df

Unnamed: 0,gvkey,date,PRisk,NPRisk,Risk,PSentiment,NPSentiment,Sentiment,PRiskT_economic,PRiskT_environment,...,Covid_Risk,SARS_Exposure,H1N1_Exposure,Zika_Exposure,Ebola_Exposure,Brexit_Exposure,Brexit_Neg_Sentiment,Brexit_Pos_Sentiment,Brexit_Net_Sentiment,Brexit_Risk
0,1004,2002q1,359.55072,2928.6014,168.98235,997.86415,5550.5807,469.39542,9001.563,6331.43,...,0,0,0,0,0,,,,,
1,1004,2002q2,0.0,0.0,0.0,1594.7321,-5656.6074,544.82417,0.0,0.0,...,0,0,0,0,0,,,,,
2,1004,2002q3,0.0,0.0,0.0,49.334494,-17818.418,318.47134,0.0,0.0,...,0,0,0,0,0,,,,,
3,1004,2003q3,0.0,0.0,0.0,2581.9441,81710.483,1314.8283,0.0,0.0,...,0,0,0,0,0,,,,,


In [None]:
# Example: extra keyword arguments (here `usecols`) are forwarded to pandas.read_csv.
df = get_text_file_from_url(TEST_URL, nrows=5, delimiter='\t', usecols=['gvkey','date', 'PRisk'])
df

Unnamed: 0,gvkey,date,PRisk
0,1004,2002q1,359.55072
1,1004,2002q2,0.0
2,1004,2002q3,0.0
3,1004,2003q3,0.0


In [None]:
#| hide
# Run nbdev's export step whenever this notebook is executed.
import nbdev; nbdev.nbdev_export()