In [12]:
import pandas as pd
import numpy as np
import os
from google.cloud import storage
from google.cloud import bigquery
import zipfile
from tqdm import tqdm
import numpy as np

In [106]:
# Setting up credintials for google cloud API
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/shayaan/ficc/eng-reactor-287421-112eb767e1b3.json"
PROJECT_ID = "eng-reactor-287421"
TABLE_ID = "eng-reactor-287421.mo_test.spRatingData"

In [42]:
table_headers = ["ratingDetailId",
                 "entitySymbolValue",
                 "instrumentSymbolValue",
                 "securitySymbolValue",
                 "objectTypeId",
                 "orgDebtTypeCode",
                 "ratingTypeCode",
                 "currentRatingSymbol",
                 "ratingSymbol",
                 "ratingDate",
                 "creditwatch",
                 "outlook",
                 "creditwatchDate",
                 "outlookDate",
                 "priorRatingSymbol",
                 "priorCreditwatch",
                 "priorOutlook",
                 "ratingQualifier",
                 "regulatoryIndicator",
                 "regulatoryQualifier",
                 "ratingActionWord",
                 "CWOLActionWord",
                 "cwolInd",
                 "maturityDate",
                 "CUSIP",
                 "CINS",
                 "ISIN"]

In [3]:
def getFile(bucket, filename):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket)
    blob = bucket.blob(filename)
    path_to_zip_file = 'zipfiles/rating.zip'
    path_to_extracted_file = 'extractedfiles/rating'
    
    print('Downloading File')
    blob = blob.download_to_filename(path_to_zip_file)
    print('Download Completed')
    # Deleting storage client
    del storage_client
    
    print('Unzipping files')
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(path_to_extracted_file)
    print('Unzip completed')
    
    print('Reading file')
    with open(path_to_extracted_file + '/spRatingData.txt','r') as file:
        rating_data = file.read()
    print('File read')
    
    return rating_data

In [100]:
def parseFile(file):
    print("Beginning Parsing")
    file = file.split("#@#@#")
    data = []
    print("Number of rows:{}".format(len(file)))
    count = 0
    for element in tqdm(file):
        element = element.split("'")
        element = list(filter(lambda a: a != '', element))
        ind = 0
        flag = False
        parsed_data = []
        while ind < len(element):
            if element[ind] == '~' and flag == False:
                flag = True
                ind += 1
                continue

            if element[ind] == '~' and flag == True: 
                parsed_data.append(np.nan)

            elif element[ind] != '~':
                flag = False
                parsed_data.append(element[ind])

            ind += 1

        #Special case of last element being null
        if len(element) > 0 and element[-1] == '~':
            parsed_data.append(np.nan)

        data.append(parsed_data)
    parsed_dataframe = pd.DataFrame(data=data,columns=table_headers)

    # Converting data to right datatype
    parsed_dataframe.ratingDetailId = parsed_dataframe.ratingDetailId.astype(np.int64)
    parsed_dataframe.entitySymbolValue = parsed_dataframe.entitySymbolValue.astype(str)
    parsed_dataframe.instrumentSymbolValue = parsed_dataframe.instrumentSymbolValue.astype(str)
    parsed_dataframe.securitySymbolValue = parsed_dataframe.securitySymbolValue.astype(str)
    parsed_dataframe.objectTypeId = parsed_dataframe.objectTypeId.astype(np.int64)
    parsed_dataframe.orgDebtTypeCode = parsed_dataframe.orgDebtTypeCode.astype(str)
    parsed_dataframe.ratingTypeCode = parsed_dataframe.ratingTypeCode.astype(str)
    parsed_dataframe.currentRatingSymbol = parsed_dataframe.currentRatingSymbol.astype(str)
    parsed_dataframe.ratingSymbol = parsed_dataframe.ratingSymbol.astype(str)
    parsed_dataframe.ratingDate = pd.to_datetime(parsed_dataframe.ratingDate)
    parsed_dataframe.creditwatch = parsed_dataframe.creditwatch.astype(str)
    parsed_dataframe.outlook = parsed_dataframe.outlook.astype(str)
    parsed_dataframe.creditwatchDate = pd.to_datetime(parsed_dataframe.creditwatchDate)
    parsed_dataframe.outlookDate = pd.to_datetime(parsed_dataframe.outlookDate)
    parsed_dataframe.priorRatingSymbol = parsed_dataframe.priorRatingSymbol.astype(str)
    parsed_dataframe.priorCreditwatch = parsed_dataframe.priorCreditwatch.astype(str)
    parsed_dataframe.priorOutlook = parsed_dataframe.priorOutlook.astype(str)
    parsed_dataframe.ratingQualifier = parsed_dataframe.ratingQualifier.astype(str)
    parsed_dataframe.regulatoryIndicator = parsed_dataframe.regulatoryIndicator.astype(str)
    parsed_dataframe.regulatoryQualifier = parsed_dataframe.regulatoryQualifier.astype(str)
    parsed_dataframe.ratingActionWord = parsed_dataframe.ratingActionWord.astype(str)
    parsed_dataframe.CWOLActionWord = parsed_dataframe.CWOLActionWord.astype(str)
    parsed_dataframe.cwolInd = parsed_dataframe.cwolInd.astype(str)
    parsed_dataframe.maturityDate = pd.to_datetime(parsed_dataframe.maturityDate)
    parsed_dataframe.CUSIP = parsed_dataframe.CUSIP.astype(str)
    parsed_dataframe.CINS = parsed_dataframe.CINS.astype(str)
    parsed_dataframe.ISIN = parsed_dataframe.ISIN.astype(str)
    
    print('Parsing Completed')
    return parsed_dataframe


In [111]:
def getSchema():
    schema = [
                bigquery.SchemaField("ratingDetailId", "INT64",mode="REQUIRED"),
                bigquery.SchemaField("entitySymbolValue","STRING"),
                bigquery.SchemaField("instrumentSymbolValue","STRING",mode="NULLABLE"),
                bigquery.SchemaField("securitySymbolValue","STRING",mode="NULLABLE"),
                bigquery.SchemaField("objectTypeId","INT64", mode="NULLABLE"),
                bigquery.SchemaField("orgDebtTypeCode","STRING",mode="NULLABLE"),
                bigquery.SchemaField("ratingTypeCode","STRING", mode="NULLABLE"),
                bigquery.SchemaField("currentRatingSymbol","STRING",mode="NULLABLE"),
                bigquery.SchemaField("ratingSymbol","STRING", mode="NULLABLE"),
                bigquery.SchemaField("ratingDate","TIMESTAMP", mode="NULLABLE"),
                bigquery.SchemaField("creditwatch","STRING",mode="NULLABLE"),
                bigquery.SchemaField("outlook","STRING",mode="NULLABLE"),
                bigquery.SchemaField("creditwatchDate","TIMESTAMP",mode="NULLABLE"),
                bigquery.SchemaField("outlookDate","TIMESTAMP",mode="NULLABLE"),
                bigquery.SchemaField("priorRatingSymbol","STRING",mode="NULLABLE"),
                bigquery.SchemaField("priorCreditwatch","STRING",mode="NULLABLE"),
                bigquery.SchemaField("priorOutlook","STRING",mode="NULLABLE"),
                bigquery.SchemaField("ratingQualifier","STRING",mode="NULLABLE"),
                bigquery.SchemaField("regulatoryIndicator","STRING",mode="NULLABLE"),
                bigquery.SchemaField("regulatoryQualifier","STRING",mode="NULLABLE"),
                bigquery.SchemaField("ratingActionWord","STRING",mode="NULLABLE"),
                bigquery.SchemaField("CWOLActionWord","STRING",mode="NULLABLE"),
                bigquery.SchemaField("cwolInd","STRING",mode="NULLABLE"),
                bigquery.SchemaField("maturityDate","TIMESTAMP",mode="NULLABLE"),
                bigquery.SchemaField("CUSIP","STRING",mode="NULLABLE"),
                bigquery.SchemaField("CINS","STRING",mode="NULLABLE"),
                bigquery.SchemaField("ISIN","STRING",mode="NULLABLE")
            ]
    return schema
    
def uploadData(parsed_dataframe):
    client = bigquery.Client(project=PROJECT_ID, location="US")
    job_config = bigquery.LoadJobConfig(schema = getSchema(),
                                       write_disposition="WRITE_APPEND"
                                       )
    job = client.load_table_from_dataframe(parsed_dataframe, TABLE_ID,job_config=job_config)

    try:
        job.result()
        print("Upload Successful")
    except Exception as e:
        print("Failed to Upload")
        raise e

    

In [5]:
file = getFile('edx-spglobal-rating-history','Products/RXSPGRatingsPF2007Daily/RXSPGRatingsPF2007DailyFull20210214094505.zip')

Downloading File
Download Completed
Unzipping files
Unzip completed
Reading file
File read


In [107]:
parsed_dataframe = parseFile(file[:10000])

100%|██████████| 56/56 [00:00<00:00, 35172.36it/s]

Beginning Parsing
Number of rows:56
Parsing Completed





In [112]:
parsed_dataframe.head()

Unnamed: 0,ratingDetailId,entitySymbolValue,instrumentSymbolValue,securitySymbolValue,objectTypeId,orgDebtTypeCode,ratingTypeCode,currentRatingSymbol,ratingSymbol,ratingDate,...,ratingQualifier,regulatoryIndicator,regulatoryQualifier,ratingActionWord,CWOLActionWord,cwolInd,maturityDate,CUSIP,CINS,ISIN
0,112750,,83435,125797,21,,STDLONG,A/A-1,A/A-1,1999-01-07 14:45:53,...,,,,,,,2029-06-01,,,
1,112773,,83432,125794,21,,STDLONG,A+/A-1,A+/A-1,1999-01-07 14:45:53,...,,,,,,,2028-12-01,,,
2,113036,,83150,125466,21,,STDLONG,A/A-1,A/A-1,1998-12-31 14:45:58,...,,,,,,,2013-12-01,,,
3,113058,,83147,125463,21,,STDLONG,AAA/A-1+,AAA/A-1+,1998-12-31 14:45:58,...,,,,,,,2028-12-01,,,
4,113059,,83148,125464,21,,STDLONG,A/A-1,A/A-1,1998-12-31 14:45:58,...,,,,,,,2013-12-01,,,


In [109]:
uploadData(parsed_dataframe)