<a href="https://colab.research.google.com/github/sadikaVER/xml_parser/blob/main/SteelEye_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages
>- boto3
>- s3fs


In [None]:
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install s3fs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Mount Drive

In [None]:
# Mount Google Drive so that the working folder used by the rest of the
# notebook is reachable under /content/drive/.
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Working folder on the mounted Drive; it holds the source XML and receives
# every file the notebook produces. The trailing slash is intentional: it keeps
# plain string concatenation such as path+filename valid.
path="/content/drive/My Drive/dataset/Assignment/internshala/SteelEye/"

# Import Packages
> * pandas: for building tables and writing CSV files
> * xml.etree.ElementTree: for parsing XML
> * os: for file-path handling
> * shutil: for saving downloads and unpacking archives
> * boto3: for AWS S3 bucket operations
> * logging: for logging
> * urllib: for downloading files

In [None]:
import logging as log
import os
import pprint
import shutil
import urllib
import urllib.request
import warnings
import xml.etree.ElementTree as ET

import boto3
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# Parse the given source XML file and store its values in a .csv file

In [None]:
def parse_source_file(path,filename):
  '''
  Parse the source XML index file and save its records as a CSV table.

  Every ``doc`` element found in the XML becomes one row. Inside a ``doc``,
  an element whose attribute value is one of the known field names
  (``checksum``, ``download_link``, ...) supplies the text for the matching
  column. A field that is absent from a ``doc`` is left empty in that row,
  so rows stay aligned even when records are incomplete.

  Params:
    path     (str): folder containing the source file; the CSV is written here too
    filename (str): name of the source XML file inside ``path``

  Returns:
    None. The result is written to ``demo_xml_parsed.csv`` inside ``path``.
    Any exception is logged as an error and not re-raised.
  '''
  # Look the logger up here so the function does not rely on a global that
  # only exists after another cell has run.
  logg=log.getLogger(__name__)

  # XML attribute value -> CSV column header; dict order is the column order.
  # Header spellings (including "Cheksum") are kept as-is so files written
  # earlier and any code reading them keep working.
  columns={
    "checksum":"Cheksum",
    "download_link":"Link",
    "publication_date":"Publication_date",
    "_root_":"Root",
    "published_instrument_file_id":"Publication_inst_file",
    "id":"ID",
    "file_name":"Filename",
    "file_type":"File_type",
    "_version_":"Version",
    "timestamp":"Timestamp",
  }

  try:
    logg.info("Loading Source xml file ...")
    # os.path.join works whether or not ``path`` ends with a separator
    tree=ET.parse(os.path.join(path,filename))
    root=tree.getroot()
    logg.info("Parsing Source xml file ...")
    logg.info("Extracting information within doc tag ...")

    records=[]
    for doc in root.iter("doc"):
      record={}
      for child in doc.iter():
        for value in child.attrib.values():
          if value in columns:
            record[columns[value]]=child.text
      records.append(record)

    if not records:
      logg.warning("No doc elements found in source xml file")

    logg.info("Storing the extracted information from Source xml file to demo_xml_parsed file in tabular format with .csv extension ...")
    # Passing ``columns`` fixes the header order and fills missing fields with NaN
    df=pd.DataFrame(records,columns=list(columns.values()))

    logg.info("Saving parsed  file to google drive ...")
    df.to_csv(os.path.join(path,"demo_xml_parsed.csv"),sep=",",index=False)
  except Exception as e:
    logg.error(f"Error : {str(e)}")


# Download the files whose file type is DLTINS from the links in the parsed file, then save and unzip them on Drive.
    

In [None]:
def extract_files_from_link(path):
  '''
  Download every DLTINS archive listed in the parsed index and unpack it.

  1. Read ``demo_xml_parsed.csv`` from ``path`` and keep the rows whose
     ``File_type`` is ``DLTINS``.
  2. Download each ``Link`` to ``myzip<i>.zip`` inside ``path``.
  3. Unpack each archive into the ``data_file`` folder inside ``path``.

  Params:
    path  (str): folder holding the parsed CSV; downloads and the
                 ``data_file`` folder are created here

  Returns:
    None. Any exception is logged as an error and not re-raised, so a
    failing download stops the remaining downloads.
  '''
  # Look the logger up here so the function does not rely on a global that
  # only exists after another cell has run.
  logg=log.getLogger(__name__)
  try :
    # load the parsed demo_xml_parsed.csv file
    df=pd.read_csv(os.path.join(path,"demo_xml_parsed.csv"),sep=",")
    # select only those files whose type is DLTINS
    df=df[df["File_type"]=="DLTINS"]
    target_dir=os.path.join(path,"data_file")
    logg.info("Extract the files from link...")
    # dropna: a row without a link has nothing to download
    for i,url in enumerate(df["Link"].dropna().tolist()):
      file_name=os.path.join(path,'myzip'+str(i)+'.zip')
      logg.info("Saving file  and unzipping: "+url.split("/")[-1].split(".")[0]+" to drive")
      with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
      # NOTE(review): the archive comes from a URL read out of the index file;
      # unpacking archives from untrusted sources without inspection is unsafe.
      shutil.unpack_archive(file_name,target_dir)
  except Exception as e :
    logg.error(f"Error : {str(e)}")



# Parse the extracted DLTINS XML files and store the following information in a .csv file:
>- ID
>- FullNm
>- ClssfctnTp
>- CmmdtyDerivInd
>- NtnlCcy
>- Issr

In [None]:
def xml_data(path):
  '''
  Collect selected fields from the unpacked DLTINS XML files into one CSV.

  Every file in ``<path>/data_file`` whose name starts with ``DLTINS`` is
  parsed (in sorted name order, so the row order is reproducible). The text
  of each ``Id``, ``FullNm``, ``ClssfctnTp``, ``CmmdtyDerivInd``, ``NtnlCcy``
  and ``Issr`` element in the ``auth.036.001.02`` namespace is appended to
  the column of the same field. ``Id`` elements whose text is empty or has
  four characters or fewer are skipped.

  Columns are filled independently of each other; if they end up with
  different lengths the table cannot be built and the error is logged.

  Params:
    path (str): folder that contains ``data_file`` and receives ``final_file.csv``

  Returns:
    None. Any exception is logged as an error and not re-raised.
  '''
  # Look the logger up here so the function does not rely on a global that
  # only exists after another cell has run.
  logg=log.getLogger(__name__)

  namespace="{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}"
  # qualified tag -> CSV column header; dict order is the column order.
  # The header "CmmdtyDerivlnd" is kept exactly as previously written.
  columns={
    namespace+"Id":"ID",
    namespace+"FullNm":"FullName",
    namespace+"ClssfctnTp":"ClssfctnTp",
    namespace+"CmmdtyDerivInd":"CmmdtyDerivlnd",
    namespace+"NtnlCcy":"NtnlCcy",
    namespace+"Issr":"Issr",
  }
  try:
    data_dir=os.path.join(path,"data_file")
    # only files whose name starts with DLTINS; sorted for a stable row order
    files=sorted(f for f in os.listdir(data_dir) if f.startswith("DLTINS"))

    values={header:[] for header in columns.values()}
    for fl in files:
      root=ET.parse(os.path.join(data_dir,fl)).getroot()
      for child in root.iter():
        header=columns.get(child.tag)
        if header is None:
          continue
        # Short Id values are not wanted in the ID column (presumably they are
        # other identifier codes - TODO confirm). ``or ""`` covers an Id
        # element without text, where child.text is None.
        if header=="ID" and len(child.text or "")<=4:
          continue
        values[header].append(child.text)

    df=pd.DataFrame(values)
    df.to_csv(os.path.join(path,"final_file.csv"),sep=",",index=False)

  except Exception as e:
    logg.error(f"Error : {str(e)}")

# Upload the final CSV file to AWS S3


In [None]:
def aws_file_upload(path,bucket_name,filename):
  '''
  Upload a file from ``path`` to an AWS S3 bucket.

  No credentials or region are passed in: the S3 resource is created without
  explicit settings, so boto3 resolves them from its default sources (this
  notebook sets the ``AWS_*`` environment variables before calling).

  Params:
    path        (str): folder containing the file to upload
    bucket_name (str): name of the destination bucket
    filename    (str): name of the file inside ``path``; also used as the
                       object key in the bucket

  Returns:
    True once the upload call has returned without raising.

  Raises:
    Exceptions from boto3 are not caught here and propagate to the caller.
  '''
  # Look the logger up here so the function does not rely on a global that
  # only exists after another cell has run.
  logg=log.getLogger(__name__)

  s3=boto3.resource(service_name='s3')

  logg.info("Uploading the final file to s3 Bucket")
  # upload file to s3 bucket
  s3.Bucket(bucket_name).upload_file(Filename=os.path.join(path,filename),Key=filename)
  return True
    
    
  
  

# Main Function

# Note:
## ⚛ To obtain AWS credentials, create an AWS (IAM) user that is allowed to upload files to and read files from your S3 bucket. For a walkthrough, [follow this link](https://www.youtube.com/watch?v=JKlOlDFwsao).

In [None]:
import getpass

os.environ["AWS_DEFAULT_REGION"] = 'us-east-1'

# Do not paste real keys into the notebook: they would be saved with it.
# A value that is already present in the environment is kept; otherwise the
# key is asked for interactively and is not echoed or stored in the notebook.
for _var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
  if not os.environ.get(_var):
    os.environ[_var] = getpass.getpass(f"{_var}: ")

In [None]:
if __name__=="__main__":
  # basicConfig() installs a handler only when the root logger has none, so
  # calling it once is enough; the level is then lowered on the root logger
  # directly so that INFO messages are emitted.
  log.basicConfig()
  log.root.setLevel(log.NOTSET)

  # Module-level logger used by the progress messages below.
  logg = log.getLogger(__name__)

  source_filename="demo.xml"

  # NOTE: steps 1-3 catch and log their own errors instead of raising, so the
  # "completed" messages below are written even when a step has failed;
  # check the log for "Error :" lines.
  parse_source_file(path,source_filename)
  logg.info("Step:1 Parse Source file completed ")

  extract_files_from_link(path)
  logg.info("Step:2 Extract , save and unzip xml file from link completed")

  xml_data(path)
  logg.info("Step:3 Parse Extracted files and convert into .csv file completed")

  # destination bucket and the file to upload (also used as the object key)
  bucket_name='xml2data'
  filename="final_file.csv"

  aws_file_upload(path,bucket_name,filename)

  logg.info("Step:4 Upload to aws s3 Bucket completed")
  


INFO:__main__:Loading Source xml file ...
INFO:__main__:Parsing Source xml file ...
INFO:__main__:Extracting information within doc tag ...
INFO:__main__:Storing the extracted information from Source xml file to demo_xml_parsed file in tabular format with .csv extension ...
INFO:__main__:Saving parsed  file to google drive ...
INFO:__main__:Step:1 Parse Source file completed 
INFO:__main__:Extract the files from link...
INFO:__main__:Saving file  and unzipping: DLTINS_20210117_01of01 to drive
INFO:__main__:Saving file  and unzipping: DLTINS_20210119_01of02 to drive
INFO:__main__:Saving file  and unzipping: DLTINS_20210119_02of02 to drive
INFO:__main__:Saving file  and unzipping: DLTINS_20210118_01of01 to drive
INFO:__main__:Step:2 Extract , save and unzip xml file from link completed
INFO:__main__:Step:3 Parse Extracted files and convert into .csv file completed
INFO:__main__:Step:4 Upload to aws s3 Bucket completed
