In [1]:
import pandas as pd
from datetime import datetime
import pickle

In [60]:
class DataExtractor:
    def __init__(self, invoices, expired_invoices):
        self.invoices = invoices
        self.expired_invoices = expired_invoices
        self.df = None
        self.invoices_data = None
        self.expired_invoices_data = None
        
        
    def loading_data(self):
        with open(self.invoices, 'rb') as file:
            self.invoices_data = pickle.load(file)
        with open(self.expired_invoices, 'r') as file:
            expired_invoices_content = file.read().strip() 
            self.expired_invoices_data = set(map(int, expired_invoices_content.split(', ')))
    
    def transform_data(self):
        x = []
        type_conversion = {0: 'Material', 1: 'Equipment', 2: 'Service', 3: 'Other'}
        
        for i in self.invoices_data:
            invoice_id = i.get('id')  
        
            if isinstance(invoice_id, str) and invoice_id[-1] == 'o':
                invoice_id = invoice_id[:-1] + '0' 

        # Convert modified invoice id to int
            try:
                invoice_id_int = int(invoice_id)
            except ValueError:
                print("Invalid invoice id:", invoice_id)
                continue  
            
        
        created_on = datetime.strptime(i.get('created_on'), '%Y-%m-%d')
        items = i.get('items')
        invoice_total = 0 
       
        for item in items:
            try:
                quantity = int(item['quantity']) 
                unit_price = int(item['item']['unit_price'])  
                invoice_total += quantity * unit_price  
                
                # Process each item
                invoiceitem_id = int(item['item']['id']) 
                invoiceitem_name = item['item']['name']
                type_id = int(item['item']['type'])  
                invoiceitem_type = type_conversion.get(type_id, 'Unknown')
                total_price = unit_price * quantity
                percentage_in_invoice = total_price / invoice_total
                is_expired = invoice_id_int in self.expired_invoices_data
                
                x.append({
                    'invoice_id': invoice_id,
                    'created_on': created_on,
                    'invoiceitem_id': invoiceitem_id,
                    'invoiceitem_name': invoiceitem_name,
                    'type': invoiceitem_type,
                    'unit_price': unit_price,
                    'total_price': total_price,
                    'percentage_in_invoice': percentage_in_invoice,
                    'is_expired': is_expired
                })
            except ValueError:
                print("Invalid data for item:", item)
    
        self.df = pd.DataFrame(x)
        self.df = self.df.astype({'invoice_id': int, 'invoiceitem_id': int, 'type': str, 'unit_price': int, 'total_price': int, 'percentage_in_invoice': float, 'is_expired': bool})
        self.df = self.df.sort_values(by=['invoice_id', 'invoiceitem_id']).reset_index(drop=True)
        
    def get_dataframe(self):
        return self.df
    
    def save_to_csv(self, file):
        if self.df is not None:
            self.df.to_csv(file, index=False)
        else:
            print("DataFrame is empty :(")

        
    

In [61]:
data_extractor = DataExtractor( r"C:\Users\Ruzan\Downloads\data\invoices_new.pkl", r"C:\Users\Ruzan\Downloads\data\expired_invoices.txt")
data_extractor.loading_data()
data_extractor.transform_data()
final_df = data_extractor.get_dataframe()
data_extractor.save_to_csv("invoices_data.csv")

final_df

Invalid invoice id: 365371O
Invalid invoice id: 374089O
Invalid invoice id: 397723O
Invalid invoice id: 325156O
Invalid invoice id: 326649O
Invalid invoice id: 385290O
Invalid invoice id: 381476O
Invalid invoice id: 335226O
Invalid invoice id: 371786O
Invalid invoice id: 331902O
Invalid invoice id: 308237O
Invalid invoice id: 340299O
Invalid invoice id: 373375O


Unnamed: 0,invoice_id,created_on,invoiceitem_id,invoiceitem_name,type,unit_price,total_price,percentage_in_invoice,is_expired
0,377960,2019-02-22,123242,ii_123242,Equipment,110,550,1.0,True
1,377960,2019-02-22,144902,ii_144902,Service,188,1316,0.39413,True
2,377960,2019-02-22,196386,ii_196386,Other,150,900,0.62069,True
3,377960,2019-02-22,196707,ii_196707,Service,191,573,0.283243,True
