### Adding the missing `tile_id` index to the tiled dataset

In [1]:
import gc
from tqdm.auto import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import os
# Set the Arrow CPU thread count to 8 (same value as the helper this was adapted from).
pa.set_cpu_count(8)

# Extraction job whose tiled articles are missing a tile_id column.
job_id='dj-synhub-extraction-lkbi9fy6zepu8rcjuxqhjwkbld52wgt0-ouutovygqc'
input_dir = f"../../extractions/{job_id}/input_articles_tiled"
output_dir = f"../../extractions/{job_id}/input_articles_tiled_indexed"
os.makedirs(output_dir, exist_ok=True)

dataset = ds.dataset(input_dir, format="parquet")
# Ask Arrow for the row count directly instead of pulling every record batch
# (with all of its columns) into Python only to call len() on it.
total_rows = dataset.count_rows()
print(f"Total rows: {total_rows}")


Total rows: 3058995


In [None]:
# newer code - test next time
# Copies every parquet file under input_dir to the same year=/month=/<file>
# location under output_dir, adding a running `tile_id` column on the way.
# NOTE(review): sorted() compares directory names as strings, so an unpadded
# 'month=10' would be visited before 'month=2'; tile_id follows that visiting
# order. Presumably existing downstream tile_ids rely on it -- confirm before
# changing the ordering.
year_dirs = sorted([d for d in os.listdir(input_dir) if d.startswith('year=')])
current_index = 0  # next tile_id to assign; equals the number of rows handled so far

for year_dir in tqdm(year_dirs, desc="Processing years"):
    year_path = os.path.join(input_dir, year_dir)
    output_year_path = os.path.join(output_dir, year_dir)
    os.makedirs(output_year_path, exist_ok=True)

    month_dirs = sorted([d for d in os.listdir(year_path) if d.startswith('month=')])

    for month_dir in tqdm(month_dirs, desc=f"Processing months in {year_dir}", leave=False):
        month_path = os.path.join(year_path, month_dir)
        output_month_path = os.path.join(output_year_path, month_dir)
        os.makedirs(output_month_path, exist_ok=True)

        # Get all parquet files for this month
        parquet_files = sorted([f for f in os.listdir(month_path) if f.endswith('.parquet')])

        for file in tqdm(parquet_files, desc=f"Files in {month_dir}", leave=False):
            file_path = os.path.join(month_path, file)
            output_file_path = os.path.join(output_month_path, file)

            # Read file with pandas
            df = pd.read_parquet(file_path)

            # Add tile_id column: a contiguous block continuing from the previous file
            df['tile_id'] = range(current_index, current_index + len(df))
            current_index += len(df)

            # Write the file directly to the output location with same name
            df.to_parquet(output_file_path, compression="zstd", index=False)

            # Clean up before the next file is loaded
            del df
            gc.collect()

# tile_ids are 0-based, so the last one assigned is one less than the row count
# (None when no rows were found at all).
final_index = current_index - 1 if current_index else None
print(f"Processed {current_index} rows, final index: {final_index}")
print(f"Output saved to: {output_dir}")

In [None]:
## older code - works
# Reads every parquet file under input_dir, adds explicit year/month columns
# (taken from the directory names) plus a running `tile_id`, and re-writes the
# rows into output_dir as a dataset partitioned by year and month.
# NOTE(review): re-running this cell into a non-empty output_dir presumably adds
# new files next to the existing ones instead of replacing them -- confirm, and
# clear output_dir first if a clean re-run is wanted.
year_dirs = sorted([d for d in os.listdir(input_dir) if d.startswith('year=')])

current_index = 0  # next tile_id to assign; equals the number of rows handled so far

# Rows per write_to_dataset call (same value as the helper this was adapted from).
batch_size = 250_000

for year_dir in tqdm(year_dirs, desc="Processing years"):
    year_path = os.path.join(input_dir, year_dir)
    # Extract the year value from the directory name
    year_value = int(year_dir.split('=')[1])

    month_dirs = sorted([d for d in os.listdir(year_path) if d.startswith('month=')])

    for month_dir in tqdm(month_dirs, desc=f"Processing months in {year_dir}", leave=False):
        month_path = os.path.join(year_path, month_dir)
        # Extract the month value from the directory name
        month_value = int(month_dir.split('=')[1])

        # Get all parquet files for this month
        parquet_files = sorted([f for f in os.listdir(month_path) if f.endswith('.parquet')])

        for file in tqdm(parquet_files, desc=f"Files in {month_dir}", leave=False):
            file_path = os.path.join(month_path, file)

            # Read file with pandas
            df = pd.read_parquet(file_path)

            # Add year and month columns explicitly based on directory names
            df['year'] = year_value
            df['month'] = month_value

            # Add tile_id column: a contiguous block continuing from the previous file
            df['tile_id'] = range(current_index, current_index + len(df))
            current_index += len(df)

            # Write the frame in slices of at most batch_size rows
            for start in range(0, len(df), batch_size):
                batch_df = df.iloc[start:start + batch_size]

                # Convert to table (the pandas index is not stored)
                tbl = pa.Table.from_pandas(batch_df, preserve_index=False)

                # Write using the same parameters as the helper
                pq.write_to_dataset(
                    tbl,
                    root_path=output_dir,
                    partition_cols=["year", "month"],
                    compression="zstd",
                    existing_data_behavior="overwrite_or_ignore"
                )

                # Clean up
                del tbl
                gc.collect()

            # Clean up the full dataframe
            del df
            gc.collect()



# tile_ids are 0-based, so the last one assigned is one less than the row count
# (None when no rows were found at all).
final_index = current_index - 1 if current_index else None
print(f"Processed {current_index} rows, final index: {final_index}")
print(f"Output saved to: {output_dir}")

#### Checking indexing result

In [7]:
import datetime as dt
import pandas as pd

# Load only the rows whose `year` partition matches filter_date, restricted to
# the three columns needed to eyeball the freshly written tile_id.
filter_date = dt.datetime(year=2025, month=1, day=1)
tiled_df = pd.read_parquet(
    f"../../extractions/{job_id}/input_articles_tiled_indexed",
    columns=["tile_id", "segments", "pub_date"],
    filters=[("year", "=", filter_date.year)],
)
tiled_df

Unnamed: 0,tile_id,segments,pub_date
0,1910601,Ownership Submission\n\nFORM 4\n\nCheck this b...,2025-01-17
1,1910602,"\n\nWashington, D.C. 20549STATEMENT OF CHANGES...",2025-01-17
2,1910603,"\n\nDigitalBridge Group, Inc. [DBRG] 5. Relati...",2025-01-17
3,1910604,\n\n(Instr. 3) 2. Transaction Date (Month/Day/...,2025-01-17
4,1910605,\n\n(Instr. 8) 5. Number of Derivative Securit...,2025-01-17
...,...,...,...
1274951,2940041,FORM 4\n\nCheck this box if no longer subject ...,2025-05-05
1274952,2940042,"\n\nWashington, D.C. 20549 STATEMENT OF CHANGE...",2025-05-05
1274953,2940043,\n\nCONSUMERS BANCORP INC /OH/ [CBKM] 5. Relat...,2025-05-05
1274954,2940044,\n\n(Month/Day/Year) 7. Title and Amount of Un...,2025-05-05


In [None]:
# The smallest tile_id in the 2025 slice must match the value obtained when
# tile_id is added on the whole file online.
assert tiled_df.tile_id.min() == 1784039, (
    f"unexpected first 2025 tile_id: got {tiled_df.tile_id.min()}, expected 1784039"
)

### Merging with ssim dedup result

In [1]:
import pandas as pd

job_id='dj-synhub-extraction-lkbi9fy6zepu8rcjuxqhjwkbld52wgt0-ouutovygqc'
# tile_id for 2025-01-01: 1784039 is itself the first 2025 tile (the indexing
# check asserts it is the 2025 minimum), so the comparison has to be inclusive;
# a strict `>` silently drops that tile.
first_2025_tile_id = 1784039
df = pd.read_parquet(f"../../extractions/{job_id}/input_articles_ssim.parquet")
# Keep only 2025 tiles and the two columns used further down.
df = df.loc[df.tile_id >= first_2025_tile_id, ['segments', 'tile_id']]
df

Unnamed: 0,segments,tile_id
478381,compared with 4Q23 Net Earnings of $116 millio...,1784043
478382,\n \n\nCapital ...,1784044
478383,"\n\nMr. Simmons continued, ""Net loan losses we...",1784045
478384,"\n\nMr. Simmons concluded, ""We're optimistic t...",1784046
478385,\n\n\n1 Comparisons noted in the bullet point...,1784047
...,...,...
821903,Interest will be paid semi-annually and the Of...,3058966
821904,\n\nThis press release may include projections...,3058971
821905,Credit Agricole S.A.\n\n02 May 2025\n\nFORM 8....,3058972
821906,"\n\n""It's the biggest single, buy-side deal in...",3058985


In [2]:
# cols we need for further processing
import datetime as dt
filter_date = dt.datetime(year=2025, month=1, day=1)

# Columns carried forward; post-processing and filtering happen later.
cols2use = [
    'source_name',
    'title',
    'company_codes',
    'modification_date',
    'region_codes',
    'segments',
    'num_segments',
    'an',
    'tile_id',
]
tiled_df = pd.read_parquet(
    f"../../extractions/{job_id}/input_articles_tiled_indexed",
    filters=[("year", "=", filter_date.year)],
    columns=cols2use,
)
tiled_df.shape

(1274956, 9)

## Filtering for region codes and preparing for run

In [3]:
# Region codes of interest, stored lower-cased so they can be compared with the
# values split out of the `region_codes` column.
region_codes = [
    code.lower()
    for code in (
        'AUST', 'BELG', 'CZREP', 'DEN', 'ESTNIA', 'FIN', 'FRA', 'GFR', 'ITALY',
        'LATV', 'IRE', 'ICEL', 'LITH', 'LUX', 'MALTA', 'NETH', 'NORW', 'POL',
        'LARIO', 'SWED', 'SWITZ', 'UK',
        'VIEN', 'BRUS', 'PRAGUE', 'COPEN', 'TALLIN', 'HELSNK', 'PARIS', 'BERLIN',
        'ROME', 'RIGA', 'DUBLIN', 'REYK', 'VILNIU', 'LUXCI', 'VALLE', 'AMSTR',
        'OSLO', 'WASW', 'MADRD', 'STOCK', 'BERN', 'DERRY',
        'EURZ', 'EEURZ', 'WEURZ',
    )
]
region_codes[:5]

['aust', 'belg', 'czrep', 'den', 'estnia']

In [4]:
from tqdm.auto import tqdm
# Register tqdm's `progress_apply` on pandas objects; the column transforms
# later in this cell call it.
tqdm.pandas()

def should_keep(region_codes_list, allowed_codes=None):
    """Return True when a row carries at least one region code of interest.

    Args:
        region_codes_list: the region codes attached to one row.
        allowed_codes: codes that make a row relevant. When omitted, the
            notebook-level `region_codes` list is used.

    Returns:
        bool: True if any allowed code occurs in `region_codes_list`,
        False otherwise (including when either collection is empty).
    """
    if allowed_codes is None:
        # Fall back to the notebook-level list, as the original one-argument calls expect.
        allowed_codes = region_codes
    # This is a keep-list check: one matching code is enough to keep the row.
    return any(code in region_codes_list for code in allowed_codes)

# Drop leading/trailing commas from the raw code string before splitting it.
tiled_df['region_codes'] = tiled_df['region_codes'].str.strip(',')
# One list of codes per row; non-string values become an empty list.
tiled_df['region_codes_list'] = tiled_df['region_codes'].progress_apply(
    lambda raw: [] if not isinstance(raw, str) else raw.split(',')
)
# Flag rows that carry at least one of the codes checked by should_keep.
tiled_df['regions_relevant'] = tiled_df['region_codes_list'].progress_apply(should_keep)

  0%|          | 0/1274956 [00:00<?, ?it/s]

  0%|          | 0/1274956 [00:00<?, ?it/s]

In [5]:
# Keep rows whose tile_id also appears in `df` and that were flagged as
# region-relevant, then drop the two helper columns used for that flag.
df2classify = (
    tiled_df.loc[tiled_df.tile_id.isin(df.tile_id) & tiled_df.regions_relevant]
    .copy()
    .drop(columns=['region_codes_list', 'regions_relevant'])
)

df2classify

Unnamed: 0,source_name,title,company_codes,modification_date,region_codes,segments,num_segments,an,tile_id
19,Gulf Oil & Gas,Cleanova expands its product offering with the...,"alvmar,cmbndi,dbvpl,percoi",2025-01-10 07:13:57.825,"eurz,uk,weurz",Enhances Cleanova’s presence in the UK and bro...,4,GUONGA0020250109el170007b,1910620
20,Gulf Oil & Gas,Cleanova expands its product offering with the...,"alvmar,cmbndi,dbvpl,percoi",2025-01-10 07:13:57.825,"eurz,uk,weurz","\n\nCleanova, a clean tech, market-leading glo...",4,GUONGA0020250109el170007b,1910621
21,Gulf Oil & Gas,Cleanova expands its product offering with the...,"alvmar,cmbndi,dbvpl,percoi",2025-01-10 07:13:57.825,"eurz,uk,weurz","\n\nCleanova’s CEO, Javaid Riaz, commented, “T...",4,GUONGA0020250109el170007b,1910622
22,Gulf Oil & Gas,Cleanova expands its product offering with the...,"alvmar,cmbndi,dbvpl,percoi",2025-01-10 07:13:57.825,"eurz,uk,weurz",\n\nJavaid Riaz explained what the transition ...,4,GUONGA0020250109el170007b,1910623
145,thesun.co.uk,HIGH STREET GIANT WHSmith ‘in secret talks to ...,"bhchcm,brraic,bsbro,lsexch,smwh",2025-01-26 07:33:12.758,"eurz,uk,weurz",The retail group have been in negotiations wit...,7,THESUK0020250125el1p001jl,1910746
...,...,...,...,...,...,...,...,...,...
1274940,thetimes.co.uk,"Who could buy Bet365? Bankers salivate, but a ...","btsfgl,cnyc,cvccap,drftkn,gamvc,grdmp,habsaa,p...",2025-05-04 07:19:19.010,"eurz,uk,weurz",\n\nJP Morgan was completely unaware that Coat...,7,TIMEUK0020250503el53000jh,2940030
1274941,thetimes.co.uk,"Who could buy Bet365? Bankers salivate, but a ...","btsfgl,cnyc,cvccap,drftkn,gamvc,grdmp,habsaa,p...",2025-05-04 07:19:19.010,"eurz,uk,weurz",\n\nBet365 has historically had a sizeable rem...,7,TIMEUK0020250503el53000jh,2940031
1274942,thetimes.co.uk,"Who could buy Bet365? Bankers salivate, but a ...","btsfgl,cnyc,cvccap,drftkn,gamvc,grdmp,habsaa,p...",2025-05-04 07:19:19.010,"eurz,uk,weurz",\n\nBut a sale risks bringing Coates into conf...,7,TIMEUK0020250503el53000jh,2940032
1274943,thetimes.co.uk,"Who could buy Bet365? Bankers salivate, but a ...","btsfgl,cnyc,cvccap,drftkn,gamvc,grdmp,habsaa,p...",2025-05-04 07:19:19.010,"eurz,uk,weurz","\n\nAccordingly a sale may be her only option,...",7,TIMEUK0020250503el53000jh,2940033


In [None]:
import sys, os
sys.path.append('../../')
from dotenv import load_dotenv
from src.factiva_api.taxonomy import init_client, get_company_code_mapping
from src.mapping.company_mapper import extract_company_names

# Read environment variables from the project-level .env file.
load_dotenv('../../.env')

user_key = os.environ.get("FACTIVA_SNAPSHOTS_USER_KEY")

# Build the client from the key and fetch the company-code mapping with it.
client = init_client(user_key)
companies_mapping = get_company_code_mapping(client)

# Derive a company_names value for every row from its company_codes value.
df2classify['company_names'] = df2classify['company_codes'].progress_apply(
    lambda codes: extract_company_names(codes, companies_mapping)
)



Duplicate indices found in 'companies' taxonomy. Keeping first. Note: this behavior is expected


  0%|          | 0/135285 [00:00<?, ?it/s]

In [7]:
# Persist the selected rows as CSV; the DataFrame index is not written.
df2classify.to_csv(
    f'../../extractions/{job_id}/relevant_articles_to_classify_2025.csv',
    index=False,
)

### Looking for article links

In [2]:
import datetime as dt
import pandas as pd
job_id='dj-synhub-extraction-lkbi9fy6zepu8rcjuxqhjwkbld52wgt0-ouutovygqc'

filter_date = dt.datetime(year=2025, month=1, day=1)
# All columns, restricted to the year/month partition given by filter_date.
partition_filters = [
    ("year", "=", filter_date.year),
    ("month", "=", filter_date.month),
]
tiled_df = pd.read_parquet(
    f"../../extractions/{job_id}/input_articles_tiled_indexed",
    columns=None,
    filters=partition_filters,
)

tiled_df

Unnamed: 0,source_name,title,snippet,body,section,word_count,source_code,industry_codes,company_codes,subject_codes,...,modification_date,region_codes,an,action,pub_date,segments,num_segments,tile_id,year,month
0,Securities and Exchange Commission (SEC) Filings,Digitalbridge Group Inc. - Statement of Change...,Access the original document here\n\nStatement...,Ownership Submission\n\nFORM 4\n\nCheck this b...,,857,SAEXC,",i831,ifinal,iinv,",nsassz,",c18,c181,c41,cactio,ccat,cdirdl,cgvfil,cintrf...",...,2025-01-18 07:24:27.686,",namz,usa,",SAEXC00020250117el1h00o06,add,2025-01-17,Ownership Submission\n\nFORM 4\n\nCheck this b...,6,1910601,2025,1
1,Securities and Exchange Commission (SEC) Filings,Digitalbridge Group Inc. - Statement of Change...,Access the original document here\n\nStatement...,Ownership Submission\n\nFORM 4\n\nCheck this b...,,857,SAEXC,",i831,ifinal,iinv,",nsassz,",c18,c181,c41,cactio,ccat,cdirdl,cgvfil,cintrf...",...,2025-01-18 07:24:27.686,",namz,usa,",SAEXC00020250117el1h00o06,add,2025-01-17,"\n\nWashington, D.C. 20549STATEMENT OF CHANGES...",6,1910602,2025,1
2,Securities and Exchange Commission (SEC) Filings,Digitalbridge Group Inc. - Statement of Change...,Access the original document here\n\nStatement...,Ownership Submission\n\nFORM 4\n\nCheck this b...,,857,SAEXC,",i831,ifinal,iinv,",nsassz,",c18,c181,c41,cactio,ccat,cdirdl,cgvfil,cintrf...",...,2025-01-18 07:24:27.686,",namz,usa,",SAEXC00020250117el1h00o06,add,2025-01-17,"\n\nDigitalBridge Group, Inc. [DBRG] 5. Relati...",6,1910603,2025,1
3,Securities and Exchange Commission (SEC) Filings,Digitalbridge Group Inc. - Statement of Change...,Access the original document here\n\nStatement...,Ownership Submission\n\nFORM 4\n\nCheck this b...,,857,SAEXC,",i831,ifinal,iinv,",nsassz,",c18,c181,c41,cactio,ccat,cdirdl,cgvfil,cintrf...",...,2025-01-18 07:24:27.686,",namz,usa,",SAEXC00020250117el1h00o06,add,2025-01-17,\n\n(Instr. 3) 2. Transaction Date (Month/Day/...,6,1910604,2025,1
4,Securities and Exchange Commission (SEC) Filings,Digitalbridge Group Inc. - Statement of Change...,Access the original document here\n\nStatement...,Ownership Submission\n\nFORM 4\n\nCheck this b...,,857,SAEXC,",i831,ifinal,iinv,",nsassz,",c18,c181,c41,cactio,ccat,cdirdl,cgvfil,cintrf...",...,2025-01-18 07:24:27.686,",namz,usa,",SAEXC00020250117el1h00o06,add,2025-01-17,\n\n(Instr. 8) 5. Number of Derivative Securit...,6,1910605,2025,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243873,Securities and Exchange Commission (SEC) Filings,JPMorgan Chase & Co. - Primary Offering Prospe...,Access the original document here\n\nPrimary O...,"January 8, 2025\n\nRegistration Statement Nos....",,8513,SAEXC,",i814,i81402,ibnk,ifinal,iibnk,","blfima,cnyc,davpw,deptct,deptr,fdic,irevs,jpms...",",c15,c151,c1512,cactio,ccat,cgvfil,ncat,nfact,...",...,2025-01-11 07:29:43.324,",namz,usa,",SAEXC00020250110el1a00dff,add,2025-01-10,"\n\nNotes"" in this pricing supplement.\n\nSeco...",62,1991388,2025,1
243874,Securities and Exchange Commission (SEC) Filings,JPMorgan Chase & Co. - Primary Offering Prospe...,Access the original document here\n\nPrimary O...,"January 8, 2025\n\nRegistration Statement Nos....",,8513,SAEXC,",i814,i81402,ibnk,ifinal,iibnk,","blfima,cnyc,davpw,deptct,deptr,fdic,irevs,jpms...",",c15,c151,c1512,cactio,ccat,cgvfil,ncat,nfact,...",...,2025-01-11 07:29:43.324,",namz,usa,",SAEXC00020250110el1a00dff,add,2025-01-10,\n\nincluded in the original issue price of th...,62,1991389,2025,1
243875,Securities and Exchange Commission (SEC) Filings,JPMorgan Chase & Co. - Primary Offering Prospe...,Access the original document here\n\nPrimary O...,"January 8, 2025\n\nRegistration Statement Nos....",,8513,SAEXC,",i814,i81402,ibnk,ifinal,iibnk,","blfima,cnyc,davpw,deptct,deptr,fdic,irevs,jpms...",",c15,c151,c1512,cactio,ccat,cgvfil,ncat,nfact,...",...,2025-01-11 07:29:43.324,",namz,usa,",SAEXC00020250110el1a00dff,add,2025-01-10,\n\nReflected on Customer Account Statements) ...,62,1991390,2025,1
243876,Securities and Exchange Commission (SEC) Filings,JPMorgan Chase & Co. - Primary Offering Prospe...,Access the original document here\n\nPrimary O...,"January 8, 2025\n\nRegistration Statement Nos....",,8513,SAEXC,",i814,i81402,ibnk,ifinal,iibnk,","blfima,cnyc,davpw,deptct,deptr,fdic,irevs,jpms...",",c15,c151,c1512,cactio,ccat,cgvfil,ncat,nfact,...",...,2025-01-11 07:29:43.324,",namz,usa,",SAEXC00020250110el1a00dff,add,2025-01-10,"\n\nnotes. See ""How the Notes Work"" and ""Hypot...",62,1991391,2025,1


In [4]:
# Load the job's result.csv; note that this rebinds the name `df`.
df = pd.read_csv(f"../../extractions/{job_id}/result.csv")
df

Unnamed: 0,source_name,dateline,ingestion_datetime,currency_codes,company_codes_association_ticker_exchange,title,snippet,company_codes_lineage_ticker_exchange,an,company_codes_occur_ticker_exchange,...,document_type,modification_datetime,company_codes,action,region_codes,market_index_codes,company_codes_about_ticker_exchange,company_codes_occur,section,company_codes_relevance_ticker_exchange
0,Dow Jones Institutional News,,1739355044000,,,Press Release: XPENG announces its official la...,"\n -- XPENG officially enters the UK, streng...",,DJDN000020250212el2c001sa,,...,article,1739355044000,",bp,hkexch,hkexch,hsflia,hsflia,imotol,imotol,...",add,",eurz,uk,weurz,",",xdjgic,xdjglc,xdjiic,xnyci,",,",myjjcc,imotol,hsflia,hkexch,",,",MMTOF:PSGM,7211:XTKS,MMTOY:PSGM,MMO:XFRA,"
1,Securities and Exchange Commission (SEC) Filings,,1743191927000,,,First Busey Corporation - Statement of Changes...,Access the original document here\n\nStatement...,,SAEXC00020250328el3s00cin,,...,article,1743191927000,",firbus,firbus,firbus,seexc,seexc,",add,",namz,usa,",,,",seexc,firbus,",,
2,PR Newswire,,1744722047000,,,PLAUD.AI Acquires YC-Backed StarJar to Power I...,"SAN FRANCISCO, April 15, 2025 /PRNewswire/ -- ...",,PRN0000020250415el4f000cs,",INTU:XWBO,1INTU:XMIL,INTU:XNAS,INTU:XMEX,ITU:...",...,article,1744722047000,",amzcom,amzcom,gognew,gognew,ituit,ituit,linkd...",add,",namz,sfra,usa,usca,usw,",,,",yoinco,teslmi,pkxwks,linkd,ituit,gognew,amzcom,",,",INTU:XWBO,1INTU:XMIL,INTU:XNAS,INTU:XMEX,ITU:..."
3,Public Companies News and Documents via PUBT,,1731288771000,,,Shionogi & Co. Ltd. - SHIONOGI Story 2 :Busine...,Access the original document here\n\nSHIONOGI ...,,LCDVP00020241111ekbb0020f,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507...",...,article,1731288771000,",pingin,pingin,shimc,shimc,shnogi,shnogi,shnog...",add,",apacz,asiaz,china,chinaz,devgcoz,dvpcoz,easia...",,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507...",",soneti,shnogi,shimc,pingin,",,",SGIOY:PSGM,SGIOF:PSGM,SH0:XFRA,SH00:XMUN,4507..."
4,Canada Stockwatch,,1736895739000,,,Torex Gold Provides 2025 Operational Guidance ...,(All amounts expressed in U.S. dollars unless ...,,CNSW000020250114el1e00ifx,,...,article,1736895739000,",hydgld,hydgld,hydgld,",add,",cana,caon,namz,toron,",,,",hydgld,",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223516,Securities and Exchange Commission (SEC) Filings,,1741210800000,,,Coinbase Global Inc. - Statement of Changes in...,Access the original document here\n\nStatement...,,SAEXC00020250305el3500s1l,,...,article,1741210800000,",coinba,coinba,coinba,seexc,seexc,",add,",namz,usa,",,,",seexc,coinba,",,
223517,Public Companies News and Documents via PUBT,,1742297491000,,,Almawave S.p.A. - Almawave and Oracle: strateg...,Access the original document here\n\nAlmawave ...,,LCDVP00020250318el3i00kwv,,...,article,1742297491000,",hggngf,hggngf,kosco,kosco,orcle,orcle,orcle,p...",add,",eecz,eurz,italy,lombar,medz,milan,weurz,",",xf500,",,",prital,orcle,kosco,hggngf,",,
223518,Global Banking News,,1727705061000,,,QNB’s share buyback gets regulatory approval,Qatar-based banking firm QNB Group has said th...,,GLOBAN0020240930ek9u000b9,,...,article,1727705061000,",qatfma,qatfma,qatfma,qbnk,qbnk,qbnk,qma,qma,q...",add,,,,",qnbusa,qma,qbnk,qatfma,",,
223519,Securities and Exchange Commission (SEC) Filings,,1715978722000,,,FNCB Bancorp Inc. - Amendment to Statement of ...,Access the original document here\n\nAmendment...,,SAEXC00020240517ek5h00mt5,,...,article,1715978722000,",fstdun,fstdun,fstdun,",add,",namz,usa,",,,",fstdun,",,
