In [None]:
# IMPORTANT:
# This code requires the WRDS-hosted Jupyter environment to run.
# It will NOT work on a local machine because it relies on WRDS internal paths and direct filesystem
# links: https://wrds-jupyter.wharton.upenn.edu/
# The only file you need to provide is "practicelist.csv", which contains a column "path" with SEC public paths.

import pandas as pd # type: ignore[import-untyped]
import os
import wrds # type: ignore[import-not-found]

# 1. Connect
db = wrds.Connection()


# 2. Load your public SEC paths
df = pd.read_csv("practicelist.csv")   # contains column: path

print("Mapping SEC public paths → WRDS internal paths...\n")


# 3. Query ALL mappings at once
# Missing values are dropped so that NaN is never sent to the database.
public_paths_tuple = tuple(df["path"].dropna().unique())

# The path list is passed as a bound parameter instead of being pasted into
# the SQL text. Interpolating the Python tuple repr breaks for a single path
# (it renders as "('x',)", which is invalid SQL) and for any path containing
# a quote character; the database driver renders a tuple parameter as a
# properly quoted SQL list.
query = """
    SELECT fname, wrdsfname
    FROM wrdssec.wrds_forms
    WHERE fname IN %(paths)s
"""

if public_paths_tuple:
    mapping = db.raw_sql(query, params={"paths": public_paths_tuple})
    df = df.merge(mapping, how="left", left_on="path", right_on="fname")
else:
    # "IN ()" is not valid SQL, so skip the query when there is nothing to
    # look up and mark every row as unmatched instead.
    mapping = pd.DataFrame(columns=["fname", "wrdsfname"])
    df = df.assign(fname=None, wrdsfname=None)



# Display-only preview: show at most the first 200 rows of the merged table.
# Every row of the dataset is still processed afterwards, not just these.

preview_rows = df.head(200)
print("Found matches:")
print(preview_rows)


# 4. Extract the raw text using WRDS internal paths
# 5. Save big text file
#
# Both steps run in a single pass: each filing is written to the output file
# as soon as it has been read, so only one filing is held in memory at a time
# instead of the whole corpus.
separator = "\n\n" + "=" * 80 + "\n\n"

print("\nExtracting text from WRDS filesystem...\n")

# The encoding is given explicitly so the result does not depend on the
# locale of the Jupyter server; undecodable bytes are still skipped.
with open("wrds_text_data.txt", "w", encoding="utf-8", errors="ignore") as out:
    # Only the two columns that are used are iterated (cheaper than iterrows).
    for path, wrdsfname in zip(df["path"], df["wrdsfname"]):
        if pd.isna(wrdsfname):
            print("NO MATCH:", path)
            continue

        file_path = f"/wrds/sec/warchives/{wrdsfname}"

        # Try to open the file and handle its absence, rather than checking
        # for existence first (which can race with the open).
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except FileNotFoundError:
            print("MISSING:", file_path)
            continue
        except Exception as e:
            # Best effort: one unreadable filing must not abort the whole run.
            print("FAILED:", file_path, e)
            continue

        out.write(text)
        out.write(separator)
        print("OK:", file_path)

    # Printed just before the file is flushed and closed by the `with` exit.
    print("\nSaving all text to wrds_text_data.txt...")

print("Done! File saved as wrds_text_data.txt")


Loading library list...
Done
Mapping SEC public paths → WRDS internal paths...

Found matches:
                                        path  \
0   edgar/data/2488/0001193125-13-001042.txt   
1   edgar/data/2488/0001193125-13-019330.txt   
2   edgar/data/2488/0001193125-13-019330.txt   
3   edgar/data/2488/0001193125-13-019330.txt   
4   edgar/data/2488/0001193125-13-052006.txt   
..                                       ...   
95  edgar/data/3545/0000891092-13-000599.txt   
96  edgar/data/3545/0000891092-13-000732.txt   
97  edgar/data/3545/0000891092-13-001075.txt   
98  edgar/data/3545/0000891092-13-001598.txt   
99  edgar/data/3545/0000891092-13-002256.txt   

                                       fname  \
0   edgar/data/2488/0001193125-13-001042.txt   
1   edgar/data/2488/0001193125-13-019330.txt   
2   edgar/data/2488/0001193125-13-019330.txt   
3   edgar/data/2488/0001193125-13-019330.txt   
4   edgar/data/2488/0001193125-13-052006.txt   
..                                      