In [2]:
import pandas as pd

# -------------------------------------------------
# Dataset registry: short label -> processed CSV file
# (relative paths, resolved against the working directory)
# -------------------------------------------------
datasets = dict(
    # files carrying the "_spacy" suffix
    FIQA="data/processed/fiqa_spacy.csv",
    PHRASEBANK="data/processed/phrasebank_spacy.csv",
    YAHOO="data/processed/yahoo_spacy.csv",
    # SEC files carrying the "_processed" suffix
    SEC_NUM="data/processed/sec_num_processed.csv",
    SEC_SUB="data/processed/sec_sub_processed.csv",
    SEC_TAG="data/processed/sec_tag_processed.csv",
)

# -------------------------------------------------
# EDA loop
# -------------------------------------------------
# For every entry in `datasets`: load the CSV, then print its shape and
# column names, show a five-row preview, list the ten columns with the most
# missing values and, when a "clean_text" column is present, print
# character-length statistics for it.
for name, path in datasets.items():
    print("\n" + "="*60)
    print(f"DATASET: {name}")
    print("="*60)

    # Guard only the load: the message below says "Could not load", so it
    # must not also swallow (and mislabel) failures in the analysis steps.
    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"❌ Could not load {name}: {e}")
        continue  # best-effort: move on to the next dataset

    # Basic info
    print("Shape:", df.shape)
    print("\nColumns:")
    print(df.columns.tolist())

    # Preview (rendered through the notebook's `display` hook)
    print("\nFirst 5 rows:")
    display(df.head())

    # Missing values
    print("\nMissing values (top 10):")
    print(df.isna().sum().sort_values(ascending=False).head(10))

    # Text length stats (if text exists)
    if "clean_text" in df.columns:
        # Drop missing entries before measuring: casting NaN to str would
        # count it as the 3-character string "nan" and skew the statistics.
        # The lengths live in a local Series (named "text_len" so the printed
        # summary keeps its label) instead of being written back into df.
        text_len = (
            df["clean_text"]
            .dropna()
            .astype(str)
            .str.len()
            .rename("text_len")
        )
        print("\nText length stats:")
        print(text_len.describe())



DATASET: FIQA
Shape: (14511, 6)

Columns:
['question', 'answer', 'text', 'clean_text', 'tokens', 'lemmas']

First 5 rows:


Unnamed: 0,question,answer,text,clean_text,tokens,lemmas
0,What do brokers do with bad stock?,"For every seller, there's a buyer. Buyers may ...",What do brokers do with bad stock? For every s...,what do brokers do with bad stock for every se...,what do brokers do with bad stock for every se...,what do broker do with bad stock for every sel...
1,Why do investors buy stock that had appreciated?,"You seem to prefer to trade like I do: ""Buy lo...",Why do investors buy stock that had appreciate...,why do investors buy stock that had appreciate...,why do investors buy stock that had appreciate...,why do investor buy stock that have appreciate...
2,Is it ever a good idea to close credit cards?,"Yes, it can be a good idea to close unused cre...",Is it ever a good idea to close credit cards? ...,is it ever a good idea to close credit cards y...,is it ever a good idea to close credit cards y...,be it ever a good idea to close credit card ye...
3,If something is coming into my account will it...,The bank will make this even more confusing be...,If something is coming into my account will it...,if something is coming into my account will it...,if something is coming into my account will it...,if something be come into my account will it b...
4,What one bit of financial advice do you wish y...,When I was contracting I wish I had joined a t...,What one bit of financial advice do you wish y...,what one bit of financial advice do you wish y...,what one bit of financial advice do you wish y...,what one bit of financial advice do you wish y...



Missing values (top 10):
question      0
answer        0
text          0
clean_text    0
tokens        0
lemmas        0
dtype: int64

Text length stats:
count    14511.000000
mean      1076.670181
std        902.390537
min         40.000000
25%        498.000000
50%        830.000000
75%       1366.000000
max      16871.000000
Name: text_len, dtype: float64

DATASET: PHRASEBANK
Shape: (4846, 5)

Columns:
['label', 'text', 'clean_text', 'tokens', 'lemmas']

First 5 rows:


Unnamed: 0,label,text,clean_text,tokens,lemmas
0,neutral,"According to Gran , the company has no plans t...",According to Gran the company has no plans to ...,According to Gran the company has no plans to ...,accord to Gran the company have no plan to mov...
1,neutral,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,Technopolis plans to develop in stages an area...,technopoli plan to develop in stage an area of...
2,negative,The international electronic industry company ...,The international electronic industry company ...,The international electronic industry company ...,the international electronic industry company ...
3,positive,With the new production plant the company woul...,With the new production plant the company woul...,With the new production plant the company woul...,with the new production plant the company woul...
4,positive,According to the company 's updated strategy f...,According to the company s updated strategy fo...,According to the company s updated strategy fo...,accord to the company s update strategy for th...



Missing values (top 10):
label         0
text          0
clean_text    0
tokens        0
lemmas        0
dtype: int64

Text length stats:
count    4846.000000
mean      124.836773
std        54.437599
min         7.000000
25%        83.000000
50%       117.000000
75%       160.000000
max       300.000000
Name: text_len, dtype: float64

DATASET: YAHOO
Shape: (1260, 10)

Columns:
['Price', 'Close', 'High', 'Low', 'Open', 'Volume', 'text', 'clean_text', 'tokens', 'lemmas']

First 5 rows:


Unnamed: 0,Price,Close,High,Low,Open,Volume,text,clean_text,tokens,lemmas
0,Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Open AAPL High AAPL Low AAPL Close AAPL,Open AAPL High AAPL Low AAPL Close AAPL,Open AAPL High AAPL Low AAPL Close AAPL,open AAPL High AAPL Low AAPL Close AAPL
1,Date,,,,,,Open nan High nan Low nan Close nan,Open nan High nan Low nan Close nan,Open nan High nan Low nan Close nan,open nan High nan Low nan Close nan
2,2019-01-02,37.538818359375,37.759888517261295,36.66167592649953,36.81856395762742,148158800,Open 36.81856395762742 High 37.759888517261295...,Open 36.81856395762742 High 37.759888517261295...,Open 36.81856395762742 High 37.759888517261295...,open 36.81856395762742 High 37.759888517261295...
3,2019-01-03,33.79966735839844,34.63877547055248,33.75450230314456,34.22516265768987,365248800,Open 34.22516265768987 High 34.63877547055248 ...,Open 34.22516265768987 High 34.63877547055248 ...,Open 34.22516265768987 High 34.63877547055248 ...,open 34.22516265768987 High 34.63877547055248 ...
4,2019-01-04,35.242557525634766,35.31149481961843,34.18238275669409,34.35590843708306,234428400,Open 34.35590843708306 High 35.31149481961843 ...,Open 34.35590843708306 High 35.31149481961843 ...,Open 34.35590843708306 High 35.31149481961843 ...,open 34.35590843708306 High 35.31149481961843 ...



Missing values (top 10):
Close         1
High          1
Low           1
Open          1
Volume        1
Price         0
text          0
clean_text    0
tokens        0
lemmas        0
dtype: int64

Text length stats:
count    1260.000000
mean       92.830159
std         2.802990
min        35.000000
25%        92.000000
50%        93.000000
75%        94.000000
max        95.000000
Name: text_len, dtype: float64

DATASET: SEC_NUM
Shape: (3409931, 2)

Columns:
['text', 'tokens']

First 5 rows:


Unnamed: 0,text,tokens
0,adsh tag version ddate qtrs uom segments coreg...,adsh tag version ddate qtrs uom segments coreg...
1,0000002488-25-000047 AccountsPayableCurrent us...,0000002488 - 25 - 000047 AccountsPayableCurren...
2,0000002488-25-000047 AssetsCurrent us-gaap/202...,0000002488 - 25 - 000047 AssetsCurrent us - ga...
3,0000002488-25-000047 CashAndCashEquivalentsAtC...,0000002488 - 25 - 000047 CashAndCashEquivalent...
4,0000002488-25-000047 CommitmentsAndContingenci...,0000002488 - 25 - 000047 CommitmentsAndConting...



Missing values (top 10):
text      0
tokens    0
dtype: int64

DATASET: SEC_SUB
Shape: (213, 4)

Columns:
['text', 'tokens', 'lemmas', 'pos']

First 5 rows:


Unnamed: 0,text,tokens,lemmas,pos
0,adsh cik name sic countryba stprba cityba zipb...,adsh cik name sic countryba stprba cityba zipb...,adsh cik name sic countryba stprba cityba zipb...,PROPN PROPN PROPN PROPN PROPN PROPN PROPN PROP...
1,"CA IRVINE 92614 EXECUTIVE CIRCLE, SUITE 100 US...","CA IRVINE 92614 EXECUTIVE CIRCLE , SUITE 100 U...","CA IRVINE 92614 EXECUTIVE CIRCLE , SUITE 100 U...",PROPN PROPN NUM PROPN PROPN PUNCT PROPN NUM PR...
2,"DALLAS 75225 5956 SHERRY LANE, SUITE 1400 214-...","DALLAS 75225 5956 SHERRY LANE , SUITE 1400 214...","DALLAS 75225 5956 SHERRY LANE , SUITE 1400 214...",PROPN NUM NUM PROPN PROPN PUNCT PROPN NUM NUM ...
3,MAIN STREETS P O BOX 391 8048432360 US VA WEST...,MAIN STREETS P O BOX 391 8048432360 US VA WEST...,main STREETS p o box 391 8048432360 US VA WEST...,ADJ PROPN NOUN NOUN NOUN NUM NUM PROPN PROPN P...
4,6000 WESTOWN PARKWAY US IA 421447959 AMERICAN ...,6000 WESTOWN PARKWAY US IA 421447959 AMERICAN ...,6000 WESTOWN PARKWAY US IA 421447959 AMERICAN ...,NUM PROPN PROPN PROPN PROPN NUM PROPN PROPN PR...



Missing values (top 10):
text      0
tokens    0
lemmas    0
pos       0
dtype: int64

DATASET: SEC_TAG
Shape: (89427, 2)

Columns:
['text', 'tokens']

First 5 rows:


Unnamed: 0,text,tokens
0,tag version custom abstract datatype iord crdr...,tag version custom abstract datatype iord crdr...
1,OperatingLeaseLiabilityInitialRecognitionOther...,OperatingLeaseLiabilityInitialRecognitionOther...
2,ProceedFromDisposalOfPropertyPlantAndEquipment...,ProceedFromDisposalOfPropertyPlantAndEquipment...
3,NoncashDistributionsOfDigitalAssetsToFulfillCu...,NoncashDistributionsOfDigitalAssetsToFulfillCu...
4,InventoryWorkInProcessAndRawMaterials us-gaap/...,InventoryWorkInProcessAndRawMaterials us - gaa...



Missing values (top 10):
text      0
tokens    0
dtype: int64
