In [47]:
# Imports

import pandas as pd
import time
import random
import os
from pytrends.request import TrendReq


## Category 1: Loughran-McDonald Dictionary Filtering

In [None]:
# Read the LM dictionary CSV file into a DataFrame
df = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2024.csv')

df

df_filtered = df[['Word', 'Negative', 'Positive', 'Uncertainty']]

# Keep only rows where at least one category is nonzero
df_filtered = df_filtered[(df_filtered[['Negative', 'Positive', 'Uncertainty']] != 0).any(axis=1)]

print(len(df_filtered))
df_filtered

print("Negative:", (df_filtered['Negative'] != 0).sum())
print("Positive:", (df_filtered['Positive'] != 0).sum())
print("Uncertainty:", (df_filtered['Uncertainty'] != 0).sum())

2966
Negative: 2355
Positive: 354
Uncertainty: 297


In [None]:
# Filter by taking 1 year and removing anything that has 70%+ data =0
# Approximate timing 2-2.5 hours: approx 6 min for 100 keywords

# Settings
TIMEFRAME   = "2023-01-01 2023-12-31"
GEO         = "US"
ZERO_THRESH = 0.70
DELAY       = 15  # increased from 10
BATCH_SIZE  = 5
CHECKPOINT  = "lm_survivors_checkpoint.csv"

# Prep keyword list
keywords = df_filtered['Word'].str.lower().str.strip().unique().tolist()
print(f"Total keywords to test: {len(keywords)}")

batches = [keywords[i:i+BATCH_SIZE] for i in range(0, len(keywords), BATCH_SIZE)]
print(f"Total batches: {len(batches)}")

# ── Resume from checkpoint if it exists ──────────────────────────────────────
if os.path.exists(CHECKPOINT):
    checkpoint_df = pd.read_csv(CHECKPOINT)
    survivors = {row['keyword']: {"zero_pct": row['zero_pct'], "mean_interest": row['mean_interest']} 
                 for _, row in checkpoint_df.iterrows()}
    already_done = set(survivors.keys())
    print(f"Resuming from checkpoint — {len(already_done)} keywords already done")
else:
    survivors = {}
    already_done = set()
    print("Starting fresh")

dropped = []
pytrends = TrendReq(hl="en-US", tz=0, timeout=(10, 30))

for idx, batch in enumerate(batches, 1):
    # Skip batches already completed
    if all(kw in already_done for kw in batch):
        print(f"[{idx}/{len(batches)}] Skipping (already done)")
        continue

    print(f"[{idx}/{len(batches)}] Fetching: {batch}")
    
    success = False
    for attempt in range(1, 4):
        try:
            pytrends.build_payload(batch, timeframe=TIMEFRAME, geo=GEO)
            raw = pytrends.interest_over_time()
            success = True
            break
        except Exception as exc:
            wait = DELAY * (2 ** (attempt - 1)) + random.uniform(0, 3)
            print(f"  Attempt {attempt}/3 failed — {exc}. Retrying in {wait:.1f}s")
            time.sleep(wait)
    
    if not success or raw.empty:
        print(f"  No data — dropping batch")
        dropped.extend(batch)
        continue

    raw = raw.drop(columns=["isPartial"], errors="ignore")

    for kw in batch:
        if kw not in raw.columns:
            dropped.append(kw)
            continue
        series = raw[kw]
        zero_pct = (series == 0).sum() / len(series)
        if zero_pct >= ZERO_THRESH:
            dropped.append(kw)
        else:
            survivors[kw] = {"zero_pct": round(zero_pct, 3), "mean_interest": round(series.mean(), 2)}

    # Save checkpoint every 25 batches
    if idx % 25 == 0:
        pd.DataFrame([{"keyword": kw, **v} for kw, v in survivors.items()]).to_csv(CHECKPOINT, index=False)
        print(f"  ✓ checkpoint saved at batch {idx}")

    if idx < len(batches):
        sleep_time = DELAY + random.uniform(0, DELAY * 0.4)
        time.sleep(sleep_time)

# ── Final save ────────────────────────────────────────────────────────────────
survivors_df = pd.DataFrame([{"keyword": kw, **v} for kw, v in survivors.items()])
survivors_df = survivors_df.sort_values("mean_interest", ascending=False).reset_index(drop=True)
survivors_df.to_csv(CHECKPOINT, index=False)
survivors_df.to_csv("lm_survivors.csv", index=False)

print(f"\nStarted with : {len(keywords)}")
print(f"Survived     : {len(survivors_df)}")
print(f"Dropped      : {len(dropped)}")
survivors_df.head(20)

Total keywords to test: 1833
Total batches: 367
Starting fresh
[1/367] Fetching: ['stock market crash', 'market crash', 'recession', 'bear market', 'financial crisis']


KeyboardInterrupt: 

In [25]:
# Filter 1: zero_pct < 0.70 (already done — this is survivors_df)
# Filter 2: mean_interest >= 20
survivors_filtered = survivors_df[survivors_df['mean_interest'] >= 20].reset_index(drop=True)

print(f"After zero filter:     {len(survivors_df)}")
print(f"After interest filter: {len(survivors_filtered)}")
print(f"Dropped by filter 2:   {len(survivors_df) - len(survivors_filtered)}")
survivors_filtered

After zero filter:     2387
After interest filter: 1035
Dropped by filter 2:   1352


Unnamed: 0,keyword,zero_pct,mean_interest
0,corrections,0.0,94.28
1,detention,0.0,94.06
2,rewards,0.0,92.62
3,dangerous,0.0,91.96
4,enable,0.0,91.72
...,...,...,...
1030,sentencing,0.0,20.28
1031,interference,0.0,20.21
1032,reward,0.0,20.19
1033,disagree,0.0,20.15


In [26]:
# Finance context filter
# Re-scrape survivors_filtered with "stock market" as anchor in every batch

ANCHOR = "stock market"
keywords_to_retest = survivors_filtered['keyword'].tolist()
batches2 = [keywords_to_retest[i:i+4] for i in range(0, len(keywords_to_retest), 4)]  # 4 per batch to leave room for anchor

print(f"Re-testing {len(keywords_to_retest)} keywords with finance context anchor")
print(f"Total batches: {len(batches2)}")

finance_survivors = {}

for idx, batch in enumerate(batches2, 1):
    batch_with_anchor = batch + [ANCHOR]
    print(f"[{idx}/{len(batches2)}] Fetching: {batch}")
    
    success = False
    for attempt in range(1, 4):
        try:
            pytrends.build_payload(batch_with_anchor, timeframe=TIMEFRAME, geo=GEO)
            raw = pytrends.interest_over_time()
            success = True
            break
        except Exception as exc:
            wait = 15 * (2 ** (attempt - 1)) + random.uniform(0, 3)
            print(f"  Attempt {attempt}/3 failed — {exc}. Retrying in {wait:.1f}s")
            time.sleep(wait)
    
    if not success or raw.empty:
        print(f"  No data — skipping batch")
        continue

    raw = raw.drop(columns=["isPartial"], errors="ignore")

    for kw in batch:
        if kw not in raw.columns:
            continue
        series = raw[kw]
        zero_pct = (series == 0).sum() / len(series)
        if zero_pct >= 0.70:
            print(f"  DROPPED {kw} (not finance-relevant)")
        else:
            finance_survivors[kw] = {
                "zero_pct": round(zero_pct, 3),
                "mean_interest": round(series.mean(), 2)
            }
            print(f"  KEPT {kw} (mean={series.mean():.1f})")

    if idx < len(batches2):
        sleep_time = 15 + random.uniform(0, 6)
        time.sleep(sleep_time)

# Results
finance_survivors_df = pd.DataFrame([{"keyword": kw, **v} for kw, v in finance_survivors.items()])
finance_survivors_df = finance_survivors_df.sort_values("mean_interest", ascending=False).reset_index(drop=True)
finance_survivors_df.to_csv("lm_finance_survivors.csv", index=False)

print(f"\nBefore finance filter: {len(keywords_to_retest)}")
print(f"After finance filter:  {len(finance_survivors_df)}")
finance_survivors_df.head(20)

Re-testing 1035 keywords with finance context anchor
Total batches: 259
[1/259] Fetching: ['corrections', 'detention', 'rewards', 'dangerous']
  KEPT corrections (mean=18.0)
  KEPT detention (mean=17.5)
  KEPT rewards (mean=89.6)
  KEPT dangerous (mean=71.0)
[2/259] Fetching: ['enable', 'annoying', 'worst', 'accidentally']
  KEPT enable (mean=35.4)
  KEPT annoying (mean=11.6)
  KEPT worst (mean=71.5)
  KEPT accidentally (mean=17.9)
[3/259] Fetching: ['superior', 'bad', 'unable', 'termination']
  KEPT superior (mean=8.3)
  KEPT bad (mean=92.2)
  KEPT unable (mean=4.1)
  KEPT termination (mean=1.1)
[4/259] Fetching: ['lying', 'cancel', 'annulment', 'great']
  KEPT lying (mean=5.0)
  KEPT cancel (mean=21.2)
  DROPPED annulment (not finance-relevant)
  KEPT great (mean=89.1)
[5/259] Fetching: ['claims', 'attractive', 'deficiency', 'weak']
  KEPT claims (mean=52.1)
  KEPT attractive (mean=10.6)
  KEPT deficiency (mean=29.7)
  KEPT weak (mean=34.9)
[6/259] Fetching: ['cut', 'abuse', 'strain'

Unnamed: 0,keyword,zero_pct,mean_interest
0,broken,0.0,92.74
1,bad,0.0,92.23
2,rewards,0.0,89.58
3,error,0.0,89.57
4,great,0.0,89.09
5,cut,0.0,88.15
6,problems,0.0,86.62
7,loss,0.0,86.11
8,questions,0.0,85.75
9,strong,0.0,84.91


In [None]:
# Filter again for mean <= 20
finance_survivors_filtered = finance_survivors_df[finance_survivors_df['mean_interest'] >= 20].reset_index(drop=True)

print(f"After zero filter:     {len(finance_survivors_df)}")
print(f"After interest filter: {len(finance_survivors_filtered)}")
print(f"Dropped by filter 4:   {len(finance_survivors_df) - len(finance_survivors_filtered)}")
finance_survivors_filtered

After zero filter:     884
After interest filter: 174
Dropped by filter 4:   710


Unnamed: 0,keyword,zero_pct,mean_interest
0,broken,0.0,92.74
1,bad,0.0,92.23
2,rewards,0.0,89.58
3,error,0.0,89.57
4,great,0.0,89.09
...,...,...,...
169,discontinued,0.0,20.83
170,breach,0.0,20.75
171,intermittent,0.0,20.60
172,deviation,0.0,20.53


In [75]:
finance_survivors_filtered.to_csv('lm_dict_survivors.csv', index=False)
print(f"Saved {len(finance_survivors_filtered)} words to lm_dict_survivors.csv")

Saved 174 words to lm_dict_survivors.csv


## Category 2, 3, 4: LLM-assisted Generation

In [60]:
# Analysis on LLM-assisted Generation--more to come

file_path = "Project/Google Search Trends Data.xlsx"
llm_cat2 = pd.read_excel(file_path, sheet_name='Category 2')
llm_cat2 = llm_cat2.iloc[1:].reset_index(drop=True)
llm_cat2.columns = ['ChatGPT', 'Claude', 'Gemini']

llm_cat2


Unnamed: 0,ChatGPT,Claude,Gemini
0,stock market crash,stock crash,Market crash
1,market crash,market crash,Bear market
2,recession,stock sell-off,Stock volatility
3,bear market,bear market,Margin call
4,financial crisis,margin call,Panic selling
...,...,...,...
145,inflation spike,yield curve,Capital flight
146,hyperinflation,credit crunch,Asset devaluation
147,stagflation,short selling,Stagflation
148,housing crash,stock liquidation,Quantitative easing


In [61]:
# Combine all keywords

file_path = "Project/Google Search Trends Data.xlsx"
xl = pd.ExcelFile(file_path)

all_words = []

for sheet in xl.sheet_names:
    df = pd.read_excel(file_path, sheet_name=sheet, header=None)
    
    # Skip first row (LLM names), rename columns
    df = df.iloc[2:].reset_index(drop=True)
    df.columns = ['ChatGPT', 'Claude', 'Gemini']
    
    # Melt into one column and tag with sheet name
    melted = pd.melt(df, var_name='source', value_name='keyword')
    melted['category'] = sheet
    all_words.append(melted)

# Combine all sheets
combined = pd.concat(all_words, ignore_index=True)
combined

Unnamed: 0,source,keyword,category
0,ChatGPT,stock market crash,Category 2
1,ChatGPT,market crash,Category 2
2,ChatGPT,recession,Category 2
3,ChatGPT,bear market,Category 2
4,ChatGPT,financial crisis,Category 2
...,...,...,...
5995,Gemini,Build to rent,Category 4 Real Estate
5996,Gemini,Suburban migration,Category 4 Real Estate
5997,Gemini,Affordable housing,Category 4 Real Estate
5998,Gemini,Distressed property,Category 4 Real Estate


In [62]:
combined['keyword'] = combined['keyword'].dropna().str.strip().str.lower()
combined = combined.dropna(subset=['keyword'])

print(f"Total raw entries: {len(combined)}")

Total raw entries: 6000


In [63]:
combined_deduped = combined.drop_duplicates(subset='keyword').reset_index(drop=True)

print(f"After deduplication: {len(combined_deduped)}")
combined_deduped

After deduplication: 1811


Unnamed: 0,source,keyword,category
0,ChatGPT,stock market crash,Category 2
1,ChatGPT,market crash,Category 2
2,ChatGPT,recession,Category 2
3,ChatGPT,bear market,Category 2
4,ChatGPT,financial crisis,Category 2
...,...,...,...
1806,Gemini,green buildings,Category 4 Real Estate
1807,Gemini,urban renewal,Category 4 Real Estate
1808,Gemini,suburban migration,Category 4 Real Estate
1809,Gemini,affordable housing,Category 4 Real Estate


### Category 2: Adding FEARS index

In [64]:
fears_words = [
    "gold prices", "recession", "gold price", "depression", "great depression",
    "gold", "economy", "price of gold", "the depression", "crisis",
    "frugal", "gdp", "charity", "bankruptcy", "unemployment",
    "inflation rate", "bankrupt", "the great depression", "car donate",
    "capitalization", "expense", "donation", "savings", "social security card",
    "the crisis", "default", "benefits", "unemployed", "poverty",
    "social security office"
]

fears_df = pd.DataFrame({
    'keyword': fears_words,
    'source': 'FEARS',
    'category': 'Category 2'
})

# Check which are already in combined_deduped
already_in = fears_df['keyword'].isin(combined_deduped['keyword'])
print(f"Already in dataset: {already_in.sum()}")
print(f"Missing, will add: {(~already_in).sum()}")
print("\nMissing words:")
print(fears_df[~already_in]['keyword'].tolist())

# Add missing ones
missing_fears = fears_df[~already_in]
combined_deduped = pd.concat([combined_deduped, missing_fears], ignore_index=True)
print(f"\nNew total: {len(combined_deduped)}")

Already in dataset: 8
Missing, will add: 22

Missing words:
['depression', 'great depression', 'economy', 'price of gold', 'the depression', 'crisis', 'frugal', 'charity', 'bankrupt', 'the great depression', 'car donate', 'capitalization', 'expense', 'donation', 'savings', 'social security card', 'the crisis', 'default', 'benefits', 'unemployed', 'poverty', 'social security office']

New total: 1833


In [65]:
import re

# Normalize whitespace — collapse multiple spaces, strip edges
combined_deduped['keyword'] = combined_deduped['keyword'].str.replace(r'\s+', ' ', regex=True).str.strip()

# Deduplicate again after normalization
combined_deduped = combined_deduped.drop_duplicates(subset='keyword').reset_index(drop=True)

print(f"After whitespace normalization: {len(combined_deduped)}")

# Remove all spaces and check for duplicates
combined_deduped['keyword_nospace'] = combined_deduped['keyword'].str.replace(' ', '')
dupes = combined_deduped[combined_deduped.duplicated(subset='keyword_nospace', keep=False)]
print(dupes.sort_values('keyword_nospace')[['keyword', 'keyword_nospace']])


After whitespace normalization: 1833
                keyword   keyword_nospace
12        bank collapse      bankcollapse
61         bankcollapse      bankcollapse
11         bank failure       bankfailure
60          bankfailure       bankfailure
78       banking crisis     bankingcrisis
59        bankingcrisis     bankingcrisis
3           bear market        bearmarket
51           bearmarket        bearmarket
71          bonddefault       bonddefault
83         bond default       bonddefault
105      capital flight     capitalflight
74        capitalflight     capitalflight
8         credit crisis      creditcrisis
56         creditcrisis      creditcrisis
1271      cybersecurity     cybersecurity
1392     cyber security     cybersecurity
58           debtcrisis        debtcrisis
9           debt crisis        debtcrisis
70          defaultrisk       defaultrisk
128        default risk       defaultrisk
291          exxonmobil        exxonmobil
305         exxon mobil        exxonmob

In [66]:
# For duplicates, keep the one with spaces (drop the no-space version)
combined_deduped['has_no_space'] = combined_deduped['keyword'] == combined_deduped['keyword_nospace']

# Sort so spaced version comes first, then drop duplicates keeping first
combined_deduped = combined_deduped.sort_values('has_no_space').drop_duplicates(subset='keyword_nospace', keep='first')

# Clean up helper columns
combined_deduped = combined_deduped.drop(columns=['keyword_nospace', 'has_no_space']).reset_index(drop=True)

print(f"After removing no-space duplicates: {len(combined_deduped)}")

After removing no-space duplicates: 1804


In [69]:
combined_deduped

Unnamed: 0,source,keyword,category
0,ChatGPT,stock market crash,Category 2
1,ChatGPT,property casualty,Category 4 Financials
2,ChatGPT,life insurance,Category 4 Financials
3,ChatGPT,commercial banking,Category 4 Financials
4,ChatGPT,money center banks,Category 4 Financials
...,...,...,...
1799,Gemini,warehousing,Category 4 Industrials
1800,ChatGPT,stimulus,Category 3
1801,ChatGPT,railroads,Category 4 Industrials
1802,Gemini,outsourcing,Category 4 Industrials


In [72]:
# Filter 1

TIMEFRAME   = "2023-01-01 2023-12-31"
GEO         = "US"
ZERO_THRESH = 0.90
DELAY       = 15
BATCH_SIZE  = 5
CHECKPOINT  = "llm_survivors_checkpoint.csv"

# Prep keyword list
keywords = combined_deduped['keyword'].str.lower().str.strip().unique().tolist()
print(f"Total keywords to test: {len(keywords)}")

batches = [keywords[i:i+BATCH_SIZE] for i in range(0, len(keywords), BATCH_SIZE)]
print(f"Total batches: {len(batches)}")

# Resume from checkpoint if it exists
if os.path.exists(CHECKPOINT):
    checkpoint_df = pd.read_csv(CHECKPOINT)
    survivors = {row['keyword']: {"zero_pct": row['zero_pct'], "mean_interest": row['mean_interest']} 
                 for _, row in checkpoint_df.iterrows()}
    already_done = set(survivors.keys())
    print(f"Resuming from checkpoint — {len(already_done)} keywords already done")
else:
    survivors = {}
    already_done = set()
    print("Starting fresh")

dropped = []
pytrends = TrendReq(hl="en-US", tz=0, timeout=(10, 30))

for idx, batch in enumerate(batches, 1):
    if all(kw in already_done for kw in batch):
        print(f"[{idx}/{len(batches)}] Skipping (already done)")
        continue

    print(f"[{idx}/{len(batches)}] Fetching: {batch}")
    
    success = False
    for attempt in range(1, 4):
        try:
            pytrends.build_payload(batch, timeframe=TIMEFRAME, geo=GEO)
            raw = pytrends.interest_over_time()
            success = True
            break
        except Exception as exc:
            wait = DELAY * (2 ** (attempt - 1)) + random.uniform(0, 3)
            print(f"  Attempt {attempt}/3 failed — {exc}. Retrying in {wait:.1f}s")
            time.sleep(wait)
    
    if not success or raw.empty:
        print(f"  No data — dropping batch")
        dropped.extend(batch)
        continue

    raw = raw.drop(columns=["isPartial"], errors="ignore")

    for kw in batch:
        if kw not in raw.columns:
            dropped.append(kw)
            continue
        series = raw[kw]
        zero_pct = (series == 0).sum() / len(series)
        if zero_pct >= ZERO_THRESH:
            dropped.append(kw)
        else:
            survivors[kw] = {"zero_pct": round(zero_pct, 3), "mean_interest": round(series.mean(), 2)}

    if idx % 25 == 0:
        pd.DataFrame([{"keyword": kw, **v} for kw, v in survivors.items()]).to_csv(CHECKPOINT, index=False)
        print(f"  ✓ checkpoint saved at batch {idx}")

    if idx < len(batches):
        sleep_time = DELAY + random.uniform(0, DELAY * 0.4)
        time.sleep(sleep_time)

# Final save
survivors_df = pd.DataFrame([{"keyword": kw, **v} for kw, v in survivors.items()])
survivors_df = survivors_df.sort_values("mean_interest", ascending=False).reset_index(drop=True)
survivors_df.to_csv(CHECKPOINT, index=False)
survivors_df.to_csv("llm_survivors.csv", index=False)

print(f"\nStarted with : {len(keywords)}")
print(f"Survived     : {len(survivors_df)}")
print(f"Dropped      : {len(dropped)}")
survivors_df.head(20)

Total keywords to test: 1804
Total batches: 361
Resuming from checkpoint — 896 keywords already done
[1/361] Fetching: ['stock market crash', 'property casualty', 'life insurance', 'commercial banking', 'money center banks']
[2/361] Skipping (already done)
[3/361] Skipping (already done)
[4/361] Skipping (already done)
[5/361] Fetching: ['payment processors', 'digital banking', 'fintech stocks', 'ipo market', 'online banking']
[6/361] Fetching: ['mortgage reits', 'exchange stocks', 'fed rate', 'bank merger', 'crypto bank']
[7/361] Skipping (already done)
[8/361] Fetching: ['wall street', 'bank crisis', 'credit card', 'financial stocks', 'goldman sachs']
[9/361] Skipping (already done)
[10/361] Skipping (already done)
[11/361] Fetching: ['private equity', 'wells fargo', 'wealth management', 'consumer lending', 'health services']
[12/361] Skipping (already done)
[13/361] Fetching: ['health it', 'drug patents', 'medical cannabis', 'health sector', 'medical equipment']
[14/361] Fetching: [

Unnamed: 0,keyword,zero_pct,mean_interest
0,home care,0.0,92.87
1,oil,0.0,92.85
2,medical supplies,0.0,92.3
3,diagnostics,0.0,92.0
4,steel,0.0,91.83
5,insurance,0.0,91.02
6,real estate investment,0.0,90.32
7,heavy equipment,0.0,89.89
8,podcast,0.0,89.64
9,credit cards,0.0,89.6


In [73]:
survivors_with_category = survivors_df.merge(
    combined_deduped[['keyword', 'category']], 
    on='keyword', 
    how='left'
)

print(survivors_with_category['category'].value_counts())
survivors_with_category.head(20)

category
Category 4 Communication Servic    138
Category 4 Materials               135
Category 4 Industrials             126
Category 4 Energy                  125
Category 2                         115
Category 4 Real Estate             113
Category 4 Financials              111
Category 4 Information Technolo    111
Category 4 Health Care             106
Category 4 Consumer Staples        105
Category 3                          97
Category 4 Consumer Discretiona     90
Category 4 Utilities                78
Name: count, dtype: int64


Unnamed: 0,keyword,zero_pct,mean_interest,category
0,home care,0.0,92.87,Category 4 Consumer Staples
1,oil,0.0,92.85,Category 4 Energy
2,medical supplies,0.0,92.3,Category 4 Health Care
3,diagnostics,0.0,92.0,Category 4 Health Care
4,steel,0.0,91.83,Category 4 Materials
5,insurance,0.0,91.02,Category 4 Financials
6,real estate investment,0.0,90.32,Category 4 Real Estate
7,heavy equipment,0.0,89.89,Category 4 Industrials
8,podcast,0.0,89.64,Category 4 Communication Servic
9,credit cards,0.0,89.6,Category 4 Financials


In [77]:
import pandas as pd

lm = pd.read_csv('lm_dict_survivors.csv')[['keyword']]
llm = pd.read_csv('llm_survivors.csv')[['keyword']]

print(f"LM survivors: {len(lm)}")
print(f"LLM survivors: {len(llm)}")

# Combine and deduplicate
final = pd.concat([lm, llm]).drop_duplicates(subset='keyword').reset_index(drop=True)

print(f"Total after deduplication: {len(final)}")

final.to_csv('final_keywords.csv', index=False)
print("Saved to final_keywords.csv")
final

LM survivors: 174
LLM survivors: 1450
Total after deduplication: 1620
Saved to final_keywords.csv


Unnamed: 0,keyword
0,broken
1,bad
2,rewards
3,error
4,great
...,...
1615,defense contractor
1616,property vacancies
1617,office reit
1618,currency devaluation


In [78]:
import os
os.path.exists('final_keywords.csv')

True