In [2]:
# 📦 Imports
import pandas as pd
import spacy
import torch
from transformers import pipeline
from tqdm import tqdm

# ✅ Load data
# Tunables for this cell, named so a reader can change them in one place.
# NOTE: DATA_PATH is a machine-specific absolute path; point it at your local
# copy of the headlines CSV before running.
DATA_PATH = r"C:\Users\sagni\Downloads\Supply Chain Disraption\abcnews-date-text.csv"
SAMPLE_SIZE = 1000   # number of headlines to keep (sampled for speed)
RANDOM_SEED = 42     # fixes which rows are sampled

# Build the working frame without in-place mutation so re-running the cell
# always starts from the file on disk. The sample size is capped at the number
# of rows read, so a file shorter than SAMPLE_SIZE does not raise ValueError.
df = (
    pd.read_csv(DATA_PATH)
    .rename(columns={'headline_text': 'text'})
    .pipe(lambda frame: frame.sample(min(SAMPLE_SIZE, len(frame)), random_state=RANDOM_SEED))
    .reset_index(drop=True)
)

# 🧠 Load NLP models
nlp = spacy.load("en_core_web_sm")

# Pin the sentiment checkpoint explicitly instead of relying on the library's
# implicit default. The model name and revision below are the ones the
# unpinned call reported resolving to in this notebook's recorded output, so
# results stay the same while no longer depending on what the default is.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# ⚙️ Named Entity Recognition (GPE=countries/cities/states, ORG=organizations incl. companies, EVENT=named events such as storms or wars)
def extract_ner(text):
    """Return the GPE, ORG and EVENT entities found in `text`.

    `text` is parsed with the module-level `nlp` model. The result is a list
    of (entity text, entity label) tuples in the order the entities appear in
    the parsed document; entities with any other label are dropped.
    """
    wanted_labels = ['GPE', 'ORG', 'EVENT']
    matches = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            matches.append((entity.text, entity.label_))
    return matches

# 📉 Sentiment
def get_sentiment(text):
    """Classify `text` with the module-level `sentiment_pipeline`.

    Only the first 512 characters of `text` are passed to the pipeline; this
    is a character slice, not a token count.

    Returns:
        A (label, score) tuple read from the first result the pipeline
        returns, or (None, None) if slicing or inference raises an ordinary
        exception (for example when `text` is not a string).
    """
    try:
        result = sentiment_pipeline(text[:512])[0]
        return result['label'], result['score']
    except Exception:
        # Best-effort: one bad row must not abort a whole DataFrame apply.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can
        # still be interrupted.
        return None, None

# ⚠️ Disruption keyword detection
disruption_keywords = [
    'strike', 'earthquake', 'flood', 'shutdown',
    'explosion', 'delay', 'fire', 'lockdown',
]


def detect_disruption(text):
    """Return True if any entry of `disruption_keywords` occurs in `text`.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (e.g. 'flood' matches 'floodwaters').
    """
    lowered = text.lower()
    for term in disruption_keywords:
        if term in lowered:
            return True
    return False

# 🚀 Run everything
tqdm.pandas()  # enables .progress_apply on pandas objects

headlines = df['text']

# Entities: one list of (text, label) tuples per headline.
df['entities'] = headlines.progress_apply(extract_ner)

# Sentiment: each (label, score) tuple is expanded into two columns.
sentiment_parts = headlines.progress_apply(
    lambda headline: pd.Series(get_sentiment(headline))
)
df[['sentiment', 'sentiment_score']] = sentiment_parts

# Keyword flag: plain apply, no progress bar.
df['is_disruption'] = headlines.apply(detect_disruption)

# 🎯 Final output
preview_columns = ['text', 'entities', 'sentiment', 'sentiment_score', 'is_disruption']
df[preview_columns].head(10)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 121.86it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:23<00:00, 42.48it/s]


Unnamed: 0,text,entities,sentiment,sentiment_score,is_disruption
0,virtual reality trial ahead of fire season in ...,[],POSITIVE,0.995874,True
1,farmers prepare for ec funding,"[(ec, ORG)]",POSITIVE,0.801793,False
2,the sunday inquisition august 10,[],NEGATIVE,0.504771,False
3,news csg reax,[],POSITIVE,0.9787,False
4,rosetta spacecraft on final approach to comet ...,[],POSITIVE,0.973712,False
5,milne's lawyer wants access to police notes,[],NEGATIVE,0.998131,False
6,needle found in mandarin amid sa fruit contami...,[],NEGATIVE,0.98596,False
7,nrn prawn plan,[],NEGATIVE,0.985375,False
8,tiger woods dominates presidents cup day three,[],POSITIVE,0.999425,False
9,how long does it take to lose fitness,[],NEGATIVE,0.997564,False


In [3]:
# 🛠️ Parse the publish_date column into datetimes using the YYYYMMDD format
df['publish_date'] = df['publish_date'].pipe(pd.to_datetime, format='%Y%m%d')

# 📌 Separate entities by type
def split_entities(entities):
    """Group (text, label) entity pairs by label.

    Args:
        entities: iterable of (entity text, label) pairs.

    Returns:
        A three-element pandas Series holding, in order, the list of ORG
        texts, the list of GPE texts and the list of EVENT texts. Pairs with
        any other label are ignored; order within each list follows the input.
    """
    buckets = {'ORG': [], 'GPE': [], 'EVENT': []}
    for ent_text, ent_label in entities:
        bucket = buckets.get(ent_label)
        if bucket is not None:
            bucket.append(ent_text)
    return pd.Series([buckets['ORG'], buckets['GPE'], buckets['EVENT']])

# One list column per entity type, derived from the combined 'entities' column
entity_columns = df['entities'].apply(split_entities)
df[['ORGs', 'Locations', 'Events']] = entity_columns

# 🧹 Keep only disruption-relevant entries, 🗃️ restricted to the relevant columns
summary_columns = [
    'publish_date', 'text', 'ORGs', 'Locations', 'Events',
    'sentiment', 'sentiment_score'
]
disruption_df = df.loc[df['is_disruption'], summary_columns].copy()

# 💾 Save for mapping/time-series (index is not written)
disruption_df.to_csv("disruption_news_summary.csv", index=False)

# 🖼️ Preview
disruption_df.head(10)


Unnamed: 0,publish_date,text,ORGs,Locations,Events,sentiment,sentiment_score
0,2018-10-17,virtual reality trial ahead of fire season in ...,[],[],[],POSITIVE,0.995874
19,2017-04-28,amateur video of three carbridge buses on fire at,[],[],[],NEGATIVE,0.999565
55,2009-05-06,us reporter ends iran jail hunger strike,[],"[us, iran]",[],NEGATIVE,0.944044
108,2005-09-27,funding woes create goulburn hospital fire haz...,[],[],[],NEGATIVE,0.998536
146,2004-05-07,postal workers to strike over eba,[],[],[],NEGATIVE,0.995851
187,2017-07-27,ohio amusement park ride fire ball breaks apart,[],[ohio],[],NEGATIVE,0.979555
189,2009-11-18,gooseponds flood report findings loom,[],[],[],NEGATIVE,0.97143
208,2019-10-19,firestorm: shaken faith in fire management ser...,[],[],[],POSITIVE,0.998188
220,2013-01-31,floodwaters cut roads in maitland district,[],[],[],POSITIVE,0.577718
221,2008-10-23,teachers threaten strike over licence renewal,[],[],[],NEGATIVE,0.995716
