In [None]:
import os
import ast
import torch
import argparse
import evaluate
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from accelerate import Accelerator
from torch.utils.data import DataLoader
from seqeval.metrics import classification_report
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler, DataCollatorForTokenClassification

In [None]:
# import requests

# # Replace with your actual API key

# https://www.opensanctions.org/entities/NK-jFf3vbLSJD5sviAomgsney/
# API_KEY = "your_api_key_here"
# url = "https://api.opensanctions.org/v1/persons/"  # Example endpoint

# headers = {
#     "Authorization": f"Bearer {API_KEY}",
#     "Content-Type": "application/json"
# }

# # Optional: add query parameters if needed
# params = {
#     "name": "John Doe"  # Example: Searching for a person by name
# }

# response = requests.get(url, headers=headers, params=params)

# if response.status_code == 200:
#     data = response.json()  # Get the JSON response
#     print(data)
# else:
#     print(f"Error: {response.status_code} - {response.text}")


In [None]:
# Data generation

In [None]:

# Install the third-party packages used by this notebook.
# NOTE(review): the import cell at the top of the notebook already imports
# evaluate, accelerate, seqeval, datasets and transformers, so on a fresh
# kernel without these packages that cell would fail before this one runs —
# confirm the intended cell order.
# NOTE(review): versions are unpinned; the saved output of this cell shows
# datasets 3.2.0, evaluate 0.4.3 and seqeval 1.2.2 being resolved.
!pip install datasets
!pip install transformers
!pip install accelerate -U
!pip install evaluate
!pip install seqeval
# `requests` is only referenced by the commented-out API example cell above.
!pip install requests

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=82d84ca128d4cf5f732739ab3978a7bf09a0cec04d35f4cf31d7e3432ae71403
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packa

Generating Negative News: The function generate_negative_news() creates random negative news examples based on predefined templates.
NER Extraction: The perform_ner() function processes each news article using spaCy’s NER capabilities and extracts named entities.

In [None]:
# Install spaCy; the NER cell further down imports it and loads "en_core_web_sm".
!pip install spacy




In [None]:
import random
# Function to generate negative news examples
def generate_negative_news(rng=None):
    """Build one synthetic negative-news headline from a random template.

    A headline template is picked at random, one value is drawn for every
    known slot, and the template is filled in with ``str.format``. Slots that
    the chosen template does not mention are ignored.

    Args:
        rng: Optional object exposing ``choice(seq)`` (for example a seeded
            ``random.Random``). When omitted, the module-level ``random``
            generator is used, so existing zero-argument calls keep working.

    Returns:
        str: The filled-in headline, e.g.
        ``"XYZ Corp Faces severe Lawsuit Over corruption charges"``.

    Note:
        A later cell in this notebook defines another function with the same
        name that takes a ``prompt`` argument; once that cell has run, the
        name refers to that function instead of this one.
    """
    chooser = random if rng is None else rng

    # Slots are read straight from the placeholders in each template, so there
    # is no separate per-template slot list that could drift out of sync.
    news_templates = [
        "{organization} Faces {adjective} Lawsuit Over {incident}",
        "{person} Resigns Amid {scandal}",
        "{person} Faces Charges After {crime} Scheme",
        "{organization} Under Investigation for {incident} After {event}",
        "{organization} Faces {legal_issue} Over Discriminatory {practice}",
    ]

    # Candidate values for every slot. The insertion order is the order in
    # which values are drawn, so it determines the output for a given seed.
    vocabulary = {
        "organization": ["XYZ Corp", "ABC Enterprises", "DEF Bank", "Global Tech Inc.", "ABC Airlines"],
        "person": ["John Smith", "Jane Doe", "Mike Johnson", "Sarah Connor"],
        "incident": ["data breach", "financial scandal", "safety violations", "corruption charges"],
        "crime": ["kickback", "fraudulent", "money laundering"],
        "scandal": ["financial scandal", "illegal funding", "bribery"],
        "legal_issue": ["class-action lawsuit", "legal scrutiny", "corruption investigation"],
        "practice": ["lending practices", "hiring practices", "environmental standards"],
        "adjective": ["record-breaking", "massive", "unprecedented", "severe"],
        "event": ["deadly crash", "collapse of stock", "public protest", "failing inspection"],
    }

    # Pick the template first, then one value per slot.
    template = chooser.choice(news_templates)
    selected_entities = {
        slot: chooser.choice(options) for slot, options in vocabulary.items()
    }

    return template.format(**selected_entities)

# Example usage: print five generated headlines, each followed by a dashed divider.
for _ in range(5):
    print(f"{generate_negative_news()}\n\n{'-' * 50}\n")

Global Tech Inc. Under Investigation for financial scandal After deadly crash

--------------------------------------------------

XYZ Corp Under Investigation for corruption charges After failing inspection

--------------------------------------------------

XYZ Corp Under Investigation for safety violations After public protest

--------------------------------------------------

XYZ Corp Faces severe Lawsuit Over corruption charges

--------------------------------------------------

ABC Airlines Faces legal scrutiny Over Discriminatory lending practices

--------------------------------------------------



In [None]:
import spacy

# Load the pre-trained spaCy pipeline once at module level; perform_ner() below
# reads this `nlp` object on every call.
# NOTE(review): no cell in view downloads "en_core_web_sm" — presumably it is
# already available in the runtime; confirm before running on a fresh machine.
nlp = spacy.load("en_core_web_sm")

# Function to perform NER
def perform_ner(text):
    """Run the module-level spaCy pipeline on ``text`` and list its entities.

    Args:
        text: The string to analyse.

    Returns:
        list[tuple[str, str]]: One ``(entity_text, entity_label)`` pair per
        entity in ``doc.ents``, in the order the pipeline reports them.
    """
    parsed = nlp(text)
    found = []
    for span in parsed.ents:
        found.append((span.text, span.label_))
    return found

# Example usage: generate five template headlines and show the entities the
# spaCy pipeline finds in each one.
# NOTE: this calls generate_negative_news() with no arguments, i.e. the
# template-based version. A later cell redefines that name to require a
# `prompt`, so re-running this cell after that one raises a TypeError.
# NOTE: in the saved output the pipeline merges headline words into the entity
# span (e.g. 'Sarah Connor Faces Charges' tagged as ORG).
for _ in range(5):  # Generate 5 negative news articles
    news_article = generate_negative_news()
    print(f"News Article: {news_article}")

    # Get entities from the news article
    entities = perform_ner(news_article)
    print("Named Entities:", entities)
    print("\n" + "-"*50 + "\n")


News Article: Sarah Connor Faces Charges After kickback Scheme
Named Entities: [('Sarah Connor Faces Charges', 'ORG')]

--------------------------------------------------

News Article: Sarah Connor Faces Charges After fraudulent Scheme
Named Entities: [('Sarah Connor Faces Charges', 'ORG')]

--------------------------------------------------

News Article: Global Tech Inc. Faces corruption investigation Over Discriminatory hiring practices
Named Entities: [('Global Tech Inc. Faces', 'ORG')]

--------------------------------------------------

News Article: DEF Bank Under Investigation for corruption charges After public protest
Named Entities: [('DEF Bank Under Investigation', 'ORG')]

--------------------------------------------------

News Article: DEF Bank Faces corruption investigation Over Discriminatory environmental standards
Named Entities: [('DEF Bank Faces', 'ORG')]

--------------------------------------------------



In [None]:
# Install transformers and torch for the text-generation cells below.
# NOTE(review): both packages are already imported by the first cell, and
# transformers is also installed by the earlier install cell — this may be redundant.
!pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# from transformers import LlamaForCausalLM, LlamaTokenizer
# import torch

# # Step 1: Load the model and tokenizer
# model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"  # You can replace this with the correct model path if you're using a specific version of Llama
# tokenizer = LlamaTokenizer.from_pretrained(model_name)
# model = LlamaForCausalLM.from_pretrained(model_name)

# # Step 2: Define a function to generate negative news
# def generate_negative_news(prompt):
#     # Encode the prompt into tokens
#     inputs = tokenizer(prompt, return_tensors="pt")

#     # Generate a response from the model
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_length=250,  # Maximum length of the output (can adjust as needed)
#             num_return_sequences=1,  # Number of sequences to generate
#             no_repeat_ngram_size=2,  # Avoid repeating n-grams (to improve output diversity)
#             top_p=0.95,  # Sampling technique
#             top_k=50,  # Number of top candidates to sample from
#             temperature=0.7,  # Control randomness (lower is more deterministic)
#             pad_token_id=tokenizer.eos_token_id
#         )

#     # Decode the generated tokens into text
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return generated_text

# # Step 3: Set up a prompt to generate negative news about organizations, individuals, or scandals
# prompt = (
#     "Generate a negative news article about a company or individual involved in a major scandal. "
#     "The company or individual should face accusations of corruption, fraud, or other illegal activity. "
#     "Include details like a lawsuit, CEO resignation, or criminal charges, and mention financial repercussions."
# )

# # Step 4: Generate and print the negative news article
# negative_news_article = generate_negative_news(prompt)
# print(negative_news_article)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import transformers
# Record the installed transformers and torch versions in the cell output.
print(transformers.__version__)
print(torch.__version__)

def load_model(model_path):
    """Load a causal language model and its tokenizer from ``model_path``.

    Args:
        model_path: Location passed unchanged to both ``from_pretrained``
            calls. The tokenizer is loaded with ``local_files_only=True``.

    Returns:
        tuple: ``(model, tokenizer)``. The model is loaded with
        ``device_map="auto"`` and ``torch_dtype=torch.bfloat16``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    # NOTE(review): only the tokenizer is restricted to local files; the model
    # load below does not pass local_files_only — confirm this is intentional.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    return model, tokenizer
def prompt_model(model, tokenizer, prompt_user, prompt_system=None, verbose=True, max_new_token=128, temperature=0.5, top_p=0.9, do_sample=True):
    """Send a chat-formatted prompt to ``model`` and return the decoded output.

    Args:
        model: Object exposing ``device`` and ``generate(inputs, **kwargs)``.
        tokenizer: Object exposing ``apply_chat_template``, ``encode`` and
            ``decode``.
        prompt_user: Content of the user message.
        prompt_system: Optional system instruction. When truthy it is sent as
            a separate message with role ``"system"`` ahead of the user
            message.
        verbose: When true, print the decoded response before returning it.
        max_new_token: Forwarded to ``generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``generate``.
        top_p: Forwarded to ``generate``.
        do_sample: Forwarded to ``generate``.

    Returns:
        str: ``tokenizer.decode(outputs[0])`` — the first generated sequence,
        decoded without ``skip_special_tokens``.
    """
    messages = []
    if prompt_system:
        # The system instruction gets the "system" role; sending it as a
        # second "user" turn makes the template treat it as user input.
        # NOTE(review): assumes the tokenizer's chat template accepts a
        # system role — confirm for the checkpoint in use.
        messages.append({"role": "system", "content": prompt_system})
    messages.append({"role": "user", "content": prompt_user})

    # add_generation_prompt=True ends the rendered text with the assistant
    # header, so the model answers instead of continuing the user turn.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already carries its special tokens; do not let
    # encode() add them a second time.
    inputs = tokenizer.encode(
        input_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        inputs,
        max_new_tokens=max_new_token,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
    )
    response = tokenizer.decode(outputs[0])
    if verbose:
        print(response)
    return response

4.48.2
2.5.1+cu124


In [None]:
# model, tokenizer = load_model("/h/ws_yuehuan_he/green-ai-explore/models/Llama-3.2-3b-Instruct")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Step 1: Load the model and tokenizer.
# These module-level `tokenizer` and `model` objects are read by
# generate_negative_news(prompt) defined below.
model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Can be replaced with another Hugging Face model id or path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,device_map = "auto")

# Step 2: Choose a device string from CUDA availability.
# NOTE(review): this reports whether CUDA is available, not where
# device_map="auto" actually placed the model — confirm with `model.device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Model is loaded on: {device}")
# Step 3: Define a function to generate negative news
def generate_negative_news(prompt):
    """Generate text from ``prompt`` with the module-level model and tokenizer.

    Reads the globals ``tokenizer`` and ``model`` defined in the loading cell.
    This definition replaces the earlier zero-argument, template-based
    function of the same name.

    Args:
        prompt: Text to condition the generation on.

    Returns:
        str: The first output sequence decoded with
        ``skip_special_tokens=True``. In the saved run of this notebook the
        returned text begins with the prompt itself.
    """
    # Send the encoded prompt to the device the model reports, rather than to
    # a separately computed global device string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # NOTE(review): the prompt is tokenized as raw text, without the
    # tokenizer's chat template, although the checkpoint name says "Instruct"
    # — confirm whether apply_chat_template should be used here.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=250,  # Maximum length of the output (can adjust as needed)
            num_return_sequences=1,  # Number of sequences to generate
            no_repeat_ngram_size=2,  # Avoid repeating n-grams (to improve output diversity)
            # Sampling must be switched on for top_p / top_k / temperature to
            # take effect; without it every call with the same prompt decodes
            # greedily and returns the same text.
            do_sample=True,
            top_p=0.95,  # Sampling technique
            top_k=50,  # Number of top candidates to sample from
            temperature=0.7,  # Control randomness (lower is more deterministic)
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated tokens into text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Step 4: Build the instruction prompt (three sentences joined by single spaces)
# asking for a negative news article about an organization or individual.
prompt = " ".join(
    [
        "Generate a negative news article about a company or individual involved in a major scandal.",
        "The company or individual should face accusations of corruption, fraud, or other illegal activity.",
        "Include details like a lawsuit, CEO resignation, or criminal charges, and mention financial repercussions.",
    ]
)

# Step 5: Run one generation and show the result.
negative_news_article = generate_negative_news(prompt)
print(negative_news_article)



Model is loaded on: cuda




Generate a negative news article about a company or individual involved in a major scandal. The company or individual should face accusations of corruption, fraud, or other illegal activity. Include details like a lawsuit, CEO resignation, or criminal charges, and mention financial repercussions.


In [None]:

# Generate 2000 records in batches of 100
batch_size = 100  # Generate 100 records per batch
total_records = 2000
generated_records = []

# Round up so a total that is not an exact multiple of batch_size still
# produces every requested record (the last batch is simply shorter).
num_batches = -(-total_records // batch_size)

# Loop to generate articles in batches
for i in range(num_batches):
    remaining = total_records - len(generated_records)
    batch = [
        generate_negative_news(prompt)
        for _ in range(min(batch_size, remaining))
    ]

    # Store the batch of generated articles
    generated_records.extend(batch)

    # Print progress after each batch
    print(f"Generated batch {i+1}/{num_batches}.")

# Save the generated records to a text file, separated by blank lines.
# An explicit encoding keeps the file independent of the platform default.
with open("generated_negative_news.txt", "w", encoding="utf-8") as f:
    for record in generated_records:
        f.write(record + "\n\n")

# Report the number of records actually produced, not the requested number.
print(f"Generated {len(generated_records)} negative news articles.")

Generated batch 1/20.
Generated batch 2/20.
Generated batch 3/20.
Generated batch 4/20.
Generated batch 5/20.
Generated batch 6/20.
Generated batch 7/20.
Generated batch 8/20.
Generated batch 9/20.
Generated batch 10/20.
Generated batch 11/20.
Generated batch 12/20.
Generated batch 13/20.
Generated batch 14/20.
Generated batch 15/20.
Generated batch 16/20.
Generated batch 17/20.
Generated batch 18/20.
Generated batch 19/20.
