In [None]:
!pip install requests beautifulsoup4 pandas tqdm newspaper3k

import json
import pandas as pd
import requests
from newspaper import Article
from tqdm import tqdm
import re
import os
import time
from google.colab import drive

# Expose Google Drive inside the Colab runtime so the dataset file is readable.
drive.mount('/content/drive')


# The input is read line by line, with each line parsed as its own JSON
# document; lines that fail to parse are reported and dropped.
data_path = '/content/drive/My Drive/Colab Notebooks/Project/Sarcasm_Headlines_Dataset_v2.json'
valid_data = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            record = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Skipping invalid line: {line.strip()} (Error: {e})")
        else:
            valid_data.append(record)

# One DataFrame row per successfully parsed line.
df = pd.DataFrame(valid_data)

# Quick sanity report before any scraping starts.
print("Number of valid rows:", len(df))
print(df.head())
print("Columns:", df.columns)

# Abort early: nothing downstream can work without at least one parsed row.
if df.empty:
    raise ValueError("Dataset not found")

def normalize_link(link):
    """Rewrite known outlet URLs to their canonical host.

    Any ``http``/``https`` URL on ``theonion.com`` (bare or with one
    subdomain label) is pointed at ``https://www.theonion.com/``; any such URL
    on ``huffingtonpost.com`` is pointed at ``https://www.huffpost.com/``.
    The path after the host is kept as-is. Other URLs are returned unchanged.

    Args:
        link: The article URL as a string.

    Returns:
        The URL with the host prefix rewritten where a rule matched.
    """
    host_rewrites = (
        (r'https?://(?:\w+\.)?theonion\.com/', 'https://www.theonion.com/'),
        (r'https?://(?:\w+\.)?huffingtonpost\.com/', 'https://www.huffpost.com/'),
    )
    # Rules are applied in order, each on the output of the previous one.
    for host_pattern, canonical_prefix in host_rewrites:
        link = re.sub(host_pattern, canonical_prefix, link)
    return link

def _download_and_clean(url, empty_message):
    """Download ``url`` with newspaper and return its cleaned body text.

    Raises:
        ValueError: with ``empty_message`` when the parser extracts no text.
        Exception: whatever ``Article.download``/``Article.parse`` raise.
    """
    article = Article(url)
    article.download()
    article.parse()
    article_text = article.text.strip()

    if not article_text:
        raise ValueError(empty_message)

    # Collapse all whitespace runs to single spaces, then keep only word
    # characters, whitespace and basic sentence punctuation.
    article_text = re.sub(r'\s+', ' ', article_text).strip()
    return re.sub(r'[^\w\s.,!?]', '', article_text)


def fetch_full_article(original_link):
    """Fetch and clean the article body for ``original_link``.

    The live page (after ``normalize_link``) is tried first. If that fails for
    any reason, the Wayback Machine availability API is asked for the closest
    snapshot of the *original* link and that snapshot is fetched instead.

    This is a best-effort scraper: every failure is printed and swallowed so
    one bad link never aborts a long run.

    Args:
        original_link: URL of the article as stored in the dataset.

    Returns:
        The cleaned article text, or ``None`` when neither the live page nor
        an archived snapshot yielded text.
    """
    try:
        link = normalize_link(original_link)
        print(f"Attempting to fetch article from: {link}")
        article_text = _download_and_clean(link, "No text extracted from direct link")
        print(f"Successfully fetched article from {link} (Length: {len(article_text)} characters)")
        return article_text
    except Exception as e:
        print(f"Direct fetch failed for {original_link}: {e}. Trying Wayback Machine...")

    try:
        # Pass the link via ``params`` so requests percent-encodes it; pasting
        # it into the query string breaks on links that contain ?, & or #.
        response = requests.get(
            "https://archive.org/wayback/available",
            params={"url": original_link},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()

        closest = data.get('archived_snapshots', {}).get('closest', {})
        if not closest.get('available', False):
            print(f"No archive available for {original_link}. Skipping...")
            return None

        archived_url = closest['url']
        print(f"Attempting to fetch from Wayback Machine: {archived_url}")
        article_text = _download_and_clean(archived_url, "No text extracted from archive")
        print(f"Successfully fetched article from Wayback Machine for {original_link} (Length: {len(article_text)} characters)")
        return article_text
    except Exception as archive_e:
        print(f"Wayback fetch failed for {original_link}: {archive_e}. Skipping...")
        return None

def save_to_csv(data, output_path, mode='w'):
    """Write ``data`` to ``output_path`` as UTF-8 CSV without the index.

    When appending (``mode`` starting with ``'a'``) to a file that already has
    content, the header row is omitted; otherwise pandas would insert a new
    header line in the middle of the file on every batch. In that case the
    columns of ``data`` must be in the same order as the existing file's.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. a list of row dicts.
        output_path: Destination CSV path.
        mode: File mode forwarded to ``DataFrame.to_csv`` (default ``'w'``).
    """
    temp_df = pd.DataFrame(data)
    appending_to_existing = (
        mode.startswith('a')
        and os.path.exists(output_path)
        and os.path.getsize(output_path) > 0
    )
    temp_df.to_csv(
        output_path,
        index=False,
        encoding='utf-8',
        mode=mode,
        header=not appending_to_existing,
    )
    print(f"Saved {len(temp_df)} articles to {output_path}")

# Run configuration and running state for the scrape.
output_path = '/content/drive/My Drive/Colab Notebooks/Project/Extended_Sarcasm_Dataset.csv'
batch_size = 500     # flush to disk every time this many articles succeeded
success_count = 0    # total articles fetched so far
new_data = []        # rows fetched since the last flush

# Seed the output with a header-only CSV if it does not exist yet.
if not os.path.exists(output_path):
    header_only = pd.DataFrame(columns=['headline', 'full_text', 'is_sarcastic', 'article_link'])
    header_only.to_csv(output_path, index=False, encoding='utf-8')

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing articles"):
    print(f"\nProcessing article {index + 1}/{len(df)}: {row['headline']}")
    full_text = fetch_full_article(row['article_link'])

    if not full_text:
        print(f"Skipped article {index + 1} due to fetch failure")
    else:
        record = {
            'headline': row['headline'],
            'full_text': full_text,
            'is_sarcastic': row['is_sarcastic'],
            'article_link': row['article_link']
        }
        new_data.append(record)
        success_count += 1
        print(f"Added to dataset (Success count: {success_count})")

        # Periodic flush so an interrupted session loses at most one batch.
        if success_count % batch_size == 0:
            save_to_csv(new_data, output_path, mode='a')
            print(f"Cleared batch data after saving {success_count} articles")
            new_data = []

    # Pause between requests regardless of outcome.
    time.sleep(1)

# Flush whatever is left over from the last partial batch.
if new_data:
    save_to_csv(new_data, output_path, mode='a')

# Read the file back and report what was written.
final_df = pd.read_csv(output_path)
print(f"Number of valid rows in final dataset: {len(final_df)}")
print(f"Total articles successfully fetched: {success_count}/{len(df)}")
print(final_df.head())