In [None]:
import os
# Diagnostic cell: show where the kernel is running and what the relative
# "train_data" folder holds, so path problems surface before any loading.
print("cwd:", os.getcwd())
if os.path.isdir("train_data"):
    # os.listdir returns entries in arbitrary order; sort so the output is
    # the same on every run and easy to scan.
    print("train_data contains:", sorted(os.listdir("train_data")))
else:
    # Report the problem instead of letting os.listdir raise
    # FileNotFoundError / NotADirectoryError: a missing folder is exactly
    # the situation this cell is meant to make visible.
    print("train_data contains: <not found, or not a directory, under cwd>")


In [None]:
import os
from pathlib import Path

# Path diagnostics for the news file, reported from the broadest check
# (working directory) down to the narrowest (the file itself).
# `p` is relative, so every check below is evaluated against the kernel's cwd.
p = Path("train_data", "news.tsv")

# Where is the kernel running from?
print("cwd:", os.getcwd())

# Is the data folder present, and does it hold news.tsv?
print("train_data folder exists?", os.path.isdir(p.parent))
print("news.tsv exists?", os.path.isfile(p))

# Absolute location the relative path points at, plus a pathlib-side
# existence check of the same path.
print("Resolved path:", p.resolve())
print("Path exists?", p.exists())


In [None]:
import pandas as pd
import numpy as np

# Read the tab-separated news table into `news_df`.
# The file is read with header=None, so the column names below are assigned
# positionally, in the order the fields appear in each row.
# NOTE(review): title/abstract are free text; if they can contain double
# quotes, confirm that pandas' default quote handling is acceptable here.
news_cols = [
    "newsID",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]
news_df = pd.read_csv(
    "train_data/news.tsv",
    sep="\t",
    header=None,
    names=news_cols,
)

In [None]:

# Data-quality overview of `news_df`.
# Side effect: adds two integer helper columns to `news_df`
# (`title_length`, `abstract_length`); missing text counts as length 0.

# Check for duplicates
# Default keep="first": only the repeat occurrences of a newsID are counted,
# the first appearance of each ID is not.
duplicate_news = news_df.duplicated(subset=["newsID"]).sum()
print(f"Duplicate news IDs: {duplicate_news}")

# Check unique titles
# keep=False flags *every* row whose title occurs more than once, so this
# number is "rows sharing a title with another row" and is not directly
# comparable to the newsID count above.
duplicate_titles = news_df[news_df.duplicated(subset=["title"], keep=False)]
print(f"Duplicate news titles: {len(duplicate_titles)}")

# Missing values
print("\nMissing values in news data:")
print(news_df.isnull().sum())
print(f"Percentage of abstracts missing: {news_df['abstract'].isnull().mean()*100:.2f}%")

# Check title and abstract lengths
# Both columns get fillna("") before measuring. Without it, .str.len()
# yields NaN for a missing value and `NaN < threshold` is False, so a missing
# title would silently escape the short-title check below while a missing
# abstract would be counted.
news_df["title_length"] = news_df["title"].fillna("").str.len()
news_df["abstract_length"] = news_df["abstract"].fillna("").str.len()

# Identify very short titles/abstracts (potential quality issues)
# Missing values have length 0, so they are included in these counts; compare
# with the missing-value report above to separate "missing" from "short".
print(f"\nVery short titles (<10 chars): {(news_df['title_length'] < 10).sum()}")
print(f"Very short abstracts (<20 chars): {(news_df['abstract_length'] < 20).sum()}")

# Examine category distribution
category_counts = news_df["category"].value_counts()
print("\nCategory distribution:")
print(category_counts)

# Potential adjustments based on exploration:
# 1. Filter out articles with missing abstracts if needed
# 2. Set minimum length requirements for titles and abstracts
# 3. Balance categories if severely imbalanced