In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os # Import os module for file path handling

# --- Configuration ---
# IMPORTANT: Replace 'news_dataset.csv' with the full, correct path to YOUR file.
# Example: 'C:/Users/YourUser/Documents/news_data/news_dataset.csv' on Windows
# Example: '/home/youruser/data/news_dataset.csv' on Linux/Mac
file_path = 'news_dataset.csv'

# Column names assigned to the CSV, which is read as having no header row.
column_names = ['url', 'content', 'tag', 'label']


# --- Helpers ---
def load_news_data(path, names):
    """Read the header-less CSV at *path* into a DataFrame whose columns are *names*.

    Args:
        path: Location of the CSV file.
        names: Column names to assign, in file order.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If nothing exists at *path*.
        Any error raised by ``pd.read_csv`` propagates unchanged.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The file was not found at the specified path: {path}")
    return pd.read_csv(path, header=None, names=names)


def strip_text_values(series):
    """Return *series* with surrounding whitespace removed from its string entries.

    Entries that are not ``str`` (numbers, missing values) are passed through
    unchanged. The ``.str`` accessor is deliberately avoided: it raises
    ``AttributeError`` on a column pandas parsed as non-text (e.g. 0/1 labels).
    """
    return series.map(lambda value: value.strip() if isinstance(value, str) else value)


# --- Load Data ---
# Only the load step is guarded, so the "CSV format" hint below is printed for
# genuine read/parse failures and never for an unrelated cleaning or plotting bug.
try:
    df = load_news_data(file_path, column_names)
except FileNotFoundError as fnf_error:
    print(f"Error: {fnf_error}")
    print("Please ensure the 'file_path' variable in the script points to the correct location of your CSV file.")
except (OSError, ValueError) as e:
    # pandas' ParserError / EmptyDataError and UnicodeDecodeError are all ValueError subclasses.
    print(f"An error occurred: {e}")
    print("There might be an issue with the CSV format (e.g., delimiter, encoding) or the expected column structure.")
else:
    print("Data loaded successfully. First 5 rows:")
    print(df.head())
    print("\nData Info:")
    df.info()

    # --- Basic Data Cleaning ---
    # Remove leading/trailing whitespace from the tag and label columns.
    for text_column in ('tag', 'label'):
        df[text_column] = strip_text_values(df[text_column])

    tag_counts = df['tag'].value_counts()
    label_counts = df['label'].value_counts()

    print("\nValue Counts for 'tag':")
    print(tag_counts)
    print("\nValue Counts for 'label':")
    print(label_counts)

    # Most frequent first; computed once and shared by every plot that orders by tag.
    tag_order = tag_counts.index
    label_order = label_counts.index

    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette` without
    # `hue` (Visualizations 1 and 2). Left as-is so the cell keeps working on
    # older seaborn releases; revisit once a minimum seaborn version is pinned.

    # --- Visualization 1: Distribution of News Tags ---
    plt.figure(figsize=(10, 6))
    # Horizontal bars ('y') keep long tag names readable.
    sns.countplot(data=df, y='tag', order=tag_order, palette='viridis')
    plt.title('Distribution of News Articles by Tag')
    plt.xlabel('Number of Articles')
    plt.ylabel('Tag')
    plt.tight_layout()  # Adjust layout to prevent labels overlapping
    plt.show()

    # --- Visualization 2: Distribution of Fake vs. Real News ---
    plt.figure(figsize=(7, 5))
    sns.countplot(data=df, x='label', order=label_order, palette='coolwarm')
    plt.title('Overall Distribution of Fake vs. Real News Articles')
    plt.xlabel('Label (Fake/Real)')
    plt.ylabel('Number of Articles')
    plt.tight_layout()
    plt.show()

    # --- Visualization 3: Fake vs. Real News Distribution within Each Tag ---
    plt.figure(figsize=(12, 7))
    # Horizontal bars again, split by label within each tag.
    sns.countplot(data=df, y='tag', hue='label', order=tag_order, palette='coolwarm')
    plt.title('Fake vs. Real News Distribution within Each Tag')
    plt.xlabel('Number of Articles')
    plt.ylabel('Tag')
    plt.legend(title='Label')
    plt.tight_layout()
    plt.show()

    # --- (Optional) Visualization 4: Heatmap of Tag vs. Label Counts ---
    # Cross-tabulation (contingency table) of tag against label.
    tag_label_counts = pd.crosstab(df['tag'], df['label'])

    if tag_label_counts.empty:
        # sns.heatmap cannot scale its colors on an empty table, so skip it.
        print("\nNo rows have both a tag and a label; skipping the heatmap.")
    else:
        plt.figure(figsize=(10, 7))
        # fmt='d' prints the counts as integers; YlGnBu is the color map.
        sns.heatmap(tag_label_counts, annot=True, fmt='d', cmap='YlGnBu')
        plt.title('Heatmap of News Counts by Tag and Label')
        plt.xlabel('Label')
        plt.ylabel('Tag')
        plt.tight_layout()
        plt.show()

