<a href="https://colab.research.google.com/github/rmiddha03/A2Task2/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import urllib.parse
import string

# ---------- HELPER FUNCTION TO EXTRACT ARTICLE TITLE ----------
def extract_article_title(input_string):
    """Return the Wikipedia article title named by a plain title or a URL.

    Accepts a bare title (``"Albert Einstein"``), a ``/wiki/<title>`` URL,
    or an ``index.php?title=<title>`` URL.  For URLs, the query string and
    fragment are discarded and percent-escapes are decoded, so the caller
    receives the readable title and can URL-encode it exactly once.

    Args:
        input_string: Raw text typed or pasted by the user.

    Returns:
        The title as a string, with surrounding whitespace removed.  An
        empty string is returned when nothing usable remains.
    """
    # Trim only whitespace, edge underscores and the characters MediaWiki
    # forbids in page titles.  Stripping all of string.punctuation would cut
    # real title characters such as the ")" in "Mercury_(planet)", the "+"
    # in "C++" or the final "." in "Martin Luther King Jr.".
    edge_junk = string.whitespace + "_#<>[]{}|"

    text = input_string.strip(edge_junk)
    if not text.lower().startswith(("http://", "https://")):
        return text

    try:
        parts = urllib.parse.urlsplit(text)
    except ValueError:
        # Malformed URL (e.g. unbalanced "[" in the host part): fall back to
        # the last "/"-separated segment of the raw text.
        last_segment = text.rstrip("/").rsplit("/", 1)[-1]
        return urllib.parse.unquote(last_segment).strip(edge_junk)

    # index.php-style links carry the title in the query string; parse_qs
    # already percent-decodes the value.
    titles_in_query = urllib.parse.parse_qs(parts.query).get("title")
    if titles_in_query:
        title = titles_in_query[0]
    else:
        path = parts.path.rstrip("/")
        # Take everything after "/wiki/" so titles that contain a slash
        # (e.g. "AC/DC") are kept whole; otherwise use the last segment.
        _, marker, after_wiki = path.partition("/wiki/")
        raw_title = after_wiki if marker else path.rsplit("/", 1)[-1]
        title = urllib.parse.unquote(raw_title)

    return title.strip(edge_junk)

# ---------- USER INPUTS ----------
# Prompt for the two articles to compare and the date range to chart.
article1_input = input("Enter the first Wikipedia article title or URL: ")
article2_input = input("Enter the second Wikipedia article title or URL: ")
# Trim the date strings: strptime rejects any leading or trailing whitespace,
# which is easy to introduce when pasting a date.
start_date_str = input("Enter start date (YYYY-MM-DD): ").strip()
end_date_str   = input("Enter end date (YYYY-MM-DD): ").strip()

# Reduce each entry (plain title or URL) to the bare article title.
article1 = extract_article_title(article1_input)
article2 = extract_article_title(article2_input)

# strptime raises ValueError when the text does not match YYYY-MM-DD.
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
end_date   = datetime.strptime(end_date_str, "%Y-%m-%d")

# ---------- HELPER FUNCTION TO FETCH PAGEVIEWS ----------
def get_daily_pageviews(article_title, start_dt, end_dt, timeout=30):
    """Fetch daily pageview counts for one English-Wikipedia article.

    Calls the Wikimedia REST ``metrics/pageviews/per-article`` endpoint for
    ``en.wikipedia.org`` with access ``all-access``, agent ``user`` and
    ``daily`` granularity.

    Args:
        article_title: Article title; spaces are replaced by underscores and
            the result is percent-encoded before being placed in the URL.
        start_dt: ``datetime`` for the start of the range (sent as YYYYMMDD).
        end_dt: ``datetime`` for the end of the range (sent as YYYYMMDD).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``DataFrame`` with columns ``date`` and ``views``, sorted by date
        with a fresh 0-based index.  If the request fails, the status is not
        200, or the response has no ``items``, a message is printed and an
        empty ``DataFrame`` is returned.
    """
    # safe="" makes quote() escape "/" as well; with the default, a title
    # such as "AC/DC" would be split into two URL path segments.
    encoded_title = urllib.parse.quote(article_title.replace(" ", "_"), safe="")

    start_str = start_dt.strftime("%Y%m%d")
    end_str = end_dt.strftime("%Y%m%d")

    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article"
    # Use 'user' as the agent (not 'page')
    url = f"{base_url}/en.wikipedia.org/all-access/user/{encoded_title}/daily/{start_str}/{end_str}"

    headers = {
        "User-Agent": "WikiPageviewsFetcher/1.0 (nipunbhalla80@gmail.com)"
    }

    # Without a timeout requests can block indefinitely; connection problems
    # are reported the same way as HTTP errors rather than as a traceback.
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching data for {encoded_title}: {exc}")
        return pd.DataFrame()

    if resp.status_code != 200:
        print(f"Error fetching data for {encoded_title}: {resp.status_code}")
        return pd.DataFrame()

    try:
        data = resp.json()
    except ValueError:
        # Body was not valid JSON; treat it like a response without items.
        data = None
    if not isinstance(data, dict) or 'items' not in data:
        print(f"No data found for {encoded_title}")
        return pd.DataFrame()

    # Only the first 8 characters of each timestamp (YYYYMMDD) are used.
    records = [
        (datetime.strptime(item['timestamp'][:8], "%Y%m%d"), item['views'])
        for item in data['items']
    ]

    df = pd.DataFrame(records, columns=['date', 'views'])
    return df.sort_values('date').reset_index(drop=True)

# ---------- FETCH DATA ----------
df1 = get_daily_pageviews(article1, start_date, end_date)
df2 = get_daily_pageviews(article2, start_date, end_date)

# Give each frame its own views column so the two can be merged on 'date'.
# If both inputs resolve to the same title the names would collide and
# pd.merge would rename them with _x/_y suffixes, breaking the lookups
# below, so the second column gets a distinguishing suffix instead.
views_col1 = f"{article1}_views"
views_col2 = f"{article2}_views"
if views_col1 == views_col2:
    views_col2 = f"{article2}_views_2"

df1.rename(columns={'views': views_col1}, inplace=True)
df2.rename(columns={'views': views_col2}, inplace=True)

if df1.empty or df2.empty:
    print("One or both dataframes are empty. Check article titles and date range.")
else:
    # Outer merge keeps days present for only one article; those gaps
    # become NaN and are filled with 0 views.
    merged_df = pd.merge(df1, df2, on='date', how='outer')
    merged_df.sort_values('date', inplace=True)
    merged_df.fillna(0, inplace=True)
    # NaN gaps turn the count columns into floats; restore integer counts.
    merged_df = merged_df.astype({views_col1: int, views_col2: int})

    # ---------- PLOT ----------
    plt.figure(figsize=(10,5))
    plt.plot(merged_df['date'], merged_df[views_col1], label=article1)
    plt.plot(merged_df['date'], merged_df[views_col2], label=article2)
    plt.title("Wikipedia Pageviews Comparison")
    plt.xlabel("Date")
    plt.ylabel("Daily Views")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # ---------- DISPLAY ----------
    print(merged_df.head(30))
