In [None]:
import pandas as pd

# Raw-file GitHub location of the sampled news dataset (CSV)
url = "https://raw.githubusercontent.com/shreyasahay99/news_categorization_llm/main/sampled_news_data.csv"

# Fetch the CSV from the URL and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows as plain text
print(df.head(n=5))


                                                link  \
0  https://www.huffingtonpost.com/entry/how-to-ma...   
1  https://www.huffingtonpost.com/entry/uber-ad-n...   
2  https://www.huffingtonpost.com/entry/the-progr...   
3  https://www.huffingtonpost.com/entry/dont-let-...   
4  https://www.huffingtonpost.com/entry/what-you-...   

                                            headline  category  \
0                  How to Manage Your Personal Brand  BUSINESS   
1  It Looks Like Uber's Winning Its War With New ...  BUSINESS   
2      The Progressive Promise of Today's Technology  BUSINESS   
3   Don't Let These 5 Confusing Words Mar Your Image  BUSINESS   
4        What You Don't Know About Overnight Success  BUSINESS   

                                   short_description  \
0  Make no mistake: If you have a Facebook accoun...   
1                                  Grab the popcorn.   
2  A digital policy for the new century, tailored...   
3  Tom's an articulate physician, totally 

In [None]:
# Count the rows belonging to each distinct value of the 'category' column
# (the bare expression is rendered as the cell's output).
df["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,2000
CRIME,2000
ENTERTAINMENT,2000
POLITICS,2000
SCIENCE,2000
SPORTS,2000
STYLE & BEAUTY,2000
TECH,2000
TRAVEL,2000
WELLNESS,2000


In [None]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date'], dtype='object')

In [None]:
# Remove the 'link', 'authors' and 'date' columns, keeping the remaining ones.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that were already removed.
df = df.drop(columns=["link", "authors", "date"], errors="ignore")

In [None]:
# Tabulate category frequencies as a two-column frame:
# the category label and the number of rows carrying it.
df_cat = (
    df['category']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

# Print the per-category table
print(df_cat)


          category  count
0         BUSINESS   2000
1            CRIME   2000
2    ENTERTAINMENT   2000
3         POLITICS   2000
4          SCIENCE   2000
5           SPORTS   2000
6   STYLE & BEAUTY   2000
7             TECH   2000
8           TRAVEL   2000
9         WELLNESS   2000
10      WORLD NEWS   2000


In [None]:
# Tally the NaN entries found in every column
missing_values_count = df.isna().sum()

# Report the per-column totals under a heading
print("Missing Values Count per Column:", missing_values_count, sep="\n")



Missing Values Count per Column:
headline                1
category                0
short_description    1838
dtype: int64


In [None]:
# A missing headline borrows the short description from the same row
df['headline'] = df['headline'].fillna(df['short_description'])

# A missing short description borrows the (possibly just-filled) headline
df['short_description'] = df['short_description'].mask(
    df['short_description'].isna(), df['headline']
)

# Recount NaNs per column; a row lacking BOTH fields has nothing to borrow
# from, so it is still missing after this step.
missing_values_after_handling = df.isna().sum()

# Report the remaining gaps
print("Missing Values Count After Handling:", missing_values_after_handling, sep="\n")


Missing Values Count After Handling:
headline             1
category             0
short_description    1
dtype: int64


In [None]:
# Discard every row that still has a NaN in 'headline' or 'short_description'
df = df.dropna(axis=0, how='any', subset=['headline', 'short_description'])

# Recount NaNs per column to confirm nothing is missing any more
missing_values_after_handling = df.isna().sum()

# Report the final tallies
print("Missing Values Count After Handling:", missing_values_after_handling, sep="\n")


Missing Values Count After Handling:
headline             0
category             0
short_description    0
dtype: int64


In [None]:
import pandas as pd
import re
from transformers import BertTokenizer

# Load the BERT tokenizer for the "bert-base-uncased" checkpoint.
# NOTE(review): from_pretrained presumably downloads/caches the vocabulary on
# first use, so this cell needs network access on a fresh machine — confirm.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to clean text while keeping digits and hyphens
def clean_text(text):
    """Normalize one headline/description value before tokenization.

    Steps, in order: strip HTML tags, decode HTML entities, remove
    punctuation (word characters, whitespace and hyphens are kept, so digits
    and hyphenated words survive), and lowercase. No stopword removal is done.

    Args:
        text: Raw cell value. Missing values (None/NaN) are accepted;
            non-string scalars are converted with ``str`` first.

    Returns:
        The cleaned, lowercased string, or "" when the input is missing.
    """
    import html  # function-scope import keeps this helper self-contained

    if pd.isna(text):
        return ""

    # A non-string scalar (e.g. a numeric cell) would make re.sub raise TypeError
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Decode entities AFTER tag removal so escaped markup such as "&lt;b&gt;"
    # is not mistaken for a real tag; without this step "&amp;" would be left
    # behind as the junk word "amp" once punctuation is stripped.
    text = html.unescape(text)

    # Remove punctuation except for hyphens (keep digits and hyphenated words)
    text = re.sub(r'[^\w\s\-]', '', text)

    # Lowercase last and return
    return text.lower()

# Run clean_text over each raw text column, element by element
df['cleaned_headline'] = df['headline'].map(clean_text)
df['cleaned_short_description'] = df['short_description'].map(clean_text)

# Join the two cleaned columns row-wise, separated by a single space
df['combined_text'] = df['cleaned_headline'].str.cat(df['cleaned_short_description'], sep=" ")

# Step 1: Tokenize without truncation or padding to find the true length of each tokenized sequence
df['input_ids'] = df['combined_text'].apply(lambda x: tokenizer.encode(x, truncation=False, padding=False))

# Step 2: Find the maximum token length in the dataset
longest_sequence = int(df['input_ids'].apply(len).max())
print("Maximum token length in the dataset:", longest_sequence)

# Never pad/truncate to more positions than the tokenizer's model accepts.
# Taking the length straight from the data would let a single over-long text
# produce input_ids the model cannot consume and bloat every padded row.
# NOTE(review): the limit should be 512 for bert-base-uncased — confirm.
max_length = min(longest_sequence, tokenizer.model_max_length)
if max_length < longest_sequence:
    print("Capping max_length at the tokenizer limit:", max_length)

# Step 3: Re-tokenize with truncation and padding to the determined max_length
df['input_ids'] = df['combined_text'].apply(lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=max_length))

# Display a sample of the cleaned and tokenized data
print(df[['headline', 'cleaned_headline', 'short_description', 'cleaned_short_description', 'combined_text', 'input_ids']].head())

Maximum token length in the dataset: 284
                                            headline  \
0                  How to Manage Your Personal Brand   
1  It Looks Like Uber's Winning Its War With New ...   
2      The Progressive Promise of Today's Technology   
3   Don't Let These 5 Confusing Words Mar Your Image   
4        What You Don't Know About Overnight Success   

                                    cleaned_headline  \
0                  how to manage your personal brand   
1  it looks like ubers winning its war with new york   
2       the progressive promise of todays technology   
3    dont let these 5 confusing words mar your image   
4         what you dont know about overnight success   

                                   short_description  \
0  Make no mistake: If you have a Facebook accoun...   
1                                  Grab the popcorn.   
2  A digital policy for the new century, tailored...   
3  Tom's an articulate physician, totally able to...   
4  I'