## Import Libraries

In [2]:
import os
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score
from nltk.tokenize import sent_tokenize
from imblearn.over_sampling import SMOTE
import chardet

In [3]:
import sys
!{sys.executable} -m pip install --user sentence-transformers





In [4]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast


##  Detect file encoding 

In [6]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)  # Read first 10,000 bytes
    result = chardet.detect(raw_data)
    return result['encoding']

def read_file_with_encoding(file_path):
    encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        return f.read()

## Read Articles and their Summaries

In [8]:
# Specify dataset paths
articles_dir = r"C:\Users\prabh\OneDrive\Desktop\text summarizer2\News Articles"
summaries_dir = r"C:\Users\prabh\OneDrive\Desktop\text summarizer2\Summaries"

# Load articles and summaries
data = []
for category in os.listdir(articles_dir):
    category_path = os.path.join(articles_dir, category)
    for file_name in os.listdir(category_path):
        article_path = os.path.join(category_path, file_name)
        summary_path = os.path.join(summaries_dir, category, file_name)

        article = read_file_with_encoding(article_path)
        summary = read_file_with_encoding(summary_path)

        if article and summary:
            data.append({'category': category, 'article': article, 'summary': summary})

df = pd.DataFrame(data)

## Labelling Sentences as 1/0

In [10]:
# Sentence-Level Data Preparation
def prepare_sentence_level_data(df):
    sentence_data = []
    for _, row in df.iterrows():
        sentences = sent_tokenize(row['article'])
        summary_sentences = set(sent_tokenize(row['summary']))
        for sentence in sentences:
            label = 1 if sentence in summary_sentences else 0
            sentence_data.append({'sentence': sentence, 'label': label})
    return pd.DataFrame(sentence_data)

sentence_df = prepare_sentence_level_data(df)

## Vectorization

In [28]:

from sentence_transformers import SentenceTransformer
print(1)
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print(1)

X = embedder.encode(sentence_df['sentence'], show_progress_bar=False)
print(1)
y = sentence_df['label']



1
1
1


## Train-Test Split

In [21]:
# Train-Test Split (changed to 70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the dataset (only on the training data)
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

## Train Models

In [45]:
# Initialize Models with Class Weights
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
svm_model = LinearSVC(class_weight='balanced',dual=False)


# Train models on the resampled dataset
print("Training models with class weights and SMOTE...")
lr_model.fit(X_resampled, y_resampled)
svm_model.fit(X_resampled, y_resampled)


Training models with class weights and SMOTE...


## Model Evaluation

In [47]:
# Model Evaluation
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}")
    return f1, accuracy

# Re-evaluate Models with the 70-30 split and select the best model based on F1 score
lr_f1, lr_accuracy = evaluate_model(lr_model, X_test, y_test, "Logistic Regression with Class Weights")
svm_f1, svm_accuracy = evaluate_model(svm_model, X_test, y_test, "SVM with Class Weights")


Logistic Regression with Class Weights Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.85      0.91     12226
           1       0.06      0.59      0.11       200

    accuracy                           0.84     12426
   macro avg       0.53      0.72      0.51     12426
weighted avg       0.98      0.84      0.90     12426

Accuracy: 0.8433
SVM with Class Weights Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.85      0.92     12226
           1       0.06      0.56      0.11       200

    accuracy                           0.85     12426
   macro avg       0.53      0.71      0.51     12426
weighted avg       0.98      0.85      0.90     12426

Accuracy: 0.8482


## Comparing Models

In [49]:
# Compare F1 scores and return the best model
f1_scores = {'Logistic Regression': lr_f1, 'SVM': svm_f1}
best_model_name = max(f1_scores, key=f1_scores.get)
best_model = None
best_accuracy = 0

if best_model_name == 'Logistic Regression':
    best_model = lr_model
    best_accuracy = lr_accuracy
elif best_model_name == 'SVM':
    best_model = svm_model
    best_accuracy = svm_accuracy


print(f"The best model based on F1-score is: {best_model_name}")
print(f"Best Model Accuracy: {best_accuracy:.4f}")

The best model based on F1-score is: Logistic Regression
Best Model Accuracy: 0.8433


## Summarization Function

In [57]:
def summarize_new_article(article, model, embedder, top_k=3):
    sentences = sent_tokenize(article)
    sentence_features = embedder.encode(sentences)
    
    # Get probabilities from model (if using LogisticRegression)
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(sentence_features)[:, 1]
        print("Probabilities:", probs)
        
        # Get top_k sentence indexes
        top_indices = probs.argsort()[-top_k:][::-1]  # descending
        summary_sentences = [sentences[i] for i in sorted(top_indices)]
    else:
        # fallback to binary predictions
        predictions = model.predict(sentence_features)
        summary_sentences = [s for s, p in zip(sentences, predictions) if p == 1]
        if not summary_sentences:
            summary_sentences = sentences[:top_k]

    return " ".join(summary_sentences)




## Test the Best Model

In [59]:
# Test the Best Model with Debugging
new_article = """Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding.

Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. "Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility," chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.

TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.

"""

summary = summarize_new_article(new_article, best_model, embedder)
print(f"Generated Summary Length: {len(summary)}")
print(f"Generated Summary: {summary}")

Probabilities: [0.04321244 0.05307496 0.16724177 0.01602757 0.09802657 0.01262532
 0.03885501 0.0604981  0.04533042 0.03436579 0.11575885 0.05957801
 0.00901181 0.0508742  0.03260606 0.03649188 0.00162508 0.02066183
 0.23707294 0.06594538]
Generated Summary Length: 331
Generated Summary: TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Time Warner's fourth quarter profits were slightly better than analysts' expectations. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue.
