In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Training split of 20 Newsgroups; headers, footers and quoted replies are stripped.
train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Number of posts in the training split.
len(train.data)

11314

In [None]:
# Newsgroup labels; the position in this list is the integer class id used in `train.target`.
train['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Count how many training posts fall into each newsgroup.
targets, frequency = np.unique(train.target, return_counts=True)
targets_str = np.array(train.target_names)
# Look the names up by the class ids that were actually observed, so every count
# stays paired with its own label even if some class has no posts in this split
# (pairing the full name list positionally would silently shift the labels).
print(list(zip(targets_str[targets], frequency)))

[('alt.atheism', 480), ('comp.graphics', 584), ('comp.os.ms-windows.misc', 591), ('comp.sys.ibm.pc.hardware', 590), ('comp.sys.mac.hardware', 578), ('comp.windows.x', 593), ('misc.forsale', 585), ('rec.autos', 594), ('rec.motorcycles', 598), ('rec.sport.baseball', 597), ('rec.sport.hockey', 600), ('sci.crypt', 595), ('sci.electronics', 591), ('sci.med', 594), ('sci.space', 593), ('soc.religion.christian', 599), ('talk.politics.guns', 546), ('talk.politics.mideast', 564), ('talk.politics.misc', 465), ('talk.religion.misc', 377)]


In [None]:
# Test split of 20 Newsgroups; headers, footers and quoted replies are stripped.
test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Number of posts in the test split.
len(test.data)

7532

In [None]:
# Count how many test posts fall into each newsgroup.
targets_test, frequency_test = np.unique(test.target, return_counts=True)
targets_str_test = np.array(test.target_names)
# Look the names up by the class ids that were actually observed, so every count
# stays paired with its own label even if some class has no posts in this split
# (pairing the full name list positionally would silently shift the labels).
print(list(zip(targets_str_test[targets_test], frequency_test)))

[('alt.atheism', 319), ('comp.graphics', 389), ('comp.os.ms-windows.misc', 394), ('comp.sys.ibm.pc.hardware', 392), ('comp.sys.mac.hardware', 385), ('comp.windows.x', 395), ('misc.forsale', 390), ('rec.autos', 396), ('rec.motorcycles', 398), ('rec.sport.baseball', 397), ('rec.sport.hockey', 399), ('sci.crypt', 396), ('sci.electronics', 393), ('sci.med', 396), ('sci.space', 394), ('soc.religion.christian', 398), ('talk.politics.guns', 364), ('talk.politics.mideast', 376), ('talk.politics.misc', 310), ('talk.religion.misc', 251)]


In [None]:
# Tabular view of the training split: post text next to its integer class id.
df = pd.DataFrame(dict(data=train.data, target=train.target))
df.head(5)

Unnamed: 0,data,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import nltk
# NLTK resources used by the tokenizer, the stop-word filter and the lemmatizer below.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from rouge import Rouge
from sklearn.model_selection import train_test_split

def preprocess_text(text, lemmatizer, stop_words):
    """Tokenize ``text``, drop stop words and non-alphanumeric tokens, then lemmatize.

    Args:
        text: Raw document string.
        lemmatizer: Object exposing ``lemmatize(word)`` (a ``WordNetLemmatizer`` here).
        stop_words: Collection of stop words; tokens are case-folded before the lookup.

    Returns:
        The surviving, lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    kept = [tok for tok in tokens if tok.casefold() not in stop_words and tok.isalnum()]
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)


def rank_documents(tfidf_matrix, documents):
    """Rank ``documents`` with PageRank over their cosine-similarity graph.

    Args:
        tfidf_matrix: TF-IDF matrix with one row per document, in the same
            order as ``documents``.
        documents: Iterable of document strings.

    Returns:
        Tuple ``(similarity, graph, scores, ranked)`` where ``similarity`` is the
        pairwise cosine-similarity matrix, ``graph`` the weighted graph built from
        it, ``scores`` the PageRank value per row index, and ``ranked`` a list of
        ``(score, document)`` pairs sorted from highest to lowest score.
    """
    similarity = cosine_similarity(tfidf_matrix)
    graph = nx.from_numpy_array(similarity)
    # The diagonal of a cosine-similarity matrix puts a self-loop on every non-empty
    # document. TextRank only links *distinct* items; keeping the loops lets a
    # document vote for itself and favours ones that resemble little else.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))
    scores = nx.pagerank(graph)
    ranked = sorted(((scores[i], doc) for i, doc in enumerate(documents)), reverse=True)
    return similarity, graph, scores, ranked


# Fetching a four-category subset of the 20newsgroup training split.
# NOTE(review): unlike the earlier fetch_20newsgroups calls in this notebook, headers,
# footers and quotes are NOT removed here, so header tokens take part in TF-IDF --
# confirm this is intended.
newsgroups_train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], shuffle=True, random_state=42)

# Preprocessing the dataset
df = pd.DataFrame({'text': newsgroups_train.data, 'target': newsgroups_train.target})
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# The 'summary' column holds the normalized (filtered + lemmatized) text of each
# post; despite its name it is not a condensed version of the post.
summaries = [preprocess_text(text, lemmatizer, stop_words) for text in df['text']]
df['summary'] = summaries

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['summary'], test_size=0.2, random_state=42)

# TF-IDF over unigrams and bigrams, keeping only terms found in 20%-80% of the
# training documents; the vocabulary is fitted on the training split only.
tfidf = TfidfVectorizer(max_df=0.8, min_df=0.2, ngram_range=(1, 2))
tfidf_matrix_train = tfidf.fit_transform(X_train)

# Upper bound on how many top-ranked texts go into each summary. The ranked units
# are whole posts, not individual sentences.
num_sentences_to_extract = 10  # Adjust as needed

# TextRank on the training set
similarity_matrix_train, nx_graph_train, scores_train, ranked_sentences_train = rank_documents(tfidf_matrix_train, X_train)
# Slicing (instead of indexing 0..N-1) keeps this working when the split holds
# fewer than `num_sentences_to_extract` texts.
top_sentences_train = [doc for _, doc in ranked_sentences_train[:num_sentences_to_extract]]

# Generating the summary for the training set
summary_train = ' '.join(top_sentences_train)

# TF-IDF on the testing set, reusing the vocabulary fitted on the training set
tfidf_matrix_test = tfidf.transform(X_test)

# TextRank on the testing set
similarity_matrix_test, nx_graph_test, scores_test, ranked_sentences_test = rank_documents(tfidf_matrix_test, X_test)
top_sentences_test = [doc for _, doc in ranked_sentences_test[:num_sentences_to_extract]]

# Generating the summary for the testing set
summary_test = ' '.join(top_sentences_test)

# Evaluating the model performance
# NOTE(review): each reference is the normalized text of only the FIRST post in its
# split, while each hypothesis concatenates up to `num_sentences_to_extract` full
# posts. No reference summaries are loaded in this cell, so treat these ROUGE numbers
# as a smoke test rather than a quality measure -- confirm the intended references.
rouge = Rouge()
scores = rouge.get_scores([summary_train, summary_test], [y_train.iloc[0], y_test.iloc[0]], avg=True)
print("ROUGE scores:", scores)

ROUGE scores: {'rouge-1': {'r': 0.7069696843681125, 'p': 0.019309401461165523, 'f': 0.03759184973198946}, 'rouge-2': {'r': 0.15338259755133204, 'p': 0.0017729936039811533, 'f': 0.0035054090256710717}, 'rouge-l': {'r': 0.6997754397637959, 'p': 0.01911664895538295, 'f': 0.03721640389005216}}


In [None]:
# Display the extractive summary assembled from the top-ranked training texts.
print("Training set summary:", summary_train)

In [None]:
# Display the extractive summary assembled from the top-ranked test texts.
print("Testing set summary:", summary_test)