<a href="https://colab.research.google.com/github/shruthimohan03/video-summarizer/blob/main/GMM_for_Extractive_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
import re

In [43]:
# Step 1: Load and preprocess the text file
def load_and_preprocess(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Split text into sentences based on periods or question marks
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', content)
    # Clean sentences (keep full stops, remove other punctuations, and strip extra spaces)
    sentences = [
        re.sub(r'[^a-zA-Z0-9\s\.]', '', sentence).strip() for sentence in sentences if sentence.strip()
    ]
    return sentences

In [44]:
# Step 2: Preprocess and vectorize sentences
def preprocess_and_vectorize(sentences):
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    sentence_vectors = tfidf_vectorizer.fit_transform(sentences).toarray()
    return sentence_vectors

In [45]:
# Step 3: Fit GMM
def fit_gmm(sentence_vectors, n_clusters):
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(sentence_vectors)
    labels = gmm.predict(sentence_vectors)
    return labels

In [46]:
# Step 4: Extract representative sentences
def extract_summary(sentences, sentence_vectors, labels):
    summary = []
    unique_labels = np.unique(labels)
    for label in unique_labels:
        # Get indices of sentences in the current cluster
        cluster_indices = np.where(labels == label)[0]
        # Find the most central sentence in the cluster
        cluster_center = np.mean(sentence_vectors[cluster_indices], axis=0)
        central_index = cluster_indices[np.argmax(cosine_similarity([cluster_center], sentence_vectors[cluster_indices])[0])]
        summary.append(sentences[central_index])
    return ''.join(summary)

In [47]:
# Example
file_path = 'computer_lecture.txt'
sentences = load_and_preprocess(file_path)
sentence_vectors = preprocess_and_vectorize(sentences)

n_clusters = 6  # no of optimal is 6 for this dataset as found using elbow method
labels = fit_gmm(sentence_vectors, n_clusters)
summary = extract_summary(sentences, sentence_vectors, labels)

In [48]:
# Save the summarized text to a file
with open("extractive_summarization_gmm_centroid_method.txt", "w") as file:
    file.write(summary)

print("Summarization completed.")

Summarization completed.


In [49]:
summary

'One of the significant milestones in computer history was the invention of the internet.In conclusion computers are much more than machines they are catalysts of progress and innovation.The internet transformed computers from standalone devices into interconnected tools of communication and information exchange.Innovations such as quantum computing and advanced artificial intelligence promise to redefine our understanding of computation and problemsolving.However the widespread use of computers is not without challenges.At their core computers operate by processing data using binary logic.'