In [1]:
import zipfile
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Importing the Data

In [2]:
# Unzip the dataset
zip_file_path = '/content/bbc-fulltext.zip'
extracted_folder_path = '/content/bbc-fulltext'

In [3]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

### Reading the Dataset

In [4]:
# Read the dataset
articles = []
file_paths = []

In [5]:
# Assuming the structure has subfolders for different categories
for root, dirs, files in os.walk(extracted_folder_path):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='latin1') as f:
                articles.append(f.read())
                file_paths.append(file_path)

# Creating DataFrame

In [6]:
# Create a DataFrame
df = pd.DataFrame({
    'file_path': file_paths,
    'content': articles
})

# Statement of Purpose

In [7]:
# The statement to compare against
statement = "Consuming alcohol and getting drunk occasionally is not a good advice."

# Calculating the Similarity

In [8]:
# Vectorize the content and the statement
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['content'].tolist() + [statement])

In [9]:
# Compute cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

# Determining the Top 5 Articles

In [10]:
# Get the top 5 most similar articles
top_5_indices = cosine_similarities.argsort()[-5:][::-1]

In [11]:
# Extract the top 5 articles
top_5_articles = df.iloc[top_5_indices]

In [12]:
# Print the top 5 articles' file paths and their similarity scores
for idx in top_5_indices:
    print(f"File: {df.iloc[idx]['file_path']}")
    print(f"Similarity Score: {cosine_similarities[idx]}")
    print(f"Content: {df.iloc[idx]['content'][:500]}...")  # Print the first 500 characters of the article
    print("\n---\n")

File: /content/bbc-fulltext/bbc/politics/019.txt
Similarity Score: 0.1785399835189493
Content: Drink remark 'acts as diversion'

The first minister's statement that it was okay to get drunk "once in a while" has diverted attention from the real issues, it has been claimed.

Jack Law, chief executive of Alcohol Focus Scotland, said Jack McConnell's comment was "ill-advised". The media attention had helped to move the focus from Scotland's binge drinking problems, Mr Law said. Scotsman journalist Hamish MacDonell said he believed the bigger picture had been "obscured" by the remark. Mr McCo...

---

File: /content/bbc-fulltext/bbc/politics/094.txt
Similarity Score: 0.15224649428838585
Content: McConnell in 'drunk' remark row

Scotland's first minister has told a group of high school pupils that it is okay to get drunk "once in a while".

Jack McConnell was speaking to more than 100 secondary pupils from schools in the Highlands about the problems of binge drinking and drink promotions. H

In [13]:
top_5_articles

Unnamed: 0,file_path,content
769,/content/bbc-fulltext/bbc/politics/019.txt,Drink remark 'acts as diversion'\n\nThe first ...
592,/content/bbc-fulltext/bbc/politics/094.txt,McConnell in 'drunk' remark row\n\nScotland's ...
845,/content/bbc-fulltext/bbc/politics/324.txt,Iraq advice claim sparks new row\n\nThe Tories...
705,/content/bbc-fulltext/bbc/politics/184.txt,Blair rejects Iraq advice calls\n\nTony Blair ...
601,/content/bbc-fulltext/bbc/politics/327.txt,Goldsmith denies war advice claim\n\nThe attor...
