<a href="https://colab.research.google.com/github/selete-tetteh/News-Recommendation-System/blob/main/News_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
nltk.download('stopwords')

In [None]:
# Read the news CSV, keeping only the four columns used further down
news_data = pd.read_csv(
    '/content/result_final.csv',
    usecols=['date', 'title', 'text', 'link'],
)

# Remove, in place, every row with a missing value in any of those columns
news_data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Define text preprocessing functions
def make_lowercase(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Cache for the English stopword set. It is filled on the first call to
# remove_stopwords() so the list is fetched from NLTK and converted to a set
# once, instead of once per document when the function is used with .apply().
_ENGLISH_STOPWORDS = None


def remove_stopwords(text):
    """Drop English stopwords and non-alphabetic tokens from ``text``.

    The text is split on whitespace. A token is kept only if it is not in
    NLTK's English stopword list (exact, case-sensitive match) and consists
    solely of alphabetic characters. Tokens that contain digits or
    punctuation (e.g. ``"2020"`` or ``"economy,"``) are therefore removed.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by single spaces ("" if nothing is kept).
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    kept = [
        word for word in text.split()
        if word not in _ENGLISH_STOPWORDS and word.isalpha()
    ]
    return " ".join(kept)

# Compiled once instead of building a new tokenizer for every document.
# For str input, findall() with this pattern yields the same tokens as
# RegexpTokenizer(r'\w+').tokenize().
_WORD_RUN = re.compile(r'\w+')


def remove_punctuation_marks(text):
    r"""Replace every run of non-word characters in ``text`` with one space.

    Word characters are whatever ``\w`` matches: letters, digits and the
    underscore. Everything else (punctuation, symbols, whitespace) acts as a
    separator, so ``"don't stop!"`` becomes ``"don t stop"``.

    Args:
        text: The string to clean.

    Returns:
        The word-character runs of ``text`` joined by single spaces
        ("" if there are none).
    """
    return " ".join(_WORD_RUN.findall(text))

def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy and does not cross line breaks, so a tag whose
    opening ``<`` and closing ``>`` sit on different lines is left untouched.
    Removed spans are replaced with nothing (no space is inserted).
    """
    return re.sub('<.*?>', r'', text)

In [None]:
# Drop duplicate rows first. 'cleaned_desc' is derived purely from 'text', so
# deduplicating before it exists removes the same rows and avoids cleaning
# rows that would be thrown away anyway.
news_data.drop_duplicates(subset=None, keep='first', inplace=True)

# Build the cleaned text column. The order of the steps matters:
#   1. lowercase, so stopword matching (which is case-sensitive) works;
#   2. strip HTML tags while their angle brackets are still present;
#   3. strip punctuation, so "economy," becomes "economy";
#   4. remove stopwords last: its isalpha() filter discards any token that
#      still contains punctuation, so it must only see clean tokens.
news_data['cleaned_desc'] = (
    news_data['text']
    .apply(make_lowercase)
    .apply(remove_html_tags)
    .apply(remove_punctuation_marks)
    .apply(remove_stopwords)
)

# dropna/drop_duplicates leave gaps in the index. Renumber it so index labels
# equal row positions, which are also the row numbers of any matrix built
# from this frame.
news_data.reset_index(drop=True, inplace=True)

# Add ID column (0..n-1, identical to the row position)
news_data.insert(0, 'id', range(news_data.shape[0]))

In [None]:
def preprocess_data(df):
    """Vectorise the ``cleaned_desc`` column of ``df`` with TF-IDF.

    Word-level 1- to 3-grams are used, English stop words are removed by the
    vectoriser, and terms appearing in more than 80% of the documents are
    ignored (``max_df=0.8``).

    Args:
        df: DataFrame holding a ``cleaned_desc`` column of text.

    Returns:
        A ``(matrix, feature_names)`` tuple: the result of ``fit_transform``
        on the column and the fitted vectoriser's feature names.
    """
    vectorizer = TfidfVectorizer(
        analyzer='word',
        stop_words='english',
        max_df=0.8,
        min_df=0.0,
        use_idf=True,
        ngram_range=(1, 3),
    )
    matrix = vectorizer.fit_transform(df['cleaned_desc'])
    vocabulary = vectorizer.get_feature_names_out()
    return matrix, vocabulary

def cluster_articles(tfidf_matrix, num_clusters, random_state=None):
    """Assign every row of ``tfidf_matrix`` to one of ``num_clusters`` k-means clusters.

    Args:
        tfidf_matrix: Feature matrix with one row per article.
        num_clusters: Number of clusters to fit.
        random_state: Optional seed forwarded to ``KMeans``. The default
            ``None`` leaves the initialisation unseeded, as before, so the
            labels may differ between runs; pass an int for reproducible
            clusters.

    Returns:
        A list of cluster labels, one per row of ``tfidf_matrix``, in row order.
    """
    km = KMeans(n_clusters=num_clusters, random_state=random_state)
    km.fit(tfidf_matrix)
    return km.labels_.tolist()

def recommend_similar_articles(df, article_id, num_articles, tfidf_matrix, clusters):
    """Print the articles most similar to the one identified by ``article_id``.

    Similarity is the cosine similarity between the article's row of
    ``tfidf_matrix`` and every other row. The article itself is never
    recommended.

    Args:
        df: DataFrame with ``id``, ``title`` and ``link`` columns whose rows
            are in the same order as the rows of ``tfidf_matrix``.
        article_id: Value of the ``id`` column for the article that was read.
        num_articles: Maximum number of recommendations to print.
        tfidf_matrix: Feature matrix with one row per row of ``df``.
        clusters: Unused; kept so existing calls keep working.

    Returns:
        None. The recommendations are written to standard output.

    Raises:
        IndexError: If no row of ``df`` has the requested ``article_id``.
    """
    # Find the row *position*, not the index label: the matrix and .iloc are
    # positional, while df's index may have gaps after rows were dropped.
    matches = (df['id'] == article_id).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"No article with id {article_id!r} found in df")
    position = int(matches[0])

    scores = cosine_similarity(tfidf_matrix[position], tfidf_matrix)[0]

    # Exclude the query article explicitly instead of assuming it sorts first;
    # an article with identical text ties at the top score and could otherwise
    # push the query article itself into the recommendations.
    ranked = sorted(
        ((row, score) for row, score in enumerate(scores) if row != position),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(num_articles, 0)]

    titles = df['title']
    links = df['link']
    print("Article Read --", titles.iloc[position], "link:", links.iloc[position])
    print(" ---------------------------------------------------------- ")
    for rank, (row, score) in enumerate(ranked, start=1):
        print(f"Recommendation {rank}: {titles.iloc[row]} || Link --{links.iloc[row]} (score: {score:.2f})")


In [None]:
# Shorter working name for the cleaned frame; this binds the same object as
# news_data (no copy is made), so changes through either name affect both.
df = news_data

In [None]:
# Vectorise the cleaned text: keep the TF-IDF matrix and the feature names
tfidf_matrix, feature_names = preprocess_data(df=df)

In [None]:
# Group the articles into a fixed number of k-means clusters
num_clusters = 5
clusters = cluster_articles(tfidf_matrix=tfidf_matrix, num_clusters=num_clusters)

In [None]:
# Print the articles most similar to one chosen article
article_id, num_articles = 20, 15
recommend_similar_articles(
    df=df,
    article_id=article_id,
    num_articles=num_articles,
    tfidf_matrix=tfidf_matrix,
    clusters=clusters,
)