In [1]:
import mysql.connector
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
def preprocess_text(text):
    if not isinstance(text, str):  # Check if text is not a string
        return ""  # Return an empty string if the text is not a string or is None
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))  # Define stopwords
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()  # Initialize the Lemmatizer
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize tokens
    return ' '.join(tokens)  # Re-join tokens into a string

In [3]:
def fetch_articles():
    db_config = {
        'host': 'localhost',
        'user': 'root',
        'password': 'Madgame@1',  # Replace with your actual password
        'database': 'bionexus_db_2'  # Ensure this matches the exact name of your database
    }
    
    # Connect to the database
    conn = mysql.connector.connect(**db_config)
    query = "SELECT id, title, abstract, link FROM articles;"  # Adjust SQL query as needed
    df = pd.read_sql(query, conn)
    conn.close()
    return df

In [4]:
def recommend_articles(input_word, articles_df, top_n=5):
    articles_df['processed_abstract'] = articles_df['abstract'].apply(preprocess_text)  # Preprocess texts
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(articles_df['processed_abstract'])
    input_vec = vectorizer.transform([input_word])
    similarities = cosine_similarity(input_vec, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[-top_n:][::-1]
    recommended_articles = articles_df.iloc[top_indices]
    return recommended_articles[['id', 'title', 'abstract', 'link']]

In [5]:
if __name__ == "__main__":
    df_articles = fetch_articles()
    user_input = input("Enter a word to find related articles: ")
    recommendations = recommend_articles(user_input, df_articles, top_n=5)
    print("Recommended articles based on your input:")
    print(recommendations)

  df = pd.read_sql(query, conn)


Recommended articles based on your input:
      id                                              title  \
273  274        Medicolegal and insurance issues regarding    
173  174        Medicolegal and insurance issues regarding    
73    74        Medicolegal and insurance issues regarding    
373  374        Medicolegal and insurance issues regarding    
461  462  Overexpression of WT1 in all molecular subtype...   

                                              abstract  \
273  Hereditary breast and ovarian cancer syndrome ...   
173  Hereditary breast and ovarian cancer syndrome ...   
73   Hereditary breast and ovarian cancer syndrome ...   
373  Hereditary breast and ovarian cancer syndrome ...   
461  Breast cancer is a highly heterogeneous solid ...   

                                          link  
273  https://pubmed.ncbi.nlm.nih.gov/38642925/  
173  https://pubmed.ncbi.nlm.nih.gov/38642925/  
73   https://pubmed.ncbi.nlm.nih.gov/38642925/  
373  https://pubmed.ncbi.nlm.nih.g