In [35]:
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import os
import pandas as pd

In [57]:
# Fetch the NLTK data packages used by the cleaning step (tokenizer model,
# stopword list, WordNet), then load spaCy's small English pipeline.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
# Initialize the lemmatizer once; clean_text reuses it on every call
lemmatizer = WordNetLemmatizer()

# Build the English stopword set once instead of on every clean_text call
_STOP_WORDS = frozenset(stopwords.words('english'))


# Function to clean text
def clean_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Processing order: lowercase -> remove URLs -> strip every character that
    is not an ASCII letter, digit or whitespace -> tokenize -> drop English
    stopwords -> lemmatize -> join with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Missing scalars (None, NaN, pd.NA) produce an empty string;
        any other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # Cells read from a spreadsheet can be NaN/None; without this guard
    # .lower() raises AttributeError and aborts the whole .apply().
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove URLs. Substitute a space rather than a placeholder: a '[URL]'
    # marker would lose its brackets in the next step and survive as a
    # literal 'URL' token in the output.
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Remove non-alphanumeric characters (except whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize, drop stopwords and lemmatize in a single pass
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in _STOP_WORDS
    ]

    # Join tokens back into a cleaned text
    return ' '.join(tokens)

In [59]:
# Function to extract the leading noun keywords from text
def extract_keywords(text, max_keywords=8):
    """Return the first nouns / proper nouns of ``text`` as one string.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. A token
    is kept when its part-of-speech tag is ``NOUN`` or ``PROPN`` and spaCy
    does not flag it as a stop word. Tokens stay in document order; there is
    no ranking by frequency or importance.

    Parameters
    ----------
    text : str
        Text to analyse.
    max_keywords : int or None, default 8
        Maximum number of keywords to return; ``None`` returns all of them.

    Returns
    -------
    str
        The selected token texts joined by single spaces (possibly empty).
    """
    doc = nlp(text)
    keywords = [
        token.text
        for token in doc
        if token.pos_ in ("PROPN", "NOUN") and not token.is_stop
    ]
    return " ".join(keywords[:max_keywords])

In [60]:
# Paths are resolved from the current working directory (not necessarily the
# folder containing this notebook): the data lives two levels up in "output".
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, "..", "..", "output")

# Build both workbook paths, then read them in the same order as before.
file_path1, file_path2 = (
    os.path.join(output_dir, workbook_name)
    for workbook_name in ("finalized_data.xlsx", "final_preprocessed_data.xlsx")
)
df1 = pd.read_excel(file_path1)
df2 = pd.read_excel(file_path2)

In [61]:
# Left-join the Date column from df2 onto df1, matching rows on Message_ID
df3 = df1.merge(df2[['Message_ID', 'Date']], on='Message_ID', how='left')

In [62]:
# Columns to discard from the merged frame
cols = [
    'DOY_Sine', 'DOY_Cosine',
    'Month_Sine', 'Month_Cosine',
    'DayOfWeek_Sine', 'DayOfWeek_Cosine',
    'Channel_x',
    'Total_Comments', 'Comment_Count',
    'Text_Positive_Reactions', 'Text_Negative_Reactions',
    'Cluster_0', 'Cluster_1', 'Cluster_2',
    'Sentiment_negative', 'Sentiment_neutral', 'Sentiment_positive',
    'Sentiment_Comments_negative', 'Sentiment_Comments_neutral',
    'Sentiment_Comments_positive',
]
df3 = df3.drop(columns=cols)

# The source frames are no longer referenced; drop the names
del df1, df2

In [63]:
# Run clean_text over every 'Combined_text' value and store the result
df3['Cleaned_text'] = df3['Combined_text'].map(clean_text)

In [64]:
# The raw text column is superseded by 'Cleaned_text'
df3 = df3.drop(columns=['Combined_text'])

In [65]:
# Keep only rows whose cleaned text has at least 10 whitespace-separated
# words; the helper column exists only for the duration of the chain.
df3 = (
    df3
    .assign(word_count=df3["Cleaned_text"].str.split().str.len())
    .loc[lambda frame: frame["word_count"] >= 10]
    .drop(columns=["word_count"])
)

In [66]:
# Derive the search query from each cleaned text; str() guards against
# non-string cell values before they reach spaCy.
df3["Query_Text"] = df3["Cleaned_text"].map(lambda cleaned: extract_keywords(str(cleaned)))

In [67]:
# Number of rows remaining after filtering
len(df3.index)

1554

In [68]:
# Write the final frame to the output folder, omitting the index column
file_path3 = os.path.join(output_dir, "data_for_searching_news.xlsx")
df3.to_excel(excel_writer=file_path3, index=False)