In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load your final preprocessed DataFrame
# Later cells read the 'clean_text', 'title' and 'url' columns from this frame.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only load
# pickle files that you (or a trusted pipeline) produced.
import pandas as pd
df = pd.read_pickle('final_nlp_data.pkl')  # Adjust path as needed

In [None]:
# Vectorise the cleaned article text with TF-IDF: English stop words are
# removed and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# Row i of tfidf_matrix corresponds to the i-th row of df *by position*.
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

In [None]:
# Fit Nearest Neighbors
# Brute-force search using cosine distance over the TF-IDF rows; the fitted
# model is queried by recommend_articles_nn below.
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

In [None]:
# Recommend Function
def recommend_articles_nn(title, df=df, top_n=10):
    """Return the articles most similar to the one called ``title``.

    Similarity is cosine distance between TF-IDF rows, looked up through the
    module-level ``nn`` model and ``tfidf_matrix``.

    Parameters
    ----------
    title : str
        Exact title of the query article.
    df : pandas.DataFrame
        Frame holding ``'title'`` and ``'url'`` columns. Its rows must be in
        the same order as the rows of ``tfidf_matrix``. The default is bound
        when this cell runs, so re-run the cell after replacing ``df``.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``['title', 'url']`` rows of up to ``top_n`` nearest articles,
        nearest first, or the message ``"'<title>' not found."`` when no row
        has that title.
    """
    title_mask = (df['title'] == title).to_numpy()
    if not title_mask.any():
        return f"'{title}' not found."

    # Use the *position* of the first match, not its index label:
    # tfidf_matrix and .iloc are both positional, and the two only coincide
    # when df has a default RangeIndex.
    query_pos = int(title_mask.argmax())
    query_vector = tfidf_matrix[query_pos]

    # Ask for one extra neighbour to absorb the self-match, but never more
    # than the number of fitted rows (kneighbors raises otherwise).
    n_neighbors = min(top_n + 1, tfidf_matrix.shape[0])
    _, indices = nn.kneighbors(query_vector, n_neighbors=n_neighbors)

    # Drop the query row explicitly: with duplicate documents (distance 0)
    # the self-match is not guaranteed to be the first result.
    similar_positions = [pos for pos in indices.flatten() if pos != query_pos][:top_n]

    return df[['title', 'url']].iloc[similar_positions]

In [None]:
# Example query. recommend_articles_nn returns a DataFrame of matches, or a
# "not found" message string when the title is absent; print() shows either.
recommendations = recommend_articles_nn("Mental Note Vol. 24")
print(recommendations)

### Next-Word Prediction

In [None]:
# Sample up to 10,000 documents with a fixed seed so the subset is reproducible.
# n is capped at len(df) because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling is without replacement by default).
sampled_df = df.sample(n=min(10000, len(df)), random_state=42)

In [None]:
from nltk.tokenize import sent_tokenize

# Split every sampled document into sentences and flatten the result into a
# single list, preserving document order.
all_sentences = [
    sentence
    for document in sampled_df['clean_text']
    for sentence in sent_tokenize(document)
]

In [None]:
# NOTE(review): `tf` is not referenced by name in the cells visible here.
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer-id mapping from every extracted sentence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)

# Vocabulary size used for the Embedding input and the softmax output layer.
# +1 because word ids start at 1; id 0 is left for the padding value.
total_words = len(tokenizer.word_index) + 1

In [None]:
# Keep only the first 5000 sentences for n-gram generation (presumably to
# bound the size of the training set).
# NOTE(review): the tokenizer was fit on all of all_sentences, so total_words
# still counts words from sentences that are not used for training.
sampled_sentences = all_sentences[:5000]

In [None]:
from tqdm import tqdm

# Every sentence contributes all of its prefixes of length >= 2:
# [w1, w2], [w1, w2, w3], ... Each prefix later becomes one training sample
# whose last token is the label.
input_sequences = []

for sentence in tqdm(sampled_sentences, desc="Generating n-gram sequences"):
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [None]:
# Save to file
# gc, os and pickle are imported here because no earlier cell imports them;
# without this the cell fails with NameError on a fresh kernel.
import gc
import os
import pickle

# Create the output directory on first run so open() does not fail.
os.makedirs('intermediate_data', exist_ok=True)
with open('intermediate_data/input_sequences.pkl', 'wb') as f:
    pickle.dump(input_sequences, f)


# Free memory
# The sequences are reloaded from disk in the next cell.
del input_sequences
gc.collect()

In [None]:
# Reload the n-gram sequences written by the previous cell (a pickled Python
# list of token-id lists).
input_sequences = pd.read_pickle('intermediate_data/input_sequences.pkl')

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

# Left-pad every n-gram to the length of the longest one so they stack into a
# single 2-D integer array.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# All tokens but the last are the model input; the last token is the label,
# one-hot encoded over the whole vocabulary.
# NOTE(review): y has total_words columns per sample, which can be very large
# in memory; sparse integer labels would avoid that but need a matching loss.
X = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Next-word model: 100-d embeddings -> two stacked LSTMs -> softmax over the
# vocabulary. The first LSTM returns the full sequence so the second LSTM can
# consume it.
model = Sequential([
    Embedding(total_words, 100, input_length=max_seq_len-1),
    LSTM(150, return_sequences=True),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

# categorical_crossentropy matches the one-hot encoded labels in y.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 128 samples.
# NOTE(review): no random seed is set in the visible cells, so training
# results may differ between runs.
model.fit(X, y, epochs=10, verbose=1, batch_size=128)

In [None]:
def generate_next_words(seed_text, next_words=20):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the module-level ``tokenizer``, ``model``, ``pad_sequences`` and
    ``max_seq_len``. At each step the current text is tokenised, left-padded
    or truncated to ``max_seq_len - 1`` tokens, and the highest-probability
    word is appended.

    Parameters
    ----------
    seed_text : str
        Text to start from.
    next_words : int
        Maximum number of words to append.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an id with no entry in
        ``tokenizer.index_word`` (for example the padding id 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # The batch holds a single sequence, so take the argmax of its row.
        predicted_id = int(np.argmax(predicted[0]))

        # Id 0 is the padding value and has no word; a plain [] lookup would
        # raise KeyError, so stop generating instead.
        output_word = tokenizer.index_word.get(predicted_id)
        if output_word is None:
            break

        seed_text += " " + output_word
    return seed_text

In [None]:
generate_next_words("Hello Everyone")