In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical




In [None]:
import glob

In [None]:
# Load every CSV in the data folder and flatten each row into one to-do sentence.
path = 'cleandata/'  # specify your folder path

# Sort the matches: glob returns files in arbitrary, OS-dependent order, and the
# row order feeds straight into training, so sorting keeps runs reproducible.
all_files = sorted(glob.glob(path + "*.csv"))
if not all_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read each CSV file and concatenate them into one DataFrame
data = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Combine columns into a single text per row, skipping missing values.
# Every value is cast to str so a numeric column cannot break ' '.join().
todo_texts = data.apply(lambda row: ' '.join(map(str, row.dropna())), axis=1)

In [None]:
# Preview the combined per-row texts that will be fed to the tokenizer.
todo_texts

0           Call name to discuss project details at time
1                        Water the plants in the morning
2           Grocery shopping at the local market at time
3                   Cancel gym membership over the phone
4        Prepare presentation slides for Mondays meeting
                              ...                       
19325                      Implement new morning routine
19326                        Attend friend's art exhibit
19327                               Purchase houseplants
19328                           File important documents
19329                            Back up smartphone data
Length: 19330, dtype: object

In [None]:
# Tokenization: learn the vocabulary, then encode each text as word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(todo_texts)
sequences = tokenizer.texts_to_sequences(todo_texts)

# Create input sequences: every prefix of length >= 2 of every encoded text,
# so each sample ends with the word the model should learn to predict.
input_sequences = [
    encoded[:end]
    for encoded in sequences
    for end in range(2, len(encoded) + 1)
]

# The longest prefix sets the common width; shorter ones are left-padded.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)



In [None]:
# Length of the longest prefix; every input sequence was padded to this width.
max_sequence_len

42

In [None]:
# Number of training samples (one per prefix generated from the texts).
len(input_sequences)

106805

In [None]:
# Predictors and label: each padded sequence is split into its leading tokens
# (the model input) and its final token (the word to predict).
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

# The labels stay as integer class ids and are paired with the sparse variant
# of the loss below. This optimises the same objective as one-hot labels with
# categorical_crossentropy, without materialising a dense
# (samples x vocabulary) one-hot matrix in memory.

# Model: embedding -> single LSTM layer -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=10, input_length=max_sequence_len - 1),
    LSTM(100),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# Train the model on the full set of (prefix, next word) pairs for 50 epochs.
# No validation data is passed, so the reported metrics are training metrics.
model.fit(predictors, label, epochs=50, verbose=1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7b53c034cbb0>

In [None]:
# Continue training the same model for 80 more epochs. fit() resumes from the
# current weights rather than re-initialising them, so together with the
# previous cell this amounts to 130 epochs in total.
model.fit(predictors, label, epochs=80, verbose=1)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.src.callbacks.History at 0x7b53c0391e40>

In [None]:
# Prediction function
def predict_next_words(model, tokenizer, text, num_words=3):
    """Print the `num_words` most likely next words for `text`.

    Note: a later cell defines another function with this same name (greedy
    multi-word generation); once that cell runs it replaces this one.

    Args:
        model: trained Keras model mapping a padded index sequence to a
            probability distribution over the vocabulary.
        tokenizer: the fitted Tokenizer used to encode the training texts.
        text: prompt whose next word should be predicted.
        num_words: how many candidate words to print.

    Returns:
        None. The candidates are printed one per line, most likely first.
        A candidate whose index has no vocabulary entry (e.g. the padding
        index 0) is printed as an empty line.
    """
    if num_words <= 0:
        # Guard: a slice of [-0:] would select the entire vocabulary.
        return

    # The prompt does not change, so a single forward pass is enough.
    sequence = tokenizer.texts_to_sequences([text])[0]
    # Relies on the notebook-level max_sequence_len; the model input is one
    # token shorter because the last token of each sample was the label.
    padded_sequence = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')
    predictions = model.predict(padded_sequence, verbose=0)[0]

    # argsort is ascending: take the last num_words indices and reverse them
    # so the most probable candidate comes first.
    predicted_word_indices = np.argsort(predictions)[-num_words:][::-1]
    for idx in predicted_word_indices:
        output_word = tokenizer.index_word.get(int(idx), '')
        print(output_word)

In [None]:
# Show the candidate next words for a sample prompt (num_words defaults to 3).
predict_next_words(model,tokenizer,"have a meeting with")

potential
a
name
potential
a
name
potential
a
name


In [None]:
def predict_next_words(model, tokenizer, text, num_words=15):
    """Greedily extend `text` by up to `num_words` words, printing each one.

    Note: this redefines the earlier function of the same name; from this
    cell on, `predict_next_words` refers to this greedy generator.

    At every step the current text is encoded, the model predicts a
    distribution over the vocabulary, and the single most probable word is
    appended to the text before the next step.

    Args:
        model: trained Keras model mapping a padded index sequence to a
            probability distribution over the vocabulary.
        tokenizer: the fitted Tokenizer used to encode the training texts.
        text: seed prompt to continue.
        num_words: maximum number of words to generate.

    Returns:
        None. Each generated word is printed on its own line.
    """
    current_text = text
    for _ in range(num_words):
        sequence = tokenizer.texts_to_sequences([current_text])[0]
        # Relies on the notebook-level max_sequence_len; the model input is
        # one token shorter because the last token of each sample was the label.
        padded_sequence = pad_sequences([sequence], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(padded_sequence, verbose=0)[0]
        next_word_index = int(np.argmax(predictions))
        next_word = tokenizer.index_word.get(next_word_index, '')
        if not next_word:
            # The model picked an index with no word (e.g. padding index 0).
            # Appending '' leaves the encoded text unchanged, so every
            # further step would repeat this prediction: stop here instead.
            break
        current_text += ' ' + next_word
        print(next_word)


In [None]:
# Greedy generation from a one-word prompt (num_words defaults to 15).
predict_next_words(model,tokenizer,"buy")

a
new
charger
for
the
laptop
next
month
and
adjust
on
app
for
next
month


In [None]:
# Save the trained model to an HDF5 file (.h5 extension).
# NOTE(review): the recorded output below looks like the tail of a Keras
# warning about this format; the next cell also saves a .keras file.
model.save('my_model.h5')


  saving_api.save_model(


In [None]:
# Save the model in the format recommended by Keras
model.save('my_model.keras')


In [None]:
import pickle

# Persist the fitted tokenizer next to the model so the same word-to-index
# mapping can be restored at inference time. Only unpickle this file from a
# trusted source: loading a pickle can execute arbitrary code.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)
