## TF-IDF Vectorizer + Random Forest Classifier NLU Module

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import joblib

# Read the crossword clue/answer pairs. na_filter=False switches off pandas'
# missing-value detection, so entries are not converted to NaN on load.
df_train = pd.read_csv('crossword_train.csv', na_filter=False)

# Keep a reproducible random subset of 1000 rows.
df_train = df_train.sample(n=1000, random_state=42)

# Hold out 20% of the sampled rows for evaluation. Clues (the 'clue' column)
# are cast to str so the vectorizer only ever sees text; the targets come from
# the 'answer' column unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['clue'].astype(str),
    df_train['answer'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a 100-tree random forest; each distinct answer in
# y_train is treated as its own class. The pipeline is fitted on the training
# split as soon as it is built.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, verbose=True, n_jobs=-1)),
]).fit(X_train, y_train)

# Make predictions on the test set and get per-class probabilities
# (one row per clue, one column per entry of model.classes_).
y_pred_proba = model.predict_proba(X_test)

# Number of candidate answers kept per clue. This was previously sliced with
# 2000, which exceeds the number of classes the model can have learned from
# the sampled training rows; every class was therefore returned and the
# "top 20" accuracy computed from these lists was meaningless.
top_k = 20

# Get the top_k most probable answers for each clue, best first.
# argsort() orders ascending, so the last top_k indices are the most probable
# classes; [::-1] flips them into descending-probability order. If the model
# has fewer than top_k classes the slice simply returns all of them.
top_words = [
    list(model.classes_[proba_prediction.argsort()[-top_k:][::-1]])
    for proba_prediction in y_pred_proba
]

# Evaluate the model: a clue counts as a hit when its true answer appears
# anywhere in the candidate list produced for that clue.
correct_predictions = 0
for candidates, true_answer in zip(top_words, y_test):
    if true_answer not in candidates:
        continue
    # Only hits are echoed, so the output shows what the model got right.
    correct_predictions += 1
    print("Predicted Words:", candidates)
    print("True answer:", true_answer)
    print()

# Fraction of test clues whose answer was among the candidates.
accuracy = correct_predictions / len(y_test)
print(f"Accuracy on the test set: {accuracy:.2%}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.9s


In [None]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later with joblib.load. The file is pickle-based: only load it
# from a trusted location.
joblib.dump(value=model, filename='crossword_model.pkl')

In [None]:
# Example usage: score a couple of unseen clues with the trained pipeline.
new_clues = ["Wrestling sport for gigantic Japanese", "Explorer or actor Sebastian"]
proba_predictions = model.predict_proba(new_clues)

# For each clue keep the 20 most probable answers, best first: argsort() is
# ascending, so take the last 20 indices and reverse them before looking up
# the corresponding labels in model.classes_.
top_words = [
    list(model.classes_[clue_proba.argsort()[-20:][::-1]])
    for clue_proba in proba_predictions
]

print("Top 20 Predictions for Each Clue:")
for clue_number, (clue_text, clue_words) in enumerate(zip(new_clues, top_words), start=1):
    print(f"Clue #{clue_number}: {clue_text}")
    print("Predicted Words:", clue_words)
    print()