In [12]:
import numpy as np
import pandas as pd
import torch
import string
import json
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from transformers import DistilBertTokenizer, DistilBertModel
import scipy.sparse as sp
from tensorflow.keras.utils import to_categorical

In [13]:
class TextPreprocessor:
    
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def preprocess(self, text):
        # Remove punctuation and convert to lowercase
        text = ''.join(char.lower() for char in text if char not in string.punctuation)

        # Tokenize, remove stopwords, and lemmatize
        filtered_words = [word for word in text.split() if word not in self.stop_words]
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in filtered_words]
        return lemmatized_words

In [14]:
class IntentClassifier:
    def __init__(self, max_features=1000):
        self.preprocessor = TextPreprocessor()
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
        self.label_encoder = LabelEncoder()
        self.model = LogisticRegression(max_iter=1000, n_jobs=-1)
        
    def bert_encode(self, texts):
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
            # pooled embedding of shape <1, hidden_size>
        return outputs.last_hidden_state.mean(dim=1).numpy()
    
    def extract_keywords(self, query, top_n=5):
        tfidf_matrix = self.tfidf_vectorizer.transform([query])
        feature_array = np.array(self.tfidf_vectorizer.get_feature_names_out())
        tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
        return feature_array[tfidf_sorting][:top_n].tolist()

    def prepare_data(self, df):
        # Preprocess queries
        queries = [' '.join(self.preprocessor.preprocess(q)) for q in df['query']]
        df['queries'] = queries
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            df['queries'],
            df['intent_category'],
            test_size=0.2,
            random_state=42,
            stratify=df['intent_category']
        )
        return X_train, X_test, y_train, y_test

    def fit(self, X_train, y_train):

        # TF-IDF features
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(X_train)
        
        # BERT features
        X_train_bert = self.bert_encode(X_train.tolist())
        
        # Combine features
        X_train_combined = sp.hstack((X_train_tfidf, X_train_bert))
        
        # Encode labels
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        
        # Train model
        self.model.fit(X_train_combined, y_train_encoded)
        
        return self

    def predict(self, queries):

        processed_queries = [' '.join(self.preprocessor.preprocess(q)) for q in queries]
        
        # Generate features
        query_tfidf = self.tfidf_vectorizer.transform(processed_queries)
        query_bert = self.bert_encode(processed_queries)
        query_combined = sp.hstack((query_tfidf, query_bert))
        
        # Predict
        predictions = self.model.predict(query_combined)
        return self.label_encoder.inverse_transform(predictions)

    def predict_with_keywords(self, query):
        prediction = self.predict([query])[0]
        keywords = self.extract_keywords(query)
        return prediction, keywords

    def evaluate(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return classification_report(y_test, y_pred, target_names=self.label_encoder.classes_)

In [15]:
path = '../data/user_intention.json'

In [16]:
def load_json_data(json_string):

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    queries = []
    possible_answers = []
    intents = []
    intent_categories = []
    
    for item in data:
        intent = item['intent']
        intent_category = item['intent_category']
        
        for query_item in item['user_queries']:
            queries.append(query_item['query'])
            possible_answers.append(query_item['possible_answers'])
            intents.append(intent)
            intent_categories.append(intent_category)

    
    return pd.DataFrame({
        'query': queries,
        'possible_answers': possible_answers,
        'intent': intents,
        'intent_category': intent_categories
    })

In [17]:
def main():
    # Load DataFrame
    df = load_json_data(path)
    
    classifier = IntentClassifier()
    
    # Prepare data
    X_train, X_test, y_train, y_test = classifier.prepare_data(df)
    
    # Train model
    classifier.fit(X_train, y_train)
    
    # Evaluate
    evaluation_report = classifier.evaluate(X_test, y_test)
    print("Model Evaluation:")
    print(evaluation_report)
    
    
    example_queries = ["What is the best hotel in Japan?"]
    
    for query in example_queries:
        intent, keywords = classifier.predict_with_keywords(query)
        print(f"Query: '{query}'")
        print(f"Predicted Intent: '{intent}'")
        print(f"Keywords: {keywords}\n")

In [18]:
if __name__ == "__main__":
    main()

Model Evaluation:
                      precision    recall  f1-score   support

  CulturalExperience       0.50      1.00      0.67         1
       GeneralTravel       0.00      0.00      0.00         1
   PlanAccommodation       0.00      0.00      0.00         1
        PlanActivity       0.00      0.00      0.00         1
          PlanBudget       0.00      0.00      0.00         1
            PlanCity       0.50      1.00      0.67         1
            PlanFood       1.00      1.00      1.00         1
  PlanTransportation       0.50      1.00      0.67         1
          PlanTravel       0.00      0.00      0.00         1
      SearchActivity       0.00      0.00      0.00         2
   SearchAttractions       0.00      0.00      0.00         1
          SearchCity       0.75      1.00      0.86         3
          SearchFood       1.00      1.00      1.00         3
SearchTransportation       1.00      0.50      0.67         2

            accuracy                           0.5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
