In [74]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import scipy.sparse as sp
import joblib
from tqdm import tqdm

In [75]:
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def preprocess(self, text):
        text = ''.join(char.lower() for char in text if char not in string.punctuation)
        filtered_words = [word for word in text.split() if word not in self.stop_words]
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in filtered_words]
        return lemmatized_words



In [76]:
class IntentDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.FloatTensor(features.toarray() if sp.issparse(features) else features)
        self.labels = torch.LongTensor(labels) if labels is not None else None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]



In [77]:
class IntentClassifierNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(IntentClassifierNN, self).__init__()
        
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)
        self.layer3 = nn.Linear(hidden_size // 2, num_classes)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.batch_norm1 = nn.LayerNorm(hidden_size)
        self.batch_norm2 = nn.LayerNorm(hidden_size // 2)

    def forward(self, x):
        x = self.dropout(self.relu(self.batch_norm1(self.layer1(x))))
        x = self.dropout(self.relu(self.batch_norm2(self.layer2(x))))
        x = self.layer3(x)
        return x



In [78]:
class DeepIntentClassifier:
    def __init__(self, max_features=1000, hidden_size=256, batch_size=32, num_epochs=10, learning_rate=0.001, accumulation_steps=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.preprocessor = TextPreprocessor()
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
        self.label_encoder = LabelEncoder()
        self.max_features = max_features
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.accumulation_steps = accumulation_steps 
        self.model = None
        self.is_fitted = False

    def bert_encode(self, texts):
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    def extract_keywords(self, query, top_n=5):
        tfidf_matrix = self.tfidf_vectorizer.transform([query])
        feature_array = np.array(self.tfidf_vectorizer.get_feature_names_out())
        tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
        return feature_array[tfidf_sorting][:top_n].tolist()

    def prepare_data(self, df):
        try:
            if 'query' not in df.columns:
                raise ValueError("DataFrame must contain a 'query' column")
            if 'intent_category' not in df.columns:
                raise ValueError("DataFrame must contain an 'intent_category' column")
            
            processed_queries = [' '.join(self.preprocessor.preprocess(q)) for q in df['query']]
            class_counts = df['intent_category'].value_counts()
            min_samples = class_counts.min()
            
            print(f"Class distribution:\n{class_counts}")
            print(f"Minimum samples per class: {min_samples}")
            
            if min_samples >= 2:
                X_train, X_test, y_train, y_test = train_test_split(
                    processed_queries,
                    df['intent_category'],
                    test_size=0.2,
                    random_state=42,
                    stratify=df['intent_category']
                )
                print("Using stratified split")
            else:
                X_train, X_test, y_train, y_test = train_test_split(
                    processed_queries,
                    df['intent_category'],
                    test_size=0.2,
                    random_state=42
                )
                print("Using regular split (not stratified)")
            
            self.tfidf_vectorizer.fit(X_train)
            return X_train, X_test, y_train, y_test
        
        except Exception as e:
            print(f"Error in data preparation: {str(e)}")
            raise

    def create_features(self, X):
        X_tfidf = self.tfidf_vectorizer.transform(X)
        X_bert = self.bert_encode(X)
        return sp.hstack((X_tfidf, X_bert))

    def fit(self, X_train, y_train):
        try:
            if len(X_train) < self.batch_size:
                self.batch_size = max(2, len(X_train))  # Ensure batch size is at least 2
                print(f"Batch size adjusted to {self.batch_size} due to small dataset")
            
            X_train_combined = self.create_features(X_train)
            y_train_encoded = self.label_encoder.fit_transform(y_train)
            
            input_size = X_train_combined.shape[1]
            num_classes = len(self.label_encoder.classes_)
            self.model = IntentClassifierNN(input_size, self.hidden_size, num_classes).to(self.device)
            
            train_dataset = IntentDataset(X_train_combined, y_train_encoded)
            train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
            
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
            
            for epoch in range(self.num_epochs):
                self.model.train()
                total_loss = 0
                optimizer.zero_grad()
                progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{self.num_epochs}')
                
                for i, (batch_features, batch_labels) in enumerate(progress_bar):
                    batch_features = batch_features.to(self.device)
                    batch_labels = batch_labels.to(self.device)
                    
                    outputs = self.model(batch_features)
                    loss = criterion(outputs, batch_labels)
                    loss.backward()
                    
                    
                    if (i + 1) % self.accumulation_steps == 0 or (i + 1) == len(train_loader):
                        optimizer.step()
                        optimizer.zero_grad()
                    
                    total_loss += loss.item()
                    progress_bar.set_postfix({'loss': total_loss / len(train_loader)})
            
            self.is_fitted = True
            return self
        
        except Exception as e:
            print(f"Error during model fitting: {str(e)}")
            raise

    def predict(self, queries):
        self.model.eval()
        processed_queries = [' '.join(self.preprocessor.preprocess(q)) for q in queries]
        features = self.create_features(processed_queries)
        dataset = IntentDataset(features)
        dataloader = DataLoader(dataset, batch_size=self.batch_size)
        
        predictions = []
        with torch.no_grad():
            for batch_features in dataloader:
                batch_features = batch_features.to(self.device)
                outputs = self.model(batch_features)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.cpu().numpy())
        
        return self.label_encoder.inverse_transform(predictions)
    
    def predict_with_keywords(self, query):
        prediction = self.predict([query])[0]
        keywords = self.extract_keywords(query)
        return prediction, keywords

    def predict_with_confidence(self, queries):
        self.model.eval()
        processed_queries = [' '.join(self.preprocessor.preprocess(q)) for q in queries]
        features = self.create_features(processed_queries)
        dataset = IntentDataset(features)
        dataloader = DataLoader(dataset, batch_size=self.batch_size)
        
        all_probabilities = []
        with torch.no_grad():
            for batch_features in dataloader:
                batch_features = batch_features.to(self.device)
                outputs = self.model(batch_features)
                probabilities = torch.softmax(outputs, dim=1)
                all_probabilities.extend(probabilities.cpu().numpy())
        
        predictions = np.argmax(all_probabilities, axis=1)
        confidences = np.max(all_probabilities, axis=1)
        predicted_intents = self.label_encoder.inverse_transform(predictions)
        
        return list(zip(predicted_intents, confidences))

    def evaluate(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return classification_report(y_test, y_pred, target_names=self.label_encoder.classes_)

    # def save_model(self, filepath):
    #     model_state = {
    #         'model_state_dict': self.model.state_dict(),
    #         'tfidf_vectorizer': self.tfidf_vectorizer,
    #         'label_encoder': self.label_encoder,
    #     }
    #     torch.save(model_state, filepath)

    def load_model(self, filepath):
        model_state = torch.load(filepath)
        self.tfidf_vectorizer = model_state['tfidf_vectorizer']
        self.label_encoder = model_state['label_encoder']
        
        input_size = self.tfidf_vectorizer.max_features + 768  # BERT features size
        num_classes = len(self.label_encoder.classes_)
        self.model = IntentClassifierNN(input_size, self.hidden_size, num_classes).to(self.device)
        self.model.load_state_dict(model_state['model_state_dict'])
        return self


In [79]:
import json
path = '../data/user_intention.json'

In [80]:
def load_json_data(json_string):

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    queries = []
    possible_answers = []
    intents = []
    intent_categories = []
    
    for item in data:
        intent = item['intent']
        intent_category = item['intent_category']
        
        for query_item in item['user_queries']:
            queries.append(query_item['query'])
            possible_answers.append(query_item['possible_answers'])
            intents.append(intent)
            intent_categories.append(intent_category)

    
    return pd.DataFrame({
        'query': queries,
        'possible_answers': possible_answers,
        'intent': intents,
        'intent_category': intent_categories
    })

In [81]:
def main():
    try:
        
        df = load_json_data(path)
        
        classifier = DeepIntentClassifier(
            max_features=1000,
            hidden_size=256,
            batch_size=2,  
            num_epochs=100   
        )
        
        # Prepare data
        X_train, X_test, y_train, y_test = classifier.prepare_data(df)
        
        # Train model
        classifier.fit(X_train, y_train)
        
        # Evaluate
        evaluation_report = classifier.evaluate(X_test, y_test)
        print("Model Evaluation:")
        print(evaluation_report)
        
     
        example_queries = ["What is the best hotel in Japan?"]
        predictions = classifier.predict(example_queries)
        
        for query, intent in zip(example_queries, predictions):
            keywords = classifier.predict_with_keywords(query)
            print(f"Query: '{query}'")
            print(f"Predicted Intent: '{intent}'")
            print(f"Keywords: {keywords}\n")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [82]:
if __name__ == "__main__":
    main()

Class distribution:
intent_category
SearchCity              13
SearchFood              12
SearchTransportation     7
SearchActivity           7
PlanCity                 7
PlanFood                 7
PlanTransportation       7
PlanActivity             7
SearchAttractions        6
PlanTravel               6
PlanAccommodation        5
PlanBudget               5
CulturalExperience       5
GeneralTravel            5
Name: count, dtype: int64
Minimum samples per class: 5
Using stratified split


Epoch 1/100: 100%|██████████| 40/40 [00:00<00:00, 273.01it/s, loss=2.76]
Epoch 2/100: 100%|██████████| 40/40 [00:00<00:00, 282.88it/s, loss=2.59]
Epoch 3/100: 100%|██████████| 40/40 [00:00<00:00, 283.75it/s, loss=2.27]
Epoch 4/100: 100%|██████████| 40/40 [00:00<00:00, 270.36it/s, loss=2.06]
Epoch 5/100: 100%|██████████| 40/40 [00:00<00:00, 279.72it/s, loss=1.78]
Epoch 6/100: 100%|██████████| 40/40 [00:00<00:00, 273.89it/s, loss=1.59]
Epoch 7/100: 100%|██████████| 40/40 [00:00<00:00, 280.64it/s, loss=1.42] 
Epoch 8/100: 100%|██████████| 40/40 [00:00<00:00, 284.77it/s, loss=1.15] 
Epoch 9/100: 100%|██████████| 40/40 [00:00<00:00, 276.12it/s, loss=0.987]
Epoch 10/100: 100%|██████████| 40/40 [00:00<00:00, 275.62it/s, loss=0.802]
Epoch 11/100: 100%|██████████| 40/40 [00:00<00:00, 109.03it/s, loss=0.786]
Epoch 12/100: 100%|██████████| 40/40 [00:00<00:00, 145.71it/s, loss=0.58] 
Epoch 13/100: 100%|██████████| 40/40 [00:00<00:00, 219.44it/s, loss=0.618]
Epoch 14/100: 100%|██████████| 40/40 [00

Model Evaluation:
                      precision    recall  f1-score   support

  CulturalExperience       0.50      1.00      0.67         1
       GeneralTravel       0.00      0.00      0.00         1
   PlanAccommodation       0.00      0.00      0.00         1
        PlanActivity       0.00      0.00      0.00         1
          PlanBudget       0.00      0.00      0.00         1
            PlanCity       0.00      0.00      0.00         1
            PlanFood       1.00      1.00      1.00         1
  PlanTransportation       0.50      1.00      0.67         1
          PlanTravel       0.00      0.00      0.00         1
      SearchActivity       0.00      0.00      0.00         2
   SearchAttractions       0.25      1.00      0.40         1
          SearchCity       1.00      0.67      0.80         3
          SearchFood       1.00      1.00      1.00         3
SearchTransportation       0.00      0.00      0.00         2

            accuracy                           0.4