In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment. `stopwords` backs the `stop_words` set; the Punkt tokenizer
# data backs `nltk.word_tokenize`, which the preprocessing step calls.
# Older NLTK releases look the tokenizer up as 'punkt', newer ones as
# 'punkt_tab', so both are requested. `nltk.download` reports a failed or
# unknown package by returning False rather than raising.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Retheck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading the training data


In [8]:
# Read the customer-service sample CSV into a DataFrame.
train_data = pd.read_csv(
    "C:\\Users\\Retheck\\Downloads\\Bitext_Sample_Customer_Service_Training_Dataset.csv"
)

# Preview the first rows to confirm the columns were parsed as expected.
print(train_data.head())

  flags                                          utterance category  \
0    BM            I have problems with canceling an order    ORDER   
1   BIM  how can I find information about canceling ord...    ORDER   
2     B          I need help with canceling the last order    ORDER   
3   BIP  could you help me cancelling the last order I ...    ORDER   
4     B            problem with cancelling an order I made    ORDER   

         intent  
0  cancel_order  
1  cancel_order  
2  cancel_order  
3  cancel_order  
4  cancel_order  


# Text preprocessing 

In [9]:
def preprocess_text(text):
    """Normalise one utterance before it is vectorised.

    The text is lower-cased, stripped of every character in
    ``string.punctuation``, tokenised with ``nltk.word_tokenize`` and
    filtered against the module-level ``stop_words`` set.

    Args:
        text: The raw utterance. Missing scalars (``None``, ``NaN``,
            ``pd.NA``) are treated as an empty utterance; any other
            non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        the input is missing or nothing survives the filtering.
    """
    # pandas represents blank CSV cells as float NaN, which has no `.lower()`;
    # handle missing and non-string values up front instead of crashing.
    if not isinstance(text, str):
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    # Deleting punctuation (rather than replacing it with spaces) keeps
    # contractions as single tokens, e.g. "can't" -> "cant".
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Store the cleaned text in its own column so the raw 'utterance' column
# stays available for comparison.
train_data['utterance_clean'] = train_data['utterance'].apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows.
print(train_data.head()[['utterance', 'utterance_clean']])


                                           utterance  \
0            I have problems with canceling an order   
1  how can I find information about canceling ord...   
2          I need help with canceling the last order   
3  could you help me cancelling the last order I ...   
4            problem with cancelling an order I made   

                         utterance_clean  
0               problems canceling order  
1      find information canceling orders  
2         need help canceling last order  
3  could help cancelling last order made  
4          problem cancelling order made  


# Split the Data into Training and Validation Sets

In [12]:
# Hold out 20% of the cleaned utterances (with their intent labels) for
# validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['utterance_clean'],
    train_data['intent'],
    random_state=42,
    test_size=0.2,
)


# Vectorization


In [13]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit on the training utterances only, then apply that same fitted
# vectorizer to the validation utterances so both matrices share columns.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Report (rows, features) for the training and validation matrices.
print(f"{X_train_vectorized.shape} {X_val_vectorized.shape}")


(6540, 577) (1635, 577)


# Training the model

In [15]:
# Fit a logistic-regression intent classifier on the TF-IDF features.
# max_iter=200 sets the solver's iteration limit; `fit` returns the
# estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=200).fit(X_train_vectorized, y_train)

print("Model training completed.")


Model training completed.


#  Evaluate the Model on the Validation Set

In [16]:
# Predict an intent for every validation utterance.
y_val_pred = model.predict(X_val_vectorized)

# Per-intent precision / recall / F1 on the validation split.
print(classification_report(y_val, y_val_pred))

# Line up each validation utterance with its true and predicted intent.
# Note: X_val holds the cleaned text, so that is what 'utterance' shows here.
validation_results = pd.DataFrame(
    {'utterance': X_val, 'true_intent': y_val}
).assign(predicted_intent=y_val_pred)
print(validation_results.head())


                          precision    recall  f1-score   support

            cancel_order       0.98      0.98      0.98        62
            change_order       1.00      1.00      1.00        70
 change_shipping_address       1.00      0.98      0.99        60
  check_cancellation_fee       0.99      1.00      0.99        66
           check_invoice       1.00      1.00      1.00        63
   check_payment_methods       1.00      0.96      0.98        68
     check_refund_policy       0.98      1.00      0.99        59
               complaint       1.00      1.00      1.00        52
contact_customer_service       0.98      1.00      0.99        61
     contact_human_agent       1.00      1.00      1.00        57
          create_account       0.98      0.94      0.96        62
          delete_account       0.96      1.00      0.98        53
        delivery_options       0.96      1.00      0.98        55
         delivery_period       0.98      1.00      0.99        49
         

# Predict on New User Inputs 

In [17]:
def predict_intent(user_input):
    """Return the predicted intent label for a single raw utterance.

    The utterance is cleaned with ``preprocess_text``, turned into features
    by the module-level ``vectorizer`` and classified by the module-level
    ``model``.

    Args:
        user_input: The raw utterance text to classify.

    Returns:
        The first (and only) element of ``model.predict`` for this input.
    """
    # The vectorizer works on a collection of documents, hence the list.
    features = vectorizer.transform([preprocess_text(user_input)])
    return model.predict(features)[0]

# Smoke test: run one hand-written utterance through `predict_intent`
# and print the label it returns.
user_input = "I need help with canceling my order"
predicted_intent = predict_intent(user_input)
print(f"Predicted intent: {predicted_intent}")


Predicted intent: cancel_order


#  Fine-tuning the Model

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Reload the dataset; this experiment feeds the raw 'utterance' column to
# the pipeline instead of the cleaned column used earlier.
data = pd.read_csv("C:\\Users\\Retheck\\Downloads\\Bitext_Sample_Customer_Service_Training_Dataset.csv")

# Feature (X) is the utterance text, target (y) is the intent label.
X, y = data['utterance'], data['intent']

# 80/20 train/test split with a fixed seed.
# NOTE: this rebinds X_train / y_train, replacing the values assigned by the
# earlier train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TF-IDF features feeding a support-vector classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
])

# Search space: the '<step>__<param>' keys address parameters of the
# pipeline steps. 2 x 2 x 3 x 2 = 24 candidate combinations.
param_grid = {
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [1, 10, 100],
    'svm__kernel': ['linear', 'rbf'],
}

# 5-fold cross-validated grid search scored by accuracy; verbose=2 logs
# every individual fit.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
print("Best parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_svm = best_model.predict(X_test)

print(classification_report(y_test, y_pred_svm))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   3.1s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   2.6s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   2.6s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 1); total time=   2.5s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   5.0s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   4.9s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time=   5.0s
[CV] END svm__C=1, svm__kernel=linear, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2); total time= 

# Expanding Training Data

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Simulated new data for training
additional_data = [
    {"utterance": "I want to cancel my subscription", "intent": "cancel_order"},
    {"utterance": "I would like a refund for my purchase", "intent": "get_refund"},
    # Add more utterances with different intents if needed
]

# Converting additional_data to a DataFrame
additional_df = pd.DataFrame(additional_data)

# Concatenating the new data with the existing training split
base_train_df = pd.DataFrame({'utterance': X_train, 'intent': y_train})
data_combined = pd.concat([base_train_df, additional_df], ignore_index=True)

# Feature (X) and target (y) of the combined data
X_combined = data_combined['utterance']
y_combined = data_combined['intent']

# Split the combined data under NEW names. Rebinding X_train / X_test /
# y_train / y_test here would make the cell non-idempotent (every re-run
# would shrink the training set and append the extra rows again) and would
# replace the existing hold-out set with rows carved out of training data.
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Fresh TF-IDF + SVM pipeline with default hyper-parameters; this rebinds
# `pipeline` and does not reuse any previously tuned parameters.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Fitting the pipeline on the augmented training split
pipeline.fit(X_train_aug, y_train_aug)


#  Adding Intent Confidence

In [22]:
def predict_intent_with_confidence(user_input):
    """Return the predicted intent for an utterance plus a confidence score.

    The utterance is cleaned with ``preprocess_text`` and vectorised with
    the module-level ``vectorizer``; the module-level ``model`` supplies both
    the label and the class probabilities.

    Args:
        user_input: The raw utterance text to classify.

    Returns:
        tuple: ``(predicted_intent, max_confidence)`` where
        ``max_confidence`` is the largest value in the ``predict_proba``
        row for this utterance.
    """
    # A single-document batch: the vectorizer expects a collection.
    features = vectorizer.transform([preprocess_text(user_input)])

    label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    return label, max(class_probabilities)

# Smoke test: classify one hand-written utterance and print the label with
# its confidence formatted to two decimal places.
user_input = "I need help with canceling my order"
predicted_intent, confidence = predict_intent_with_confidence(user_input)
print(f"Predicted intent: {predicted_intent}, Confidence: {confidence:.2f}")


Predicted intent: cancel_order, Confidence: 0.85
