In [2]:
# install necessary libraries (assuming the initial block has been run)
# !pip install scikit-learn
# !pip install pandas
# !pip install nltk

# download necessary nltk resources (assuming the initial block has been run)
# import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('wordnet')
# nltk.download('stopwords')

print("NLTK Resources downloaded successfully!")

# Import necessary libraries
import pandas as pd
import numpy as np
import json
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# üåü ÊõøÊç¢ÔºöÂØºÂÖ•ÈÄªËæëÂõûÂΩí (Logistic Regression)
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump,load

# Import NLTK
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Initialize NLTK resources
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Placeholder: Load the actual dataset. Ensure it has 'text' (user query) and 'intent' (label) columns
df = pd.read_csv('dataset.csv')

# shuffle the data for robust splitting
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.head())

def preprocess_text(text):
    # 1. Convert to Lowercase
    text = text.lower()
    
    # 2. Remove Punctuation and Special Characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # 3. Tokenization
    tokens = word_tokenize(text)
    
    # 4. Stopword Removal
    tokens = [word for word in tokens if word not in stop_words]
    
    # 5. Lemmatization (Key Enhancement)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Apply the new preprocessing function to the text column
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("--- Preprocessing Complete (with NLTK Lemmatization) ---")
print(df[['text', 'cleaned_text']].head())

# Prepare the cleaned text and intents for the model training section
X = df['cleaned_text']
y = df['intent']

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data to create the feature matrix X
X = vectorizer.fit_transform(X)
y = df['intent']

print(f"Feature matrix X shape: {X.shape}")
print(f"Labels y shape: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size=0.2,
    random_state=42,
    stratify=y if len(df['intent'].unique()) > 1 else None, # ‰ªÖÂú®ÊúâÂ§ö‰∏™Á±ªÂà´Êó∂ÂàÜÂ±Ç
)

print(f"Train set size:{X_train.shape[0]} samples")
print(f"Test set size:{X_test.shape[0]} samples")

# üåü ÊõøÊç¢ÔºöÂÆû‰æãÂåñÈÄªËæëÂõûÂΩí (Logistic Regression) Ê®°Âûã
# C: Ê≠£ÂàôÂåñÂèÇÊï∞ÁöÑÂÄíÊï∞ (ÊÉ©ÁΩöÈ°πÔºåË∂äÂ∞èÊ≠£ÂàôÂåñË∂äÂº∫)
# solver='liblinear' Âú®Â§ÑÁêÜL1/L2ÊÉ©ÁΩö‰∏îÊï∞ÊçÆÈõÜÁõ∏ÂØπËæÉÂ∞èÊó∂Ë°®Áé∞ËâØÂ•Ω
# multi_class='ovr' Áî®‰∫éÂ§öÂàÜÁ±ª‰ªªÂä°
lr_model = LogisticRegression(C=1.0, solver='liblinear', multi_class='ovr', random_state=42)

# ËÆ≠ÁªÉÊ®°Âûã
lr_model.fit(X_train, y_train)

# Make predictions on the test set
pred = lr_model.predict(X_test)

print("--- Logistic Regression Model Evaluation ---")
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n")
if len(y_test.unique()) > 1:
    print(classification_report(y_test, pred, zero_division=0))
else:
    print("Classification Report skipped: Only one class in test set.")

# üåü ÊõøÊç¢Ôºö‰øùÂ≠òËÆ≠ÁªÉÂ•ΩÁöÑ Logistic Regression Ê®°ÂûãÂíå Vectorizer
dump(lr_model, 'logistic_regression_intent_model.joblib')
dump(vectorizer, 'tfidf_vectorizer_LR.joblib')
print("Model and Vectorizer saved using joblib.")

print("\n--- Loading Responses from JSON ---")
try:
    with open('response.json', 'r', encoding='utf-8') as f:
        responses = json.load(f)
    print(f"Successfully loaded {len(responses)} intent-response pairs.")
except FileNotFoundError:
    print("Error: response.json file not found.")
    responses = {}

# --- 4. Chatbot Function (Updated) ---
def chatbot_reply_lr(user_input, model, vectorizer, responses):
    # 1. Preprocessing
    # Critical: Must use the same preprocessing function as training
    user_input_cleaned = preprocess_text(user_input)
    
    # 2. Feature Extraction: Transform the input using the fitted vectorizer
    # Note: transform() returns a sparse matrix, which LogisticRegression handles natively
    vector = vectorizer.transform([user_input_cleaned])

    # 3. Intent Prediction
    # predict() returns an array, we take the first element [0]
    intent = model.predict(vector)[0]

    # 4. Retrieval (Check for unknown intent/fallback)
    # If the predicted intent exists in the dictionary, return the specific response
    # Otherwise, return a fallback message
    return responses.get(intent, f"Sorry, I predicted the intent '{intent}', but I don't have a specific response for that yet. Please rephrase your question.")

# --- 5. Test the Chatbot ---
print("\n--- Logistic Regression Chatbot Test ---")
test_input = "Can I get a room here?"
predicted_response = chatbot_reply_lr(test_input, lr_model, vectorizer, responses)

print(f"User Input: {test_input}")
print(f"Chatbot Reply: {predicted_response}")

NLTK Resources downloaded successfully!
                                text             intent
0                       Good evening           greeting
1           When does check-out end?  ask_checkout_time
2  What is your cancellation policy?   ask_cancellation
3      What is your check-in policy?   ask_checkin_time
4          When does check-in start?   ask_checkin_time
--- Preprocessing Complete (with NLTK Lemmatization) ---
                                text         cleaned_text
0                       Good evening         good evening
1           When does check-out end?         checkout end
2  What is your cancellation policy?  cancellation policy
3      What is your check-in policy?       checkin policy
4          When does check-in start?        checkin start
Feature matrix X shape: (100, 105)
Labels y shape: (100,)
Train set size:80 samples
Test set size:20 samples
--- Logistic Regression Model Evaluation ---
Accuracy: 0.75

Classification Report:

                   precis

