In [4]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import json

#Data Preparation

In [10]:
# Load the support-ticket spreadsheet into a DataFrame
df = pd.read_excel(io='/content/Vijaya Assisgnment Dataset - Copy.xlsx')

In [11]:
# Show the loaded frame (the notebook output elides the middle rows)
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV
...,...,...,...,...,...
995,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC
996,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300
997,998,,Installation Issue,Medium,EcoBreeze AC
998,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300


In [12]:
# (rows, columns) of the frame before any cleaning
df.shape

(1000, 5)

In [13]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
ticket_id,0
ticket_text,55
issue_type,76
urgency_level,52
product,0


In [14]:
# Remove every row that has a missing value in any column.
# This mutates `df` in place and keeps the original row labels, so the index
# has gaps afterwards (see the 0, 2, 3, 5, 6 labels in the previews below).
df.dropna(inplace=True)

In [15]:
# (rows, columns) left after dropping incomplete rows
df.shape

(826, 5)

In [16]:
# Fetch the NLTK stop-word corpus that preprocess_text filters against
nltk.download('stopwords')
# Load spaCy's small English pipeline; `nlp` is used below for lemmatisation
# and for DATE entity recognition
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
#Function to preprocess text

# Build the stop-word set once: calling stopwords.words('english') for every
# token rebuilds the list and scans it linearly on each membership test.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a raw ticket into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is run through the spaCy pipeline, and
    the lemma of each alphabetic token that is not an NLTK English stop word
    is kept.

    Args:
        text: Raw ticket text. Non-string values (e.g. NaN) are treated as
            empty text instead of raising.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.text not in STOP_WORDS and token.is_alpha
    ]
    return ' '.join(tokens)


# These statements used to sit after the `return` inside preprocess_text, so
# they never ran and the text was never cleaned. The result goes into a new
# column: `ticket_text` must stay raw because entity extraction later matches
# product names (with digits) and dates against the original wording.
df['clean_text'] = df['ticket_text'].apply(preprocess_text)
print(df[['ticket_id', 'clean_text', 'issue_type', 'urgency_level', 'product']].head())


In [18]:
# Preview the ticket text alongside its labels and product
df[['ticket_id', 'ticket_text', 'issue_type', 'urgency_level', 'product']].head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC


In [19]:
# First five rows of the full frame
df.head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC


#Feature Engineering

In [20]:
def _mean_word_length(text):
    """Return the mean length of the whitespace-separated words in `text`.

    Returns 0.0 when the text has no words. This covers whitespace-only text,
    which is truthy as a string but would make np.mean([]) return NaN and
    poison the feature matrix.
    """
    words = text.split()
    return float(np.mean([len(word) for word in words])) if words else 0.0


# Feature 1: Ticket Length (number of whitespace-separated words)
df['ticket_length'] = df['ticket_text'].apply(lambda x: len(x.split()))

# Feature 2: Average Word Length
df['avg_word_length'] = df['ticket_text'].apply(_mean_word_length)

# Feature 3: Sentiment Score (TextBlob polarity)
df['sentiment_score'] = df['ticket_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Feature 4: TF-IDF Vectors
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))  # 1-gram & 2-gram
tfidf_matrix = tfidf.fit_transform(df['ticket_text'])

# Convert TF-IDF matrix to DataFrame (one column per vocabulary term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Combine all features.
# `df` still carries its pre-dropna row labels while `tfidf_df` has a fresh
# 0..n-1 index, so both sides are re-indexed to align rows by position.
final_features_df = pd.concat([
    df[['ticket_length', 'avg_word_length', 'sentiment_score']].reset_index(drop=True),
    tfidf_df.reset_index(drop=True)
], axis=1)

# Preview
print(final_features_df.head())

   ticket_length  avg_word_length  sentiment_score   01  01 april   02  \
0             12         5.000000              0.0  0.0       0.0  0.0   
1             14         4.785714              0.0  0.0       0.0  0.0   
2             11         5.272727             -0.5  0.0       0.0  0.0   
3             16         4.312500              0.3  0.0       0.0  0.0   
4              9         5.000000              0.0  0.0       0.0  0.0   

   02 april  02 march  02 may   03  ...  x1 and  x1 are  x1 but  x1 instead  \
0       0.0       0.0     0.0  0.0  ...     0.0     0.0     0.0         0.0   
1       0.0       0.0     0.0  0.0  ...     0.0     0.0     0.0         0.0   
2       0.0       0.0     0.0  0.0  ...     0.0     0.0     0.0         0.0   
3       0.0       0.0     0.0  0.0  ...     0.0     0.0     0.0         0.0   
4       0.0       0.0     0.0  0.0  ...     0.0     0.0     0.0         0.0   

   x1 is  x1 setup  x1 warranty  x1 was       you  you tell  
0    0.0       0.0

#Multi-Task Learning

In [21]:
from scipy.sparse import hstack

# Fit unigram + bigram TF-IDF on the ticket text, vocabulary capped at 1000 terms
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['ticket_text'])

# Combine all features: TF-IDF columns first, then the three numeric columns
numeric_columns = ['ticket_length', 'avg_word_length', 'sentiment_score']
extra_features = df[numeric_columns].values
X = hstack([tfidf_matrix, extra_features])

In [22]:
# Integer-encode both targets; the fitted encoders are kept so class names can
# be recovered for the classification reports.
le_issue = LabelEncoder().fit(df['issue_type'])
y_issue = le_issue.transform(df['issue_type'])

le_urgency = LabelEncoder().fit(df['urgency_level'])
y_urgency = le_urgency.transform(df['urgency_level'])

In [23]:
# Both targets are split with the same test fraction and seed
split_options = {'test_size': 0.2, 'random_state': 42}
X_train1, X_test1, y_train_issue, y_test_issue = train_test_split(X, y_issue, **split_options)
X_train2, X_test2, y_train_urgency, y_test_urgency = train_test_split(X, y_urgency, **split_options)

In [24]:
# Model 1: Issue Type Classifier (fit() returns the fitted estimator)
model_issue = LogisticRegression(max_iter=1000).fit(X_train1, y_train_issue)
y_pred_issue = model_issue.predict(X_test1)

# Model 2: Urgency Level Classifier
model_urgency = LogisticRegression(max_iter=1000).fit(X_train2, y_train_urgency)
y_pred_urgency = model_urgency.predict(X_test2)

In [25]:
#Evaluation
# One (heading, true labels, predictions, encoder) entry per classifier
evaluation_runs = [
    ("Classification Report - Issue Type:", y_test_issue, y_pred_issue, le_issue),
    ("\nClassification Report - Urgency Level:", y_test_urgency, y_pred_urgency, le_urgency),
]
for heading, y_true, y_hat, encoder in evaluation_runs:
    print(heading)
    print(classification_report(y_true, y_hat, target_names=encoder.classes_))

Classification Report - Issue Type:
                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166


Classification Report - Urgency Level:
              precision    recall  f1-score   support

        High       0.34      0.30      0.32        66
         Low       0.28      0.30      0.29        43
      Medium       0.35      0.37      0.36        57

    accuracy                           0.33

In [26]:
#Accuracy Score
from sklearn.metrics import classification_report, accuracy_score

# Evaluate Issue Type Classifier
issue_report = classification_report(y_test_issue, y_pred_issue, target_names=le_issue.classes_)
issue_accuracy = accuracy_score(y_test_issue, y_pred_issue)
print("Classification Report - Issue Type:")
print(issue_report)
print("Accuracy - Issue Type:", issue_accuracy)

# Evaluate Urgency Level Classifier
urgency_report = classification_report(y_test_urgency, y_pred_urgency, target_names=le_urgency.classes_)
urgency_accuracy = accuracy_score(y_test_urgency, y_pred_urgency)
print("\nClassification Report - Urgency Level:")
print(urgency_report)
print("Accuracy - Urgency Level:", urgency_accuracy)

Classification Report - Issue Type:
                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       1.00      1.00      1.00        25
Installation Issue       1.00      1.00      1.00        29
     Late Delivery       1.00      1.00      1.00        17
    Product Defect       1.00      1.00      1.00        30
        Wrong Item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166

Accuracy - Issue Type: 1.0

Classification Report - Urgency Level:
              precision    recall  f1-score   support

        High       0.34      0.30      0.32        66
         Low       0.28      0.30      0.29        43
      Medium       0.35      0.37      0.36        57

    accuracy    

In [27]:
# The logistic-regression urgency model scored poorly, so try a different model.
# Note: this rebinds `model_urgency`, replacing the logistic-regression model.
from sklearn.ensemble import RandomForestClassifier

forest_options = dict(n_estimators=100, class_weight='balanced', random_state=42)
model_urgency = RandomForestClassifier(**forest_options)
model_urgency.fit(X_train2, y_train_urgency)

In [28]:
# Score the random-forest urgency model on the held-out split
y_pred_urgency = model_urgency.predict(X_test2)
forest_accuracy = accuracy_score(y_test_urgency, y_pred_urgency)
forest_report = classification_report(y_test_urgency, y_pred_urgency, target_names=le_urgency.classes_)
print("Accuracy:", forest_accuracy)
print(forest_report)

Accuracy: 0.35542168674698793
              precision    recall  f1-score   support

        High       0.38      0.35      0.37        66
         Low       0.29      0.33      0.30        43
      Medium       0.39      0.39      0.39        57

    accuracy                           0.36       166
   macro avg       0.35      0.35      0.35       166
weighted avg       0.36      0.36      0.36       166



Entity Extraction

In [29]:
# Distinct product names taken from the dataset's `product` column
product_list = df['product'].dropna().unique().tolist()
# Lower-case complaint terms that extract_entities searches for in each ticket
complaint_keywords = ['broken', 'late', 'error', 'malfunction', 'issue', 'not working']

def extract_entities(text):
    """Pull product names, date expressions and complaint terms out of a ticket.

    Args:
        text: Raw ticket text. Keep the original casing and digits, since
            product names such as "SmartWatch V2" contain both.

    Returns:
        A dict with three lists:
            "products"   - entries of `product_list` found in the text
                           (case-insensitive substring match),
            "dates"      - text of every spaCy entity labelled DATE,
            "complaints" - entries of `complaint_keywords` found in the text.
    """
    doc = nlp(text)
    lowered = text.lower()  # lower-case once instead of once per candidate

    products = [prod for prod in product_list if prod.lower() in lowered]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]

    # Anchor each keyword at the start of a word. A plain substring test also
    # fires in the middle of unrelated words ("late" in "translate", "issue"
    # in "tissue"). The end is left open so inflected forms such as "issues"
    # or "malfunctioning" still match.
    complaints = [
        word for word in complaint_keywords
        if re.search(r'\b' + re.escape(word), lowered)
    ]
    return {"products": products, "dates": dates, "complaints": complaints}

# Run entity extraction on every ticket (values are cast to str first)
ticket_strings = df['ticket_text'].astype(str)
df['entities'] = ticket_strings.apply(extract_entities)
print(df[['ticket_id', 'ticket_text', 'entities']].head())

   ticket_id                                        ticket_text  \
0          1  Payment issue for my SmartWatch V2. I was unde...   
2          3  I ordered SoundWave 300 but got EcoBreeze AC i...   
3          4  Facing installation issue with PhotoSnap Cam. ...   
5          6  Can you tell me more about the PhotoSnap Cam w...   
6          7   is malfunction. It stopped working after just...   

                                            entities  
0  {'products': ['SmartWatch V2'], 'dates': [], '...  
2  {'products': ['SoundWave 300', 'EcoBreeze AC']...  
3  {'products': ['PhotoSnap Cam'], 'dates': [], '...  
5  {'products': ['PhotoSnap Cam'], 'dates': [], '...  
6  {'products': [], 'dates': ['just 7 days'], 'co...  


Integration

In [30]:
def analyze_ticket(ticket_text, issue_type_model, urgency_level_model, vectorizer,
                   issue_encoder=None, urgency_encoder=None, preprocess=False):
    """Classify one ticket and extract its entities.

    The feature row is built the same way as the training matrix in this
    notebook: TF-IDF columns followed by ticket_length, avg_word_length and
    sentiment_score. Passing only the TF-IDF vector, as this function used to
    do, gives the models 3 fewer columns than they were trained on and
    `predict` fails with a feature-count error.

    Args:
        ticket_text: Raw ticket text. Non-string values are treated as "".
        issue_type_model: Fitted classifier for the issue type.
        urgency_level_model: Fitted classifier for the urgency level.
        vectorizer: The fitted TF-IDF vectorizer used at training time.
        issue_encoder: Optional fitted LabelEncoder; when given, the issue
            prediction is decoded back to its class name.
        urgency_encoder: Optional fitted LabelEncoder for the urgency target.
        preprocess: When True, run preprocess_text before building features.
            Leave False for a vectorizer fit on raw text, as the one in this
            notebook is. Set True only if training used cleaned text.

    Returns:
        Dict with "issue_type", "urgency_level" and "entities". The first two
        are class names when encoders are supplied, otherwise the models' raw
        predictions.
    """
    from scipy.sparse import csr_matrix, hstack

    raw_text = ticket_text if isinstance(ticket_text, str) else ''
    model_text = preprocess_text(raw_text) if preprocess else raw_text

    # Numeric features, in the same column order as the training matrix
    words = model_text.split()
    numeric_row = [[
        len(words),
        float(np.mean([len(word) for word in words])) if words else 0.0,
        TextBlob(model_text).sentiment.polarity,
    ]]
    features = hstack(
        [vectorizer.transform([model_text]), csr_matrix(numeric_row)],
        format='csr',
    )

    # Predictions
    issue_pred = issue_type_model.predict(features)[0]
    urgency_pred = urgency_level_model.predict(features)[0]
    if issue_encoder is not None:
        issue_pred = issue_encoder.inverse_transform([issue_pred])[0]
    if urgency_encoder is not None:
        urgency_pred = urgency_encoder.inverse_transform([urgency_pred])[0]

    # Entities are always taken from the raw text (casing and digits matter)
    entities = extract_entities(raw_text)

    return {
        "issue_type": issue_pred,
        "urgency_level": urgency_pred,
        "entities": entities
    }

#Gradio Interface

In [33]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [34]:
import gradio as gr

# Placeholder function for issue type prediction
def predict_issue_type(text):
    """Return a keyword-based issue label for `text` (case-insensitive).

    "password" takes precedence over "refund"; anything else is reported as
    "General Inquiry".
    """
    # Replace with your model logic
    lowered = text.lower()
    keyword_labels = (
        ("password", "Password Issue"),
        ("refund", "Refund Request"),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return "General Inquiry"

# Placeholder function for urgency prediction
def predict_urgency(text):
    """Return "High" when `text` mentions "urgent" or "asap" (any case), else "Normal"."""
    # Replace with your model logic
    lowered = text.lower()
    is_urgent = any(marker in lowered for marker in ("urgent", "asap"))
    return "High" if is_urgent else "Normal"

# Placeholder function for entity extraction.
# It has its own name so that running this cell does not overwrite the
# spaCy-based extract_entities defined earlier in the notebook.
def extract_entities_placeholder(text):
    """Return the title-cased words of `text` joined by ", ".

    Returns the string "No entities found" when no word is title-cased.
    """
    # Replace with your entity extraction logic
    entities = [word for word in text.split() if word.istitle()]
    return ", ".join(entities) if entities else "No entities found"

# Combined function for Gradio interface.
# It has its own name so that the model-based analyze_ticket defined earlier
# in the notebook stays available after this cell runs.
def analyze_ticket_ui(text):
    """Run the three placeholder predictors on `text`.

    Returns:
        A (issue_type, urgency, entities) tuple of strings, in the order of
        the interface's three output boxes.
    """
    issue_type = predict_issue_type(text)
    urgency = predict_urgency(text)
    entities = extract_entities_placeholder(text)
    return issue_type, urgency, entities

# Build Gradio interface
iface = gr.Interface(
    fn=analyze_ticket_ui,
    inputs=gr.Textbox(lines=5, placeholder="Enter ticket text here..."),
    outputs=[
        gr.Textbox(label="Predicted Issue Type"),
        gr.Textbox(label="Predicted Urgency"),
        gr.Textbox(label="Extracted Entities"),
    ],
    title="Ticket Analyzer",
    description="Input raw ticket text to predict issue type, urgency, and extract entities."
)

iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c06078fc5af583078f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


