In [1]:


import pandas as pd
from src.data.ingestion import load_tickets

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [45]:
json_data = load_tickets("../data/raw/support_tickets.json")

In [46]:
import pandas as pd

df_data = pd.DataFrame([ticket.model_dump() for ticket in json_data[0]])
df_data.head()


Unnamed: 0,ticket_id,created_at,updated_at,customer_id,customer_tier,organization_id,product,product_version,product_module,category,...,escalated,transferred_count,satisfaction_score,resolution_helpful,tags,environment,business_impact,affected_users,language,region
0,TK-2024-000001,2023-11-02 12:30:10+00:00,2023-11-02 15:30:46+00:00,CUST-02387,starter,ORG-234,CloudBackup Enterprise,4.5.10,encryption_layer,Feature Request,...,True,0,4,True,"[error, api, integration, timeout, bug]",production,high,222,de,APAC
1,TK-2024-000002,2023-02-10 16:31:31+00:00,2023-02-12 09:59:43+00:00,CUST-03724,free,ORG-435,DataSync Pro,4.1.11,data_validator,Account Management,...,True,3,4,True,"[database, bug, authentication, data, error]",production,medium,18,ja,MEA
2,TK-2024-000003,2024-09-30 07:43:47+00:00,2024-09-30 11:58:47+00:00,CUST-00600,enterprise,ORG-208,API Gateway,3.1.4,request_router,Feature Request,...,False,3,4,True,"[configuration, error, sync, performance]",staging,medium,591,ja,
3,TK-2024-000004,2024-11-27 18:17:26+00:00,2024-11-30 22:07:50+00:00,CUST-04795,starter,ORG-231,CloudBackup Enterprise,3.4.15,backup_service,Account Management,...,False,2,3,False,"[authentication, api, performance]",production,critical,34,en,LATAM
4,TK-2024-000005,2024-03-09 15:41:02+00:00,2024-03-10 10:53:38+00:00,CUST-01101,starter,ORG-241,StreamProcessor,2.8.8,monitoring,Feature Request,...,False,2,5,True,"[data, integration, security, authentication]",development,medium,325,de,MEA


In [47]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110000 entries, 0 to 109999
Data columns (total 35 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   ticket_id           110000 non-null  object             
 1   created_at          110000 non-null  datetime64[ns, UTC]
 2   updated_at          110000 non-null  datetime64[ns, UTC]
 3   customer_id         110000 non-null  object             
 4   customer_tier       110000 non-null  object             
 5   organization_id     110000 non-null  object             
 6   product             110000 non-null  object             
 7   product_version     110000 non-null  object             
 8   product_module      110000 non-null  object             
 9   category            110000 non-null  object             
 10  subcategory         110000 non-null  object             
 11  priority            110000 non-null  object             
 12  severity        

In [48]:
df_data['category'].unique()

array(['Feature Request', 'Account Management', 'Security', 'Data Issue',
       'Technical Issue'], dtype=object)

In [49]:
df_data[df_data['category'].isnull()]

Unnamed: 0,ticket_id,created_at,updated_at,customer_id,customer_tier,organization_id,product,product_version,product_module,category,...,escalated,transferred_count,satisfaction_score,resolution_helpful,tags,environment,business_impact,affected_users,language,region


In [50]:
df_data['category'].value_counts()

category
Security              22085
Data Issue            22050
Feature Request       22047
Account Management    21997
Technical Issue       21821
Name: count, dtype: int64

In [51]:
df_data['subcategory'].unique()

array(['Documentation', 'Upgrade', 'New Feature', 'API', 'Compliance',
       'Corruption', 'Enhancement', 'Sync Error', 'UI/UX',
       'Import/Export', 'Bug', 'Access Control', 'Vulnerability',
       'Authentication', 'Billing', 'Integration', 'Configuration',
       'Performance', 'Encryption', 'License', 'Authorization',
       'Compatibility', 'Subscription', 'Validation', 'Data Loss'],
      dtype=object)

In [65]:
from typing import List, Dict, Any, Tuple

def prepare_classification_dataset(records: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[str]]:
    data = []
    for r in records:
        if hasattr(r, "model_dump"):
            r = r.model_dump()
        elif not isinstance(r, dict):
            r = dict(r)
        
        text = f"{r.get('subject','')} \n {r.get('description','')} \n {r.get('error_logs','')}"
        data.append({
            "text": text,
            # a few easy metadata signals
            "product": r.get("product", ""),
            "product_module": r.get("product_module", ""),
            "priority": r.get("priority", ""),
            "channel": r.get("channel", ""),
            "customer_tier": r.get("customer_tier", ""),
            "region": r.get("region", ""),
            "category": r.get("category","")
        })
        
    return data

In [66]:
classification_data= prepare_classification_dataset(json_data[0])
classification_data = pd.DataFrame(classification_data)
classification_data.head()

Unnamed: 0,text,product,product_module,priority,channel,customer_tier,region,category
0,Request: Add bulk operation support to CloudBa...,CloudBackup Enterprise,encryption_layer,critical,portal,starter,APAC,Feature Request
1,License upgrade needed for DataSync Pro \n We ...,DataSync Pro,data_validator,medium,chat,free,MEA,Account Management
2,Request: Add bulk operation support to API Gat...,API Gateway,request_router,high,phone,enterprise,,Feature Request
3,License upgrade needed for CloudBackup Enterpr...,CloudBackup Enterprise,backup_service,low,portal,starter,LATAM,Account Management
4,Request: Add bulk operation support to StreamP...,StreamProcessor,monitoring,high,slack,starter,MEA,Feature Request


In [80]:
classification_data.head()

Unnamed: 0,text,product,product_module,priority,channel,customer_tier,region,category
0,Request: Add bulk operation support to CloudBa...,CloudBackup Enterprise,encryption_layer,critical,portal,starter,APAC,Feature Request
1,License upgrade needed for DataSync Pro \n We ...,DataSync Pro,data_validator,medium,chat,free,MEA,Account Management
2,Request: Add bulk operation support to API Gat...,API Gateway,request_router,high,phone,enterprise,,Feature Request
3,License upgrade needed for CloudBackup Enterpr...,CloudBackup Enterprise,backup_service,low,portal,starter,LATAM,Account Management
4,Request: Add bulk operation support to StreamP...,StreamProcessor,monitoring,high,slack,starter,MEA,Feature Request


In [76]:
for col in ['product', 'product_module', 'priority', 'channel',
       'customer_tier', 'region',]:
    print(f"Unique categories in {col} feature: {classification_data[col].unique()}\n")



Unique categories in product feature: ['CloudBackup Enterprise' 'DataSync Pro' 'API Gateway' 'StreamProcessor'
 'Analytics Dashboard']

Unique categories in product_module feature: ['encryption_layer' 'data_validator' 'request_router' 'backup_service'
 'monitoring' 'batch_processor' 'data_aggregator' 'cache_layer'
 'scheduler' 'compression_engine' 'export_module' 'visualization'
 'rate_limiter' 'auth_service' 'restore_module' 'report_builder'
 'api_connector' 'sync_engine' 'event_handler' 'error_handler']

Unique categories in priority feature: ['critical' 'medium' 'high' 'low']

Unique categories in channel feature: ['portal' 'chat' 'phone' 'slack' 'email' 'api']

Unique categories in customer_tier feature: ['starter' 'free' 'enterprise' 'premium' 'professional']

Unique categories in region feature: ['APAC' 'MEA' 'NA' 'LATAM' 'EU']



In [83]:
# split: train/val/test = 70/15/15 (via 70/30 then 50/50 of remaining)
X_train, X_tmp, y_train, y_tmp = train_test_split(
    classification_data.drop('category',axis=1), classification_data['category'], test_size=0.30, random_state=42, stratify=classification_data['category']
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, random_state=42, stratify=y_tmp
)

In [84]:
X_train.shape

(77000, 7)

In [89]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

TEXT_COL = "text"
CAT_COLS = [
    "product",
    "product_module",
    "priority",
    "channel",
    "customer_tier",
    "region",
]

preprocess = ColumnTransformer(
    transformers=[
        (
            "text",
            TfidfVectorizer(
                max_features=80_000,
                ngram_range=(1, 2),
                min_df=2,
            ),
            TEXT_COL,   # pandas column name
        ),
        (
            "cat",
            OneHotEncoder(
                handle_unknown="ignore",
                
            ),
            CAT_COLS,   # list of pandas columns
        ),
    ],
    remainder="drop"   # ignore other dataframe columns
)


In [90]:
preprocess_pipe = Pipeline([
    ("preprocess", preprocess)
])

X_transformed = preprocess_pipe.fit_transform(X_train)
X_val_transformed = preprocess_pipe.transform(X_val)
X_test_transformed = preprocess_pipe.transform(X_test)



In [95]:
X_train.shape

(77000, 7)

In [94]:
print(X_transformed.shape)


(77000, 29916)


In [93]:
preprocess_pipe.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('text',
                                                  TfidfVectorizer(max_features=80000,
                                                                  min_df=2,
                                                                  ngram_range=(1,
                                                                               2)),
                                                  'text'),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['product', 'product_module',
                                                   'priority', 'channel',
                                                   'customer_tier',
                                                   'region'])]))])>

In [98]:
# logistric_model = LogisticRegression(
#         max_iter=2000,
#         n_jobs=None,          # keep portable across OS
#         class_weight="balanced"  # helps if categories are imbalanced
#     )

# logistric_model.fit(X=X_transformed,y=y_train)
# # Validation
# val_pred = logistric_model.predict(X_val_transformed)
# val_f1 = f1_score(y_val, val_pred, average="weighted")
# print("\n--- VALIDATION ---")
# print("Weighted F1:", round(val_f1, 4))
# print(classification_report(y_val, val_pred, digits=3))

# # Test
# test_pred = logistric_model.predict(X_test_transformed)
# test_f1 = f1_score(y_test, test_pred, average="weighted")
# print("\n--- TEST ---")
# print("Weighted F1:", round(test_f1, 4))
# print(classification_report(y_test, test_pred, digits=3))
