In [77]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [78]:
df=pd.read_csv('citizen_complaints.csv')
df

Unnamed: 0,ID,complaint_text,category,priority
0,1,There's been no water supply in our colony sin...,Water,High
1,2,Street lights on MG Road are not working for 3...,Electricity,Medium
2,3,Garbage has not been collected from Sector 7 f...,Sanitation,High
3,4,Water pressure is too low during mornings in G...,Water,Medium
4,5,Broken electric pole leaning dangerously near ...,Electricity,High
...,...,...,...,...
100,101,Traffic signals not working at major intersect...,Road Safety,High
101,102,Traffic signals not working at major intersect...,Road Safety,Medium
102,103,Traffic signals not working at major intersect...,Road Safety,Medium
103,104,Potholes on main highway causing accidents.,Road Safety,Medium


In [79]:
df.isna().sum()

ID                0
complaint_text    0
category          0
priority          0
dtype: int64

In [80]:
df=df[df['complaint_text'].notna()]
df

Unnamed: 0,ID,complaint_text,category,priority
0,1,There's been no water supply in our colony sin...,Water,High
1,2,Street lights on MG Road are not working for 3...,Electricity,Medium
2,3,Garbage has not been collected from Sector 7 f...,Sanitation,High
3,4,Water pressure is too low during mornings in G...,Water,Medium
4,5,Broken electric pole leaning dangerously near ...,Electricity,High
...,...,...,...,...
100,101,Traffic signals not working at major intersect...,Road Safety,High
101,102,Traffic signals not working at major intersect...,Road Safety,Medium
102,103,Traffic signals not working at major intersect...,Road Safety,Medium
103,104,Potholes on main highway causing accidents.,Road Safety,Medium


In [81]:
embedder=SentenceTransformer('all-miniLM-L6-v2')

In [82]:
def get_embeddings(texts):
    return embedder.encode(texts)

In [83]:
df['category_label']=df['category'].astype('category').cat.codes
cat_label_map=dict(enumerate(df['category'].astype('category').cat.categories))
cat_label_map                   

{0: 'Electricity',
 1: 'Noise Pollution',
 2: 'Road Safety',
 3: 'Sanitation',
 4: 'Water'}

In [84]:
X=get_embeddings(df['complaint_text'].tolist())
y=df['category_label']

In [85]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=42)

In [100]:
model_cat=LogisticRegression(max_iter=1000)
model_cat.fit(X_train,y_train)
y_pred_cat=model_cat.predict(X_test)
print("Category Accuracy:", accuracy_score(y_test,y_pred_cat))
print(classification_report(y_test,y_pred_cat,target_names=cat_label_map.values()))

Category Accuracy: 0.9523809523809523
                 precision    recall  f1-score   support

    Electricity       1.00      1.00      1.00         5
Noise Pollution       1.00      1.00      1.00         3
    Road Safety       1.00      1.00      1.00         4
     Sanitation       0.80      1.00      0.89         4
          Water       1.00      0.80      0.89         5

       accuracy                           0.95        21
      macro avg       0.96      0.96      0.96        21
   weighted avg       0.96      0.95      0.95        21



In [101]:
df['priority_label']= df['priority'].astype('category').cat.codes
pri_label_map=dict(enumerate(df['priority'].astype('category').cat.categories))

In [102]:
Xp=get_embeddings(df['complaint_text'].tolist())
yp=df['priority_label']

In [103]:
Xp_train,Xp_test,yp_train,yp_test=train_test_split(Xp,yp,test_size=0.2,random_state=42)

In [104]:
model_pri=LogisticRegression(max_iter=1000)
model_pri.fit(Xp_train,yp_train)
yp_pred=model_pri.predict(Xp_test)

print("Priority Accuracy:", accuracy_score(yp_test,yp_pred))
print(classification_report(yp_test,yp_pred,target_names=pri_label_map.values()))

Priority Accuracy: 0.42857142857142855
              precision    recall  f1-score   support

        High       0.38      1.00      0.55         6
         Low       0.00      0.00      0.00         5
      Medium       0.60      0.30      0.40        10

    accuracy                           0.43        21
   macro avg       0.33      0.43      0.32        21
weighted avg       0.39      0.43      0.35        21



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [105]:
import joblib

In [106]:
joblib.dump(model_cat, "models/category_model.pkl")
joblib.dump(cat_label_map, "models/category_model_labels.pkl")

joblib.dump(model_pri, "models/priority_model.pkl")
joblib.dump(pri_label_map, "models/priority_model_labels.pkl")

print("✅ Models saved successfully.")


✅ Models saved successfully.


In [107]:
##############################    TESTING 

In [130]:
def predict (complaint_text,model,label_map):
    emb=get_embeddings([complaint_text])
    pred=model.predict(emb)[0]
    return label_map[pred]

In [131]:
testing_complaints="No water in our area for 2 days and pipeline is broken"

In [None]:
results = {}

In [138]:
print(predict(testing_complaints, model_cat, cat_label_map))

Water


In [133]:
print(predict(testing_complaints,model_pri,pri_label_map))

Medium


In [139]:
results = {}

results['department'] = predict(testing_complaints, model_cat, cat_label_map)
results['priority'] = predict(testing_complaints, model_pri, pri_label_map)

print(f"Predicted department: {results['department']}")
print(f"Predicted priority: {results['priority']}")


Predicted department: Water
Predicted priority: Medium
