In [78]:
# Load data
# Dependencies first: data handling, model persistence, and the scikit-learn
# pieces used below (splitting, TF-IDF features, linear SVM).
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Read the noisy complaints file; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="car_complaints_very_noise_10000.csv")

# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,Complaint,Problem
0,kuch burning me car lag jaisa raha,Fuel pump weak
1,lol qarning sqellfihside eatin,Faulty spark plug
2,pls engine oda sound mass ah varuthu,Fuel pump weak
3,engine la konjam smell varudhu,Weak battery
4,rnnine makes nqvse ranvomlr,Fuel pump weak
...,...,...
9995,Car stops suddenly,Fuel pump weak
9996,?? Sometimes I feel sleepy while working,Fuel pump weak
9997,idk engine la konjam smell varudhu,Loose wheel alignment
9998,Burning smell inside cabin and also Strange no...,Loose wheel alignment


In [79]:
# Complaint text is the model input; the `Problem` column is the label to predict.
X, y = df["Complaint"], df["Problem"]

In [80]:
# Vectorizer
# TF-IDF features over the complaint text, with scikit-learn's built-in
# English stop-word list removed.
vectorizer = TfidfVectorizer(
    stop_words="english",
)
# Learn the vocabulary/IDF weights and encode every complaint in one pass.
X_vec = vectorizer.fit_transform(raw_documents=X)

In [81]:
# Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible across runs.
# NOTE(review): X_vec was produced by fitting the vectorizer on the full corpus,
# so vocabulary/IDF statistics from the held-out rows are already baked into the
# training features. Consider splitting the raw text first - confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec,
    y,
    random_state=42,
    test_size=0.2,
)


In [82]:
# Model
# Linear SVM with default hyper-parameters; `fit` returns the estimator itself,
# so construction and training can be chained.
svm = LinearSVC().fit(X_train, y_train)
svm  # last expression: show the fitted estimator as the cell output

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [83]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.119
                       precision    recall  f1-score   support

          AC gas leak       0.11      0.09      0.10       217
  Dirty throttle body       0.13      0.20      0.16       195
    Faulty spark plug       0.12      0.15      0.13       259
       Fuel pump weak       0.13      0.14      0.13       236
Loose wheel alignment       0.14      0.11      0.12       226
             Oil leak       0.11      0.07      0.09       214
    Unbalanced wheels       0.12      0.09      0.10       214
         Weak battery       0.13      0.12      0.12       220
      Worn brake pads       0.09      0.09      0.09       219

             accuracy                           0.12      2000
            macro avg       0.12      0.12      0.12      2000
         weighted avg       0.12      0.12      0.12      2000



In [84]:
# Second feature configuration for the same complaints: add bigrams and prune
# very rare terms, then retrain the classifier on the NEW features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),      # use unigrams and bigrams as features
    min_df=3                 # ignore terms that occur in fewer than 3 complaints
)

X_vec = vectorizer.fit_transform(X)

# Re-split the freshly vectorized matrix. Without this step X_train / X_test
# still hold the features produced by the previous vectorizer, so the settings
# above would never reach the model and `vectorizer` would no longer match the
# feature space `svm` was trained on. The same seed and test size reproduce the
# earlier row partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

svm = LinearSVC(C=2)   # larger C = weaker regularization than the default C=1.0
svm.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [86]:
# Switch to the second complaints file; `df`, `X` and `y` are rebound to it.
df = pd.read_csv(filepath_or_buffer="car_complaints_10000.csv")

# Complaint text -> X, problem label -> y.
X, y = df["Complaint"], df["Problem"]

In [87]:
# VECTORIZATION
# ----------------------------
# TF-IDF over unigrams and bigrams, minus English stop words; terms that occur
# in fewer than 3 complaints are dropped from the vocabulary.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=3)

# Fit on the complaint text and encode it in the same call.
X_vec = vectorizer.fit_transform(raw_documents=X)

In [88]:
# TRAIN / TEST SPLIT
# ----------------------------
# 80/20 hold-out with a fixed seed so the partition is reproducible.
# NOTE(review): the vectorizer was fitted on all of X before this split, so the
# held-out rows contributed to the vocabulary/IDF weights - confirm whether
# that is acceptable for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, random_state=42, test_size=0.2)

In [89]:
# TRAIN MODEL
# ----------------------------
# Linear SVM with C=2; `fit` returns the estimator, so build and train in one step.
svm = LinearSVC(C=2).fit(X_train, y_train)
svm  # last expression: show the fitted estimator as the cell output

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [90]:
# EVALUATE
# ----------------------------
# Predict the held-out rows, then report overall accuracy and per-class metrics.
pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))
print(classification_report(y_true=y_test, y_pred=pred))

Accuracy: 1.0
                       precision    recall  f1-score   support

          AC gas leak       1.00      1.00      1.00       208
  Dirty throttle body       1.00      1.00      1.00       229
    Faulty spark plug       1.00      1.00      1.00       207
       Fuel pump weak       1.00      1.00      1.00       220
Loose wheel alignment       1.00      1.00      1.00       241
             Oil leak       1.00      1.00      1.00       228
    Unbalanced wheels       1.00      1.00      1.00       232
         Weak battery       1.00      1.00      1.00       218
      Worn brake pads       1.00      1.00      1.00       217

             accuracy                           1.00      2000
            macro avg       1.00      1.00      1.00      2000
         weighted avg       1.00      1.00      1.00      2000



In [91]:
def predict_problem(text):
    """Predict the problem label for a single complaint string.

    Uses the notebook-level ``vectorizer`` to encode ``text`` and the
    notebook-level ``svm`` to classify it.

    Parameters
    ----------
    text : str
        One free-text complaint.

    Returns
    -------
    The first (and only) element of ``svm.predict`` for that complaint.
    """
    # The vectorizer expects an iterable of documents, so wrap the single string.
    features = vectorizer.transform([text])
    predictions = svm.predict(features)
    return predictions[0]

In [92]:
# Example test
# Smoke-check the helper on one hand-written complaint.
print("\nTest Prediction:")
sample_complaint = "Car stalls when slowing down"
print(predict_problem(sample_complaint))


Test Prediction:
Dirty throttle body


In [93]:
# `vec` must refer to the vectorizer that was fitted on the complaints. A fresh
# TfidfVectorizer() has no learned vocabulary and different settings, so it
# could not produce features compatible with `svm`.
vec = vectorizer

In [94]:
# Persist the trained classifier together with the FITTED vectorizer it was
# trained with. Saving a newly constructed TfidfVectorizer instead would store
# an object with no vocabulary, which cannot transform text after reloading.
joblib.dump(svm, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']