# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from nltk.stem import PorterStemmer


We will use PorterStemmer from nltk for stemming of the words and for rest of the processing we will use spacy.

In [2]:
stemmer=PorterStemmer()
nlp=spacy.load('en_core_web_sm')

Read the dataset.

In [3]:
data=pd.read_csv('/content/Evaluation-dataset.csv',)

In [4]:
data.columns=[f"col_{i}" for i in range(15)]

In [5]:
data.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14
0,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
1,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
2,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
3,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,
4,service was excellent. Only slight downside wa...,length of fitting positive,ease of booking positive,ease of booking negative,,,,,,,,,,,


We will convert our text into tokens using count vectorizer so for this we have to remove unnecessary stopwords,punctuation and numbers from the text so that we can reduce the dimensionality of our data.

In [6]:
# stop words removal, stemming lemmatization
def clean_text(x:str):
    x=x.lower()
    cleaned=[]
    doc=nlp(x)
    for token in doc:
        if(token.is_alpha and not token.is_punct and not token.is_stop):
            cleaned.append(stemmer.stem(token.lemma_))
    return " ".join(cleaned)

In [7]:
data['col_0']=data['col_0'].apply(clean_text)

In [8]:
data.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14
0,easi tyre select process competit price excel ...,garage service positive,value for money positive,,,,,,,,,,,,
1,easi use good valu money,value for money positive,,,,,,,,,,,,,
2,easi conveni arrang,ease of booking positive,,,,,,,,,,,,,
3,easi select tyre size arrang local fit price c...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,
4,servic excel slight downsid know exact time ga...,length of fitting positive,ease of booking positive,ease of booking negative,,,,,,,,,,,


In [9]:
data['col_1'].unique()

array(['garage service positive', 'value for money positive',
       'ease of booking positive', 'location positive',
       'length of fitting positive', 'ease of booking negative',
       'tyre quality positive', 'garage service negative',
       'wait time negative', 'delivery punctuality positive',
       'wait time positive', 'location negative', 'damage negative',
       'extra charges positive', 'value for money negative',
       'mobile fitter positive', 'advisor/agent service positive',
       'facilities positive', nan, 'change of time negative',
       'extra charges negative', 'late notice negative',
       'discounts positive', 'delivery punctuality negative',
       'refund not actioned positive', 'change of date negative',
       'booking confusion negative', 'advisoragent service positive',
       'advisor/agent service negative', 'advisoragent service negative',
       'incorrect tyres sent negative', 'tyre quality negative',
       'response time negative', 'refund po

here we are counting how many sentiments are there and their frequency.

In [11]:
sent = {}

for i in data.columns:
    if i!='col_0':
        z = data[i].dropna()
        for l in z:
            if l not in sent.keys():
                sent[l] = 1
            else:
                sent[l]+=1

In [12]:
print(sent)

{'garage service positive': 2030, 'value for money positive': 4780, 'ease of booking positive': 1186, 'location positive': 1063, 'length of fitting positive': 657, 'ease of booking negative': 227, 'tyre quality positive': 434, 'garage service negative': 423, 'wait time negative': 135, 'delivery punctuality positive': 453, 'wait time positive': 274, 'location negative': 27, 'damage negative': 127, 'extra charges positive': 85, 'value for money negative': 136, 'mobile fitter positive': 225, 'advisor/agent service positive': 202, 'facilities positive': 33, 'change of time negative': 42, 'extra charges negative': 46, 'late notice negative': 76, 'discounts positive': 115, 'delivery punctuality negative': 250, 'refund not actioned positive': 1, 'change of date negative': 277, 'booking confusion negative': 119, 'advisoragent service positive': 233, 'advisor/agent service negative': 47, 'advisoragent service negative': 125, 'incorrect tyres sent negative': 70, 'tyre quality negative': 40, 'res

we will remove those sentiments which has less than 30 frequency otherwise our model will be overfitted in this case because it is a large dataset.

In [13]:
# remove those values which has frequency less than 30
sentiments=[]
for key,values in sent.items():
    if values>30:
        sentiments.append(key)

In [14]:
print(sentiments)
print(len(sentiments))

['garage service positive', 'value for money positive', 'ease of booking positive', 'location positive', 'length of fitting positive', 'ease of booking negative', 'tyre quality positive', 'garage service negative', 'wait time negative', 'delivery punctuality positive', 'wait time positive', 'damage negative', 'extra charges positive', 'value for money negative', 'mobile fitter positive', 'advisor/agent service positive', 'facilities positive', 'change of time negative', 'extra charges negative', 'late notice negative', 'discounts positive', 'delivery punctuality negative', 'change of date negative', 'booking confusion negative', 'advisoragent service positive', 'advisor/agent service negative', 'advisoragent service negative', 'incorrect tyres sent negative', 'tyre quality negative', 'response time negative', 'no stock negative', 'length of fitting negative', 'response time positive']
33


we have to transform our dataset such that we can train one vs rest type of classifier to solve our problem so we will be having sentence and all the sentiments in the dataset as columns and if the sentence has a particular sentiments the we will mark it 1 otherwise 0 (just like one hot encoding).

In [15]:
def get_new_data(data:pd.DataFrame)->pd.DataFrame:
    new_data=pd.DataFrame(index=range(0,data.iloc[:,1:].shape[0]),columns=sentiments)
    for index,row in data.iloc[:,1:].iterrows():
        subthemes=row.dropna().tolist()
        for theme in subthemes:
            if theme in sentiments:
                new_data.loc[index,theme]=1

    new_data.fillna(0,inplace=True)
    new_data['sentence']=data['col_0']
    return new_data

In [16]:
new_data=get_new_data(data)

In [17]:
new_data.head()

Unnamed: 0,garage service positive,value for money positive,ease of booking positive,location positive,length of fitting positive,ease of booking negative,tyre quality positive,garage service negative,wait time negative,delivery punctuality positive,...,advisoragent service positive,advisor/agent service negative,advisoragent service negative,incorrect tyres sent negative,tyre quality negative,response time negative,no stock negative,length of fitting negative,response time positive,sentence
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi tyre select process competit price excel ...
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi use good valu money
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi conveni arrang
3,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi select tyre size arrang local fit price c...
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,servic excel slight downsid know exact time ga...


In [18]:
new_data.head()

Unnamed: 0,garage service positive,value for money positive,ease of booking positive,location positive,length of fitting positive,ease of booking negative,tyre quality positive,garage service negative,wait time negative,delivery punctuality positive,...,advisoragent service positive,advisor/agent service negative,advisoragent service negative,incorrect tyres sent negative,tyre quality negative,response time negative,no stock negative,length of fitting negative,response time positive,sentence
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi tyre select process competit price excel ...
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi use good valu money
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi conveni arrang
3,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi select tyre size arrang local fit price c...
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,servic excel slight downsid know exact time ga...


since we have some data in the CSV file which has no label so we will first filter out that data which has atleast one label.

In [19]:
# filter out row which has no subthemes
new_data1=new_data[new_data.iloc[:,:-1].sum(axis=1)!=0]

In [20]:
new_data1

Unnamed: 0,garage service positive,value for money positive,ease of booking positive,location positive,length of fitting positive,ease of booking negative,tyre quality positive,garage service negative,wait time negative,delivery punctuality positive,...,advisoragent service positive,advisor/agent service negative,advisoragent service negative,incorrect tyres sent negative,tyre quality negative,response time negative,no stock negative,length of fitting negative,response time positive,sentence
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi tyre select process competit price excel ...
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi use good valu money
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi conveni arrang
3,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi select tyre size arrang local fit price c...
4,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,servic excel slight downsid know exact time ga...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10124,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,issu tyre fit garag easi cost effect servic
10125,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,easi order tyre choic garag tyre fit
10126,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,order wrong tyre redact arrang collect suppli ...
10127,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,good experi time redact harborn tyre effici co...


In [21]:
new_data1.shape

(8067, 34)

Now we will try to fit different models to check how they perform for this classification task. So will start with logistic regression

In [22]:
# now make count vectorizer of sentence column
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
cv.fit(new_data1['sentence'])
X=cv.transform(new_data1['sentence'])

In [24]:
# now make train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,new_data1.iloc[:,:-1],test_size=0.2,random_state=42)

In [25]:
# now train the model fit a one vs rest classifier with logistic regression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
clf=OneVsRestClassifier(LogisticRegression())
clf.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# now make prediction
y_pred=clf.predict(X_test)

In [27]:
# now check the accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.4646840148698885

Logistic Regression has very poor performance this is worst than random guess.

In [28]:
# now check the f1 score
from sklearn.metrics import f1_score
f1_score(y_test,y_pred,average='micro')

0.713353115727003

In [29]:
# now fit multinoial naive bayes model
from sklearn.naive_bayes import MultinomialNB
clf=OneVsRestClassifier(MultinomialNB())
clf.fit(X_train,y_train)

In [30]:
# now make prediction
y_pred=clf.predict(X_test)

In [31]:
# now check the accuracy
accuracy_score(y_test,y_pred)

0.3252788104089219

Multinomial Naive bayes is also very poor.

In [32]:
# fit the model with random forest
from sklearn.ensemble import RandomForestClassifier
clf=OneVsRestClassifier(RandomForestClassifier())
clf.fit(X_train,y_train)

In [33]:
# now make prediction
y_pred=clf.predict(X_test)

In [34]:
# now check the accuracy
accuracy_score(y_test,y_pred)

0.42998760842627015

Random forest is also not so good.

In [35]:
# now fit svm model
from sklearn.svm import SVC
clf=OneVsRestClassifier(SVC(kernel='rbf'))
clf.fit(X_train,y_train)

In [36]:
# now make prediction
y_pred=clf.predict(X_test)

In [37]:
# now check the accuracy
accuracy_score(y_test,y_pred)

0.43742255266418834

In [38]:
# fit knn classifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
clf=OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train,y_train)

In [39]:
# now make prediction
y_pred=clf.predict(X_test)

In [40]:
# now check accuracy
accuracy_score(y_test,y_pred)

0.33209417596034696

so all the traditional ML models do not perform well on the given problem.
Now we will use nomic embedding to make embeddings of our data then we will use a neural network model for the classification.

In [50]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [51]:
! pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


In [76]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
sentences = ['search_query: What is TSNE?', 'search_query: Who is Laurens van der Maaten?']
embeddings = model.encode(sentences)
print(embeddings)




[[ 1.0951390e-02  5.7414643e-02 -1.1036426e-02 ...  3.5168876e-05
  -2.8092174e-02 -2.1599863e-02]
 [-1.3366988e-02  2.7091298e-02 -2.3367383e-02 ...  2.8799376e-02
  -1.0674731e-02  2.8820800e-02]]


In [53]:
embeddings=model.encode(new_data1['sentence'].tolist())

In [54]:
len(embeddings)

8067

In [55]:
len(new_data1)

8067

In [56]:
type(embeddings)

numpy.ndarray

In [57]:
embeddings.shape

(8067, 768)

In [58]:
import torch

### Define the model

In [59]:
# make a pytorch model for classification of single category
import torch.nn as nn

class SingleCategoryClassifier(nn.Module):
  def __init__(self,input_dim):
    super(SingleCategoryClassifier, self).__init__()
    self.linear = nn.Linear(input_dim, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.linear(x)
    x = self.sigmoid(x)
    return x

since our embedding size is 768 so we will use it as input shape for our model.

In [67]:
models = {sentiment:SingleCategoryClassifier(768) for sentiment in sentiments}

In [61]:
from sklearn.model_selection import train_test_split

we will fit one seperate model for each of the sentiment. so we have to fit 33 model for our 33 sentiments.

In [62]:
x_train,x_test,y_train,y_test=train_test_split(embeddings,new_data1[sentiments],test_size=0.2,random_state=42)

In [63]:
y_train.shape

(6453, 33)

In [68]:
loss_fn = nn.BCELoss()
optimizer = {sentiment:torch.optim.Adam(models[sentiment].parameters(), lr=0.01) for sentiment in sentiments}

In [69]:
# Define the loss function and optimize
# Prepare the training data
# Assume you have tensors called train_data and train_labels

# Training loop
for epoch in range(10):
  for sentiment, model in models.items():
    for x, y in zip(torch.Tensor(x_train), torch.Tensor(y_train[sentiment].to_numpy())):
      # Forward pass
      y_pred = model(x)

      # Calculate the loss
      y=torch.Tensor([y])
      loss = loss_fn(y_pred, y)

      # Backpropagation
      optimizer[sentiment].zero_grad()
      loss.backward()
      optimizer[sentiment].step()

      # Print the loss after each epoch
      print(f"model:{sentiment} Epoch {epoch+1}: Loss = {loss.item():.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
model:response time positive Epoch 10: Loss = 0.0003
model:response time positive Epoch 10: Loss = 0.0000
model:response time positive Epoch 10: Loss = 0.0001
model:response time positive Epoch 10: Loss = 0.0000
model:response time positive Epoch 10: Loss = 0.0000
model:response time positive Epoch 10: Loss = 0.0000
model:response time positive Epoch 10: Loss = 0.0005
model:response time positive Epoch 10: Loss = 0.0003
model:response time positive Epoch 10: Loss = 0.0041
model:response time positive Epoch 10: Loss = 0.0010
model:response time positive Epoch 10: Loss = 0.0036
model:response time positive Epoch 10: Loss = 0.0001
model:response time positive Epoch 10: Loss = 0.0006
model:response time positive Epoch 10: Loss = 0.0000
model:response time positive Epoch 10: Loss = 0.0001
model:response time positive Epoch 10: Loss = 0.0004
model:response time positive Epoch 10: Loss = 0.0001
model:response time positive Epoch

In [70]:
# calculate the accuracy scores of the models
for sentiment, model in models.items():
  y_pred = model(torch.Tensor(x_test))
  y_pred = (y_pred > 0.5).float()
  accuracy = (y_pred == torch.Tensor(y_test[sentiment].to_numpy())).float().mean()
  print(f"model:{sentiment} Accuracy = {accuracy.item():.4f}")

model:garage service positive Accuracy = 0.6103
model:value for money positive Accuracy = 0.5200
model:ease of booking positive Accuracy = 0.7618
model:location positive Accuracy = 0.7162
model:length of fitting positive Accuracy = 0.8842
model:ease of booking negative Accuracy = 0.9699
model:tyre quality positive Accuracy = 0.9258
model:garage service negative Accuracy = 0.9258
model:wait time negative Accuracy = 0.9719
model:delivery punctuality positive Accuracy = 0.9249
model:wait time positive Accuracy = 0.9693
model:damage negative Accuracy = 0.9828
model:extra charges positive Accuracy = 0.9846
model:value for money negative Accuracy = 0.9864
model:mobile fitter positive Accuracy = 0.9617
model:advisor/agent service positive Accuracy = 0.9581
model:facilities positive Accuracy = 0.9920
model:change of time negative Accuracy = 0.9932
model:extra charges negative Accuracy = 0.9871
model:late notice negative Accuracy = 0.9773
model:discounts positive Accuracy = 0.9671
model:deliver

Here we can see even the worst model is performing better than the traditional ML models.

In [73]:
# save the models
for sentiment, model in models.items():
  torch.save(model.state_dict(), f"/content/{sentiment}.pt")

RuntimeError: Parent directory /content/advisor does not exist.

In [74]:
# define function to predict the sentiment of a single sentence
def predict_sentiment(sentence):
  sentence=clean_text(sentence)
  embedding=model.encode([sentence])
  predictions = {sentiment:models[sentiment](torch.Tensor(embedding)).item() for sentiment in sentiments}
  return predictions

In [78]:
predict_sentiment("Excellent. Great service with a good selection of garages")

{'garage service positive': 0.6119422912597656,
 'value for money positive': 0.38660943508148193,
 'ease of booking positive': 0.1361989825963974,
 'location positive': 0.050796929746866226,
 'length of fitting positive': 0.004891643300652504,
 'ease of booking negative': 0.0002614726254250854,
 'tyre quality positive': 0.0016786883352324367,
 'garage service negative': 0.0008798635099083185,
 'wait time negative': 5.696243988495553e-06,
 'delivery punctuality positive': 0.0007102894596755505,
 'wait time positive': 0.0010868696263059974,
 'damage negative': 6.057675818738062e-06,
 'extra charges positive': 0.0001694254606263712,
 'value for money negative': 5.265363597573014e-06,
 'mobile fitter positive': 0.000917706813197583,
 'advisor/agent service positive': 0.0018844096921384335,
 'facilities positive': 1.7336446944682393e-08,
 'change of time negative': 8.421788152190857e-06,
 'extra charges negative': 2.225236812591902e-06,
 'late notice negative': 1.1202765790585545e-06,
 'dis