# Import necessary libraries
First, we import the necessary libraries including pandas and TensorFlow.

In [1]:
import pandas as pd
import tensorflow as tf




# Load and preview the dataset
We load the dataset from a CSV file and preview the first few rows.

In [206]:
df = pd.read_csv('https://raw.githubusercontent.com/shreerampawar/Projects/main/Subtheme%20Sentiment/Evaluation-dataset.csv', header = None)

# Check the shape of the dataset
We check the number of rows and columns in the dataset.

In [207]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,


In [4]:
df.shape

(10132, 15)

# Check for missing values
We check for any missing values in the dataset.


In [5]:
df.isna().sum()

0         0
1      2003
2      5893
3      8594
4      9687
5     10004
6     10096
7     10119
8     10124
9     10128
10    10131
11    10131
12    10131
13    10131
14    10131
dtype: int64

# Fill missing values
We fill any missing values with an empty string.


In [6]:
df.fillna('', inplace=True)

In [7]:
df.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')

# Assign column names
We create a list of column names based on the number of columns and assign these names to the DataFrame.


In [8]:
num_columns = len(df.columns)
column_names = [f'subsen_{i}' for i in range(1, num_columns)]
# Assign column names
df.columns = ['review_text'] + column_names
# df.columns = column_names
df.columns.values[0] = 'comment_text'

# Count unique labels
We count the number of unique labels in the dataset.


In [9]:
unique_labels = df.iloc[:, 1:14].stack().unique()
num_unique_labels = len(unique_labels)
print("Number of unique labels:", num_unique_labels)
label_counts = df.iloc[:, 1:14].stack().value_counts()


Number of unique labels: 107


# Count the occurrences of each label
We count how many times each label appears in the dataset.


In [11]:
print("Count of all labels:")
for label, count in label_counts.items():
    print(f"{label}: {count}")
# Count the number of non-empty labels for each entry, excluding the placeholder value
df['num_labels'] = df.iloc[:, 1:].apply(lambda row: row[row != ''].count(), axis=1)


Count of all labels:
: 117172
value for money positive: 4780
garage service positive: 2031
ease of booking positive: 1187
location positive: 1063
length of fitting positive: 657
delivery punctuality positive: 453
tyre quality positive: 434
garage service negative: 423
change of date negative: 277
wait time positive: 274
delivery punctuality negative: 250
advisoragent service positive: 233
ease of booking negative: 227
mobile fitter positive: 225
advisor/agent service positive: 202
value for money negative: 136
wait time negative: 135
damage negative: 127
advisoragent service negative: 125
booking confusion negative: 119
discounts positive: 115
length of fitting negative: 109
extra charges positive: 85
response time negative: 77
late notice negative: 76
incorrect tyres sent negative: 70
advisor/agent service negative: 47
extra charges negative: 46
change of time negative: 42
no stock negative: 42
tyre quality negative: 40
response time positive: 34
facilities positive: 33
mobile fitter 

# Count of entries per label
We count the number of entries per label.


In [12]:
label_counts = df['num_labels'].value_counts().sort_index()
print("Number of Labels : Number of Entries")
for num_labels, count in label_counts.items():
    print(f"{num_labels} : {count} entries")


Number of Labels : Number of Entries
0 : 2003 entries
1 : 3890 entries
2 : 2701 entries
3 : 1093 entries
4 : 316 entries
5 : 93 entries
6 : 23 entries
7 : 5 entries
8 : 5 entries
9 : 2 entries
14 : 1 entries


In [14]:
df[df['num_labels'] == 14]

Unnamed: 0,comment_text,subsen_1,subsen_2,subsen_3,subsen_4,subsen_5,subsen_6,subsen_7,subsen_8,subsen_9,subsen_10,subsen_11,subsen_12,subsen_13,subsen_14,num_labels
384,"test review, should be deleted",refund not actioned positive,refund timescale positive,mobile fitter didn't arrive positive,change of time positive,discount not applied positive,tyre agedot code positive,failed payment positive,late notice positive,facilities positive,incorrect tyres sent positive,call wait time positive,refund positive,no stock positive,balancing positive,14


# Remove entries with 14 labels
We remove the entry that have 14 labels, as mentioned in the specific row.


In [16]:
df.drop(384, inplace = True)

In [17]:
df.shape

(10131, 16)

In [18]:
df.columns


Index(['comment_text', 'subsen_1', 'subsen_2', 'subsen_3', 'subsen_4',
       'subsen_5', 'subsen_6', 'subsen_7', 'subsen_8', 'subsen_9', 'subsen_10',
       'subsen_11', 'subsen_12', 'subsen_13', 'subsen_14', 'num_labels'],
      dtype='object')

In [52]:
df

Unnamed: 0,comment_text,subsen_1,subsen_2,subsen_3,subsen_4,subsen_5,subsen_6,subsen_7,subsen_8,subsen_9,subsen_10,subsen_11,subsen_12,subsen_13,subsen_14,num_labels
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,,2
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,,2
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,,1
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,,1
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10127,"I ordered the wrong tyres, however [REDACTED] ...",refund positive,delivery punctuality positive,refund timescale positive,,,,,,,,,,,,3
10128,"Good experience, first time I have used [REDAC...",length of fitting positive,,,,,,,,,,,,,,1
10129,"I ordered the tyre I needed on line, booked a ...",location positive,delivery punctuality positive,length of fitting positive,value for money positive,,,,,,,,,,,4
10130,Excellent service from point of order to fitti...,,,,,,,,,,,,,,,0


# Convert labels to binary format
We use MultiLabelBinarizer to convert the labels to binary format due to the multi-label nd non sequential structure of the labels


In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

# Extract labels from the DataFrame
labels = df[['subsen_1', 'subsen_2', 'subsen_3', 'subsen_4', 'subsen_5', 'subsen_6', 'subsen_7', 'subsen_8', 'subsen_9', 'subsen_10', 'subsen_11', 'subsen_12', 'subsen_13', 'subsen_14']].apply(lambda x: x.dropna().tolist(), axis=1)

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the labels
binary_labels = mlb.fit_transform(labels)

# Get the unique labels
unique_labels = mlb.classes_

In [23]:
binary_labels.shape

(10131, 99)

In [167]:
binary_labels

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [51]:
unique_labels

array(['', ' Cheapest price',
       ' Garage was quick & efficient with fitting of tyres',
       ' Great garage fitted them',
       ' ability to browse through various makes of tyres before making a decision',
       ' and an excellent service from the garage who fitted the tyre.',
       ' and both times have been good experiences.',
       ' cheaper than anything else I have found by some way.',
       ' clean reception area and free coffee',
       ' easily navigable web site',
       ' easy to browse selection of tyres before making a choice',
       ' faster than dealerships to arrange supply and partnership with ATS works well .',
       ' fitting took a long time.', ' good communication',
       ' good local fitting services', ' good price', ' good prices',
       ' good range and competitive prices on website', ' good service',
       ' great price.', ' hassle free',
       ' it feels more transparent than buying from a mechanic based on their opinion.',
       ' keep it up!

In [26]:
df.columns

Index(['comment_text', 'subsen_1', 'subsen_2', 'subsen_3', 'subsen_4',
       'subsen_5', 'subsen_6', 'subsen_7', 'subsen_8', 'subsen_9', 'subsen_10',
       'subsen_11', 'subsen_12', 'subsen_13', 'subsen_14', 'num_labels'],
      dtype='object')

In [150]:
X = df[df.columns[0]]
y = binary_labels

In [151]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

# Tokenize the text
We tokenize the text data to prepare it for model training.

In [152]:
from tensorflow.keras.layers import TextVectorization
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X.values)

# Get the total number of unique words
total_unique_words = len(tokenizer.word_index)
print("Total unique words:", total_unique_words)

Total unique words: 7237


In [153]:
MAX_FEATURES = 10000
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=200, output_mode='int')
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)

# Create a TensorFlow dataset
We create a TensorFlow dataset from the vectorized text and binary labels.


In [154]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(11000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [155]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Build the model
We build a sequential neural network model with embedding, LSTM, and dense layers.


In [156]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [157]:
model = Sequential()
# Create the embedding layer
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(99, activation='sigmoid'))

In [158]:
model.compile(loss='BinaryCrossentropy', optimizer='adam', metrics = ['accuracy'])

# Train the model
We train the model on the training dataset and validate it on the validation dataset.


In [159]:
model.fit(train, epochs=3, validation_data=val)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2b804831b70>

# Evaluate the model
We evaluate the model on the test dataset.


In [160]:
model.evaluate(test)



[0.03194938227534294, 1.0]

# Save and load the model
We save the trained model and then load it for prediction.


In [175]:
model.save('subtheme_sentiment.h5')

  saving_api.save_model(


In [176]:
mdl = tf.keras.models.load_model('subtheme_sentiment.h5')

# Make a prediction
We make a prediction on a sample input and print the rounded results.


In [196]:
input_str = vectorizer("Great price and easy to use")

In [197]:
res = mdl.predict(np.expand_dims(input_str,0))



In [198]:
res

array([[9.99999404e-01, 9.04504759e-06, 2.52163281e-05, 1.88177073e-05,
        9.55528521e-05, 4.99582493e-05, 2.17402539e-05, 6.89228618e-05,
        2.98056079e-07, 7.46050937e-05, 3.76551498e-06, 6.60081350e-05,
        2.44110088e-05, 2.45509946e-05, 5.49552133e-05, 2.53012640e-05,
        8.14601153e-05, 7.28254554e-06, 1.31625336e-06, 2.32204493e-05,
        1.70670810e-05, 3.83730867e-06, 6.67031281e-06, 2.40793379e-05,
        1.25777879e-05, 6.65243624e-06, 2.74129252e-06, 6.97393043e-05,
        4.26817587e-06, 4.37325895e-07, 1.78307291e-05, 9.41745952e-07,
        3.94700546e-05, 2.70587207e-05, 1.24826229e-05, 6.12002186e-05,
        3.95770940e-06, 2.56510139e-05, 2.24398173e-06, 1.13203541e-05,
        8.91313175e-05, 1.18354510e-05, 7.76586830e-07, 6.51616938e-05,
        3.99959134e-07, 4.19400306e-03, 3.24039975e-05, 2.04089354e-03,
        1.56000504e-04, 2.43248887e-05, 2.34287581e-05, 2.74571576e-05,
        1.09077573e-05, 1.04475812e-05, 9.98140968e-06, 6.803961

In [199]:
np.round(res).astype(int)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [201]:
res = mdl.predict(np.expand_dims(input_str,0))
rounded_res = np.round(res)
predicted_label_text = mlb.inverse_transform(rounded_res)[0]

# Exclude the first element
predicted_labels_except_first = predicted_label_text[1:]

# Format the output as a string
output_str = ", ".join(predicted_labels_except_first)

print(output_str)

value for money positive


# Make another prediction
We make another prediction on a sample input and print the rounded results.


In [202]:
input_str = vectorizer("Competitively priced and easy to use fitting centre near me who were very professional.")
res = mdl.predict(np.expand_dims(input_str,0))
rounded_res = np.round(res)
predicted_label_text = mlb.inverse_transform(rounded_res)[0]
predicted_labels_except_first = predicted_label_text[1:]
output_str = ", ".join(predicted_labels_except_first)
print(output_str)

location positive, value for money positive


# Summary and Conclusion
Thus we see the desired outputs in the format of 'subtheme sentiment'. This notebook covers the process of loading and preprocessing a dataset, building and training a machine learning model for sentiment analysis, and testing the model with sample inputs. Despite achieving high accuracy during training and evaluation, the model's performance on individual predictions suggests areas for improvement and issues which could be well resolved using transfer learning. Further steps could include refining/tuning the model, improving the data preprocessing steps, and conducting a more thorough analysis of the model's predictions.