# Import Module

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

# Loading the Dataset

In [20]:
df = pd.read_csv('C:\\Users\\Sakshi Rathore\\Downloads\\Evaluation-dataset.csv')
df.head()

Unnamed: 0,"Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.",garage service positive,ease of booking positive,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
1,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
2,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
3,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,
4,service was excellent. Only slight downside wa...,length of fitting positive,ease of booking positive,ease of booking negative,,,,,,,,,,,


In [21]:
# datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10131 entries, 0 to 10130
Data columns (total 15 columns):
 #   Column                                                                                                                                                                                                      Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                      --------------  ----- 
 0   Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.  10131 non-null  object
 1   garage service positive                                                                                                                                                                       

# Preprocessing the dataset

In [22]:
# removes pattern in the input text
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [23]:
df.head()

Unnamed: 0,"Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.",garage service positive,ease of booking positive,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
1,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
2,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
3,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,
4,service was excellent. Only slight downside wa...,length of fitting positive,ease of booking positive,ease of booking negative,,,,,,,,,,,


In [26]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Sakshi
[nltk_data]     Rathore\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sakshi
[nltk_data]     Rathore\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Preprocess text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [28]:
# Print the number of columns
print(f"Number of columns: {data.shape[1]}")
print("First few rows of the dataset:")
print(data.head())

Number of columns: 15
First few rows of the dataset:
  Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.  \
0  Easy Tyre Selection Process, Competitive Prici...                                                                                                                                                           
1         Very easy to use and good value for money.                                                                                                                                                           
2              Really easy and convenient to arrange                                                                                                                                                           
3  It was so easy to select tyre sizes and arrang...                                                               

In [29]:
# Based on the structure, assign column names appropriately.
# Assuming the first column is 'review', subsequent pairs of columns are 'aspect' and 'sentiment'.
num_columns = data.shape[1]
column_names = ['review']
for i in range(1, num_columns, 2):
    column_names.append(f'aspect_{i//2 + 1}')
    column_names.append(f'sentiment_{i//2 + 1}')

In [30]:
# Assign column names
data.columns = column_names

In [31]:
# Print the updated DataFrame with new column names
print("DataFrame with new column names:")
print(data.head())

DataFrame with new column names:
                                              review  \
0  Easy Tyre Selection Process, Competitive Prici...   
1         Very easy to use and good value for money.   
2              Really easy and convenient to arrange   
3  It was so easy to select tyre sizes and arrang...   
4  service was excellent. Only slight downside wa...   

                     aspect_1               sentiment_1  \
0     garage service positive  value for money positive   
1    value for money positive                       NaN   
2    ease of booking positive                       NaN   
3           location positive  value for money positive   
4  length of fitting positive  ease of booking positive   

                   aspect_2 sentiment_2 aspect_3 sentiment_3 aspect_4  \
0                       NaN         NaN      NaN         NaN      NaN   
1                       NaN         NaN      NaN         NaN      NaN   
2                       NaN         NaN      NaN        

In [32]:
# Now, extract and preprocess the 'review' column
texts = data['review'].apply(preprocess_text).tolist()

In [33]:
# Prepare a long-form DataFrame for aspect and sentiment pairs
aspects, sentiments = [], []
for i in range(1, num_columns, 2):
    aspects.extend(data[f'aspect_{i//2 + 1}'].dropna().tolist())
    sentiments.extend(data[f'sentiment_{i//2 + 1}'].dropna().tolist())

In [34]:
# Assuming we have a method to vectorize texts and classify aspects and sentiments
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

In [40]:
# Prepare aspect and sentiment labels
aspects = data.filter(like='aspect').values.flatten()
sentiments = data.filter(like='sentiment').values.flatten()

In [44]:
# Prepare aspect labels
aspects = data.filter(like='aspect').values.flatten()

In [46]:
print("Number of missing values in aspects:", pd.Series(aspects).isnull().sum())

Number of missing values in aspects: 61104


In [54]:
print("Sample aspects:", aspects[:5])

Sample aspects: ['garage service positive' nan nan nan nan]


In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [59]:
# Define the prediction function
def predict_subthemes(text):
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    
    predicted_aspect = aspect_model.predict(vectorized_text)[0]
    predicted_sentiment = sentiment_model.predict(vectorized_text)[0]
    
    return predicted_aspect, predicted_sentiment

In [63]:
# Debug print for X
print("Shape of X after vectorization:", X.shape)

# Prepare aspect and sentiment labels ensuring alignment
aspects = []
sentiments = []

for i in range(data.shape[0]):
    for j in range(1, num_columns, 2):
        aspect = data.iloc[i, j]
        sentiment = data.iloc[i, j + 1]
        if pd.notna(aspect) and pd.notna(sentiment):
            aspects.append(aspect)
            sentiments.append(sentiment)

# Convert aspects and sentiments to numpy arrays
import numpy as np
aspects = np.array(aspects)
sentiments = np.array(sentiments)

# Debug print for aspects and sentiments
print("Sample aspects:", aspects[:5])
print("Sample sentiments:", sentiments[:5])

# Ensure X, aspects, and sentiments have the correct shapes
print("Shape of X:", X.shape)
print("Number of elements in aspects array:", aspects.shape)
print("Number of elements in sentiments array:", sentiments.shape)

# Aspect Classification
X_train_aspect, X_test_aspect, y_train_aspect, y_test_aspect = train_test_split(X, aspects, test_size=0.2, random_state=42)
aspect_model = MultinomialNB()
aspect_model.fit(X_train_aspect, y_train_aspect)

# Sentiment Classification
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(X, sentiments, test_size=0.2, random_state=42)
sentiment_model = MultinomialNB()
sentiment_model.fit(X_train_sentiment, y_train_sentiment)

# Define the prediction function
def predict_subthemes(text):
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text])
    
    predicted_aspect = aspect_model.predict(vectorized_text)[0]
    predicted_sentiment = sentiment_model.predict(vectorized_text)[0]
    
    return predicted_aspect, predicted_sentiment

# Example review for prediction
sample_review = "One tyre went missing, so there was a delay to get the two tyres fitted. The way garage dealt with it was fantastic."
sentences = sent_tokenize(sample_review)

for sentence in sentences:
    aspect, sentiment = predict_subthemes(sentence)
    print(f"Sentence: {sentence}")
    print(f"Aspect: {aspect}, Sentiment: {sentiment}")


Shape of X after vectorization: (10131, 7884)
Sample aspects: ['garage service positive' 'location positive'
 'length of fitting positive' 'garage service positive'
 'value for money positive']
Sample sentiments: ['value for money positive' 'value for money positive'
 'ease of booking positive' 'value for money positive' 'location positive']
Shape of X: (10131, 7884)
Number of elements in aspects array: (4729,)
Number of elements in sentiments array: (4729,)


ValueError: Found input variables with inconsistent numbers of samples: [10131, 4729]