In [191]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
import nltk
import spacy
import string
import re
import warnings
import emoji


from nltk.corpus import stopwords
from nltk.collocations import *
from collections import Counter
from nltk.stem import porter
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC







nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('genesis')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))
stemmer = porter.PorterStemmer()

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('brown')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [192]:
# Importing the dataset
url = 'https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/main/Digital_Music_5.json?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q'
df_raw = pd.read_json(url)
full_df = pd.DataFrame.from_records(df_raw['data'])

In [193]:
#creating Sentiment column with overall rating
full_df['Sentiment'] = np.where((full_df['overall'] > 3), 'Positive', 
                          np.where((full_df['overall'] < 3), 'Negative', 'Neutral'))


In [194]:
#showing that the data is not balanced
full_df['Sentiment'].value_counts() 

Positive    158985
Neutral       6792
Negative      4004
Name: Sentiment, dtype: int64

In [195]:
#creating  'NewReview'  column using  'reviewText' and 'summary' columns
cols = ['reviewText','summary']
full_df['NewReview'] = full_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
full_df['NewReview'] = full_df['NewReview'].astype(str)
full_df['review_length'] = full_df['NewReview'].apply(lambda x: len(x.split()))

In [196]:
music_df_balanced = full_df.drop(columns=['vote','image','reviewTime','style','reviewerName','unixReviewTime','summary','reviewText'])

In [197]:
#dropping rows if the review is not verified
music_df_balanced = music_df_balanced[music_df_balanced.verified == True]

#dropping rows if the reviewer generally gives very negative reviews
df_filtered = music_df_balanced.groupby('reviewerID').filter(lambda x: not((x['overall'].count() >= 10) and (x['overall'].mean() <= 2)))

#dropping rows if the review is too short
#df_filtered = df_filtered[df_filtered['review_length'] >= 6]


In [198]:
#checking the values of Sentiment to creat a sample data
df_filtered['Sentiment'].value_counts() 

Positive    141104
Neutral       5396
Negative      2093
Name: Sentiment, dtype: int64

In [199]:
import pandas as pd
import nltk
from nltk.corpus import wordnet


# Reduce the number of positive sentiment rows to 5000
df_pos = df_filtered[df_filtered['Sentiment'] == 'Positive'].sample(n=10000, random_state=42)

# Split the remaining dataset by sentiment
df_neutral = df_filtered[df_filtered['Sentiment'] == 'Neutral']
df_neg = df_filtered[df_filtered['Sentiment'] == 'Negative']

# Define function for synonym replacement
def synonym_replacement(text):
    words = nltk.word_tokenize(text)
    new_words = []
    for word in words:
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if synonyms:
            new_word = synonyms[0]
        else:
            new_word = word
        new_words.append(new_word)
    new_text = ' '.join(new_words)
    return new_text

# Oversample minority classes using synonym replacement
desired_balance = 0.9
while len(df_neutral) / len(df_pos) < desired_balance or len(df_neg) / len(df_pos) < desired_balance:
    if len(df_neutral) / len(df_pos) < desired_balance:
        sample = df_neutral.sample(n=1)['NewReview'].iloc[0]
        new_review = synonym_replacement(sample)
        df_neutral = df_neutral.append({'NewReview': new_review, 'Sentiment': 'Neutral'}, ignore_index=True)
    if len(df_neg) / len(df_pos) < desired_balance:
        sample = df_neg.sample(n=1)['NewReview'].iloc[0]
        new_review = synonym_replacement(sample)
        df_neg = df_neg.append({'NewReview': new_review, 'Sentiment': 'Negative'}, ignore_index=True)




# Combine oversampled subsets back into a single dataframe
df_balanced = pd.concat([df_pos, df_neutral, df_neg], ignore_index=True)

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)


In [200]:
df_balanced['Sentiment'].value_counts() 

Positive    10000
Neutral      9000
Negative     9000
Name: Sentiment, dtype: int64

In [201]:
#creating balanced data using equal number of each sentiment
#sample_size = min(df_filtered['Sentiment'].value_counts())
#df_balanced = df_filtered.groupby('Sentiment').apply(lambda x: x.sample(sample_size)).reset_index(drop=True)

In [202]:
#df_balanced['Sentiment'].value_counts() 

In [203]:
df_balanced.head()

Unnamed: 0,overall,verified,reviewerID,asin,Sentiment,NewReview,review_length
0,5.0,True,A1LDU473TF2N1J,B0011Z30MS,Positive,Loved thia song rest in peace Five Stars,8.0
1,,,,,Neutral,Should rich_person listen to the whole thing ....,
2,,,,,Negative,poor_people sound quality two star,
3,5.0,True,AT0K5OWO5AO8X,B005UYZ5CO,Positive,excellent song thank you Five Stars,6.0
4,,,,,Negative,iodine never buy this ! ! ! ! ! ! ! one star,


In [204]:
#drop non-English comments
from langdetect import detect

def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False

df_balanced = df_balanced[df_balanced['NewReview'].apply(is_english)]


In [205]:
#checking the values of Sentiment after removing non-English reviews
df_balanced['Sentiment'].value_counts() 

Positive    9463
Neutral     8698
Negative    8201
Name: Sentiment, dtype: int64

In [206]:
#df_balanced = df_balanced.sample(n=1000, random_state=42)

In [207]:
df_balanced = df_balanced.drop(columns=['verified','overall','asin','reviewerID'])

In [208]:
df_balanced.head()

Unnamed: 0,Sentiment,NewReview,review_length
0,Positive,Loved thia song rest in peace Five Stars,8.0
1,Neutral,Should rich_person listen to the whole thing ....,
2,Negative,poor_people sound quality two star,
3,Positive,excellent song thank you Five Stars,6.0
4,Negative,iodine never buy this ! ! ! ! ! ! ! one star,


In [209]:
# import re
# import string
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer

# # Initialize stopwords
# STOPWORDS = set(stopwords.words('english'))

# # Convert NewReview to lowercase
# df_balanced['NewReview'] = df_balanced['NewReview'].str.lower()

# # Remove URLs from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'http\S+', '', x))

# # Remove currency symbols from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'£|\$', '', x))

# # Remove phone numbers from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'^[\(]?[\d]{3}[\)\-]?[\s\.]?[\d]{3}[\-\s\.]?[\d]{4}$', '', x))

# # Remove digits from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'\d+', '', x))

# # Remove email addresses from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', '', x))

# # Remove special characters from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'[\'“’]', '', x))

# # Remove stopwords from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in STOPWORDS]))

# # Remove empty values from NewReview
# df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: '' if pd.isna(x) or not isinstance(x, str) else x)


In [210]:


# Initialize stopwords
STOPWORDS = set(stopwords.words('english'))
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define function for text cleaning
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove currency symbols
    text = re.sub(r'£|\$', '', text)

    # Remove phone numbers
    text = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove punctuation and special characters except emojis
    text = re.sub(r'[^\w\s' + emoji.get_emoji_regexp() + ']', '', text)

    # Tokenize text into words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords and handle negation
    negation_words = ['not', 'no', 'never', 'neither', 'nor']
    words = [word if word.lower() in negation_words else word.lower() for word in words]
    words = [word for word in words if word.lower() not in STOPWORDS]
    words = [f'NOT_{words[i+1]}' if (i < len(words)-1 and words[i].lower() in negation_words and words[i+1] not in string.punctuation) else words[i] for i in range(len(words))]
    
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Stem words
    words = [stemmer.stem(word) for word in words]

    # Join words back into text
    cleaned_text = ' '.join(words)
    return cleaned_text


In [211]:
df_balanced['processed_review'] = df_balanced['NewReview'].apply(process_text)

In [212]:
df_balanced.head()

Unnamed: 0,Sentiment,NewReview,review_length,processed_review
0,Positive,Loved thia song rest in peace Five Stars,8.0,love thia song rest in peac five star
1,Neutral,Should rich_person listen to the whole thing ....,,should rich_person listen to the whole thing ....
2,Negative,poor_people sound quality two star,,poor_peopl sound qualiti two star
3,Positive,excellent song thank you Five Stars,6.0,excel song thank you five star
4,Negative,iodine never buy this ! ! ! ! ! ! ! one star,,iodin never buy thi ! ! ! ! ! ! ! one star


In [213]:
df_balanced = df_balanced.drop(columns=['NewReview'])

In [214]:
df_balanced.head()

Unnamed: 0,Sentiment,review_length,processed_review
0,Positive,8.0,love thia song rest in peac five star
1,Neutral,,should rich_person listen to the whole thing ....
2,Negative,,poor_peopl sound qualiti two star
3,Positive,6.0,excel song thank you five star
4,Negative,,iodin never buy thi ! ! ! ! ! ! ! one star


In [215]:
X = pd.DataFrame(df_balanced['processed_review'])
y = df_balanced['Sentiment']

TfidfVectorizer / Naive Bayes

In [216]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.846


Count Vectorizer / Naive Bayse

In [217]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define CountVectorizer
count_vectorizer = CountVectorizer()

# Define Naive Bayes classifier
nb_classifier = MultinomialNB()

# Combine CountVectorizer and Naive Bayes classifier into a pipeline
pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('classifier', nb_classifier)
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")


Accuracy: 0.8271589328612973


#### Logistic Regression

In [218]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
model = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(random_state=42)
)

# Fit the model to the training data
model.fit(X_train['processed_review'], y_train)

# Predict the sentiment of the test data
y_pred = model.predict(X_test['processed_review'])

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8883550385636616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Gradient Boosting

In [219]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.834


In [220]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.923


In [221]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])


# Define the hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 100,200],
    'svm__kernel': ['linear', 'rbf', 'sigmoid','precomputed','poly'],
    'svm__gamma': ['scale', 'auto','float','array-like']
}


# Define the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the pipeline on the training data
grid_search.fit(X_train['processed_review'], y_train)

# Print the best hyperparameters
print("Best parameters: ", grid_search.best_params_)

# Predict on the test data using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits


exception calling callback for <Future at 0x24ae666ab80 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [222]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(C= 100, gamma= 'scale', kernel='rbf'))
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.930


In [224]:
from sklearn.calibration import CalibratedClassifierCV

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(C= 100, gamma= 'scale', kernel='rbf', probability=True)) # set probability to True
])

# Wrap the pipeline in a calibrated classifier
calibrated_pipeline = CalibratedClassifierCV(pipeline, method='sigmoid', cv=5) # you can choose different calibration methods

# Fit the pipeline on the training data
calibrated_pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = calibrated_pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.929
