In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
import nltk
import spacy
import string
import re
import warnings
import emoji

 
from nltk.corpus import stopwords
from nltk.collocations import *
from collections import Counter
from nltk.stem import porter
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC







nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('genesis')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))
stemmer = porter.PorterStemmer()

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('brown')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
# Importing the dataset
url = 'https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/main/Digital_Music_5.json?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q'
df_raw = pd.read_json(url)
full_df = pd.DataFrame.from_records(df_raw['data'])

In [3]:
#creating Sentiment column with overall rating
full_df['Sentiment'] = np.where((full_df['overall'] > 3), 'Positive', 
                          np.where((full_df['overall'] < 3), 'Negative', 'Neutral'))


In [4]:
#showing that the data is not balanced
full_df['Sentiment'].value_counts() 

Positive    158985
Neutral       6792
Negative      4004
Name: Sentiment, dtype: int64

In [5]:
#creating  'NewReview'  column using  'reviewText' and 'summary' columns
cols = ['reviewText','summary']
full_df['NewReview'] = full_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
full_df['NewReview'] = full_df['NewReview'].astype(str)

In [6]:
df_filtered = full_df.drop(columns=['vote','image','reviewTime','style','reviewerName','unixReviewTime','summary','reviewText'])

In [7]:
#dropping rows if the review is not verified
df_filtered = df_filtered[df_filtered.verified == True]

#dropping rows if the reviewer generally gives very negative reviews
df_filtered = df_filtered.groupby('reviewerID').filter(lambda x: not((x['overall'].count() >= 10) and (x['overall'].mean() <= 2)))

#dropping rows if the review is too short
#full_df['review_length'] = full_df['NewReview'].apply(lambda x: len(x.split()))
#df_filtered = df_filtered[df_filtered['review_length'] >= 6]


In [8]:
#checking the values of Sentiment to creat a sample data
df_filtered['Sentiment'].value_counts() 

Positive    141104
Neutral       5396
Negative      2093
Name: Sentiment, dtype: int64

In [9]:
import pandas as pd
import nltk
from nltk.corpus import wordnet


# Reduce the number of positive sentiment rows to 5000
df_pos = df_filtered[df_filtered['Sentiment'] == 'Positive'].sample(n=10000, random_state=42)

# Split the remaining dataset by sentiment
df_neutral = df_filtered[df_filtered['Sentiment'] == 'Neutral']
df_neg = df_filtered[df_filtered['Sentiment'] == 'Negative']

# Define function for synonym replacement
def synonym_replacement(text):
    words = nltk.word_tokenize(text)
    new_words = []
    for word in words:
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if synonyms:
            new_word = synonyms[0]
        else:
            new_word = word
        new_words.append(new_word)
    new_text = ' '.join(new_words)
    return new_text

# Oversample minority classes using synonym replacement
desired_balance = 0.9
while len(df_neutral) / len(df_pos) < desired_balance or len(df_neg) / len(df_pos) < desired_balance:
    if len(df_neutral) / len(df_pos) < desired_balance:
        sample = df_neutral.sample(n=1)['NewReview'].iloc[0]
        new_review = synonym_replacement(sample)
        df_neutral = df_neutral.append({'NewReview': new_review, 'Sentiment': 'Neutral'}, ignore_index=True)
    if len(df_neg) / len(df_pos) < desired_balance:
        sample = df_neg.sample(n=1)['NewReview'].iloc[0]
        new_review = synonym_replacement(sample)
        df_neg = df_neg.append({'NewReview': new_review, 'Sentiment': 'Negative'}, ignore_index=True)


# Combine oversampled subsets back into a single dataframe
df_balanced = pd.concat([df_pos, df_neutral, df_neg], ignore_index=True)

# Shuffle the dataframe
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)


In [10]:
df_balanced['Sentiment'].value_counts() 

Positive    10000
Negative     9000
Neutral      9000
Name: Sentiment, dtype: int64

In [11]:
#creating balanced data using equal number of each sentiment
#sample_size = min(df_filtered['Sentiment'].value_counts())
#df_balanced = df_filtered.groupby('Sentiment').apply(lambda x: x.sample(sample_size)).reset_index(drop=True)

In [12]:
#df_balanced['Sentiment'].value_counts() 

In [13]:
df_balanced.head()

Unnamed: 0,overall,verified,reviewerID,asin,Sentiment,NewReview,review_length
0,5.0,True,AEZNE7TR0CHFI,B00137KHK2,Positive,"If you're a fan of Kix Brooks and Ronnie Dunn,...",51.0
1,5.0,True,A3UE57S8G2G72Y,B006ONY954,Positive,It really is! I highly recommend it to all. A ...,12.0
2,5.0,True,A3VP9JGZNO78JP,B008MJC3QU,Positive,love it Five Stars,4.0
3,5.0,True,A1PQ250ENJ47EF,B0059H09DC,Positive,Heard Adam perform this song on the 'Victoria ...,59.0
4,,,,,Negative,not my kind of music one star,


In [14]:
#drop non-English comments
from langdetect import detect

def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False

df_balanced = df_balanced[df_balanced['NewReview'].apply(is_english)]


In [15]:
#checking the values of Sentiment after removing non-English reviews
df_balanced['Sentiment'].value_counts() 

Positive    9458
Neutral     8725
Negative    8121
Name: Sentiment, dtype: int64

In [17]:
df_balanced = df_balanced.drop(columns=['verified','asin','reviewerID'])

In [18]:
df_balanced.head()

Unnamed: 0,overall,Sentiment,NewReview,review_length
0,5.0,Positive,"If you're a fan of Kix Brooks and Ronnie Dunn,...",51.0
1,5.0,Positive,It really is! I highly recommend it to all. A ...,12.0
2,5.0,Positive,love it Five Stars,4.0
3,5.0,Positive,Heard Adam perform this song on the 'Victoria ...,59.0
4,,Negative,not my kind of music one star,


In [23]:


# Initialize stopwords
STOPWORDS = set(stopwords.words('english'))
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define function for text cleaning
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove currency symbols
    text = re.sub(r'£|\$', '', text)

    # Remove phone numbers
    text = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', '', text)

    # Remove digits from NewReview
    df_balanced['NewReview'] = df_balanced['NewReview'].apply(lambda x: re.sub(r'\d+', '', x))

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove punctuation and special characters except emojis
    text = re.sub(r'[^\w\s'  + ']', '', text)

    # Tokenize text into words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords and handle negation
    negation_words = ['not', 'no', 'never', 'neither', 'nor']
    words = [word if word.lower() in negation_words else word.lower() for word in words]
    words = [word for word in words if word.lower() not in STOPWORDS]
    words = [f'NOT_{words[i+1]}' if (i < len(words)-1 and words[i].lower() in negation_words and words[i+1] not in string.punctuation) else words[i] for i in range(len(words))]
    
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Stem words
    words = [stemmer.stem(word) for word in words]

    # Join words back into text
    cleaned_text = ' '.join(words)
    return cleaned_text


In [24]:
df_balanced['processed_review'] = df_balanced['NewReview'].apply(clean_text)

In [25]:
df_balanced.head()

Unnamed: 0,overall,Sentiment,NewReview,review_length,processed_review
0,5.0,Positive,"If you're a fan of Kix Brooks and Ronnie Dunn,...",51.0,your fan kix brook ronni dunn must song part w...
1,5.0,Positive,It really is! I highly recommend it to all. A ...,12.0,realli highli recommend beauti album
2,5.0,Positive,love it Five Stars,4.0,love five star
3,5.0,Positive,Heard Adam perform this song on the 'Victoria ...,59.0,heard adam perform song victoria secret specia...
4,,Negative,not my kind of music one star,,kind music one star


In [None]:
df_balanced = df_balanced.drop(columns=['NewReview'])

In [None]:
df_balanced.head()

Unnamed: 0,Sentiment,review_length,processed_review
0,Positive,8.0,love thia song rest in peac five star
1,Neutral,,should rich_person listen to the whole thing ....
2,Negative,,poor_peopl sound qualiti two star
3,Positive,6.0,excel song thank you five star
4,Negative,,iodin never buy thi ! ! ! ! ! ! ! one star


In [215]:
X = pd.DataFrame(df_balanced['processed_review'])
y = df_balanced['Sentiment']

TfidfVectorizer / Naive Bayes

In [216]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.846


Count Vectorizer / Naive Bayse

In [217]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define CountVectorizer
count_vectorizer = CountVectorizer()

# Define Naive Bayes classifier
nb_classifier = MultinomialNB()

# Combine CountVectorizer and Naive Bayes classifier into a pipeline
pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('classifier', nb_classifier)
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")


Accuracy: 0.8271589328612973


#### Logistic Regression

In [218]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
model = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(random_state=42)
)

# Fit the model to the training data
model.fit(X_train['processed_review'], y_train)

# Predict the sentiment of the test data
y_pred = model.predict(X_test['processed_review'])

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8883550385636616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Gradient Boosting

In [219]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.834


In [220]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.923


In [221]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC())
])


# Define the hyperparameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 100,200],
    'svm__kernel': ['linear', 'rbf', 'sigmoid','precomputed','poly'],
    'svm__gamma': ['scale', 'auto','float','array-like']
}


# Define the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the pipeline on the training data
grid_search.fit(X_train['processed_review'], y_train)

# Print the best hyperparameters
print("Best parameters: ", grid_search.best_params_)

# Predict on the test data using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits


exception calling callback for <Future at 0x24ae666ab80 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "c:\Users\98936\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [222]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
 
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(C= 100, gamma= 'scale', kernel='rbf'))
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.930


In [224]:
from sklearn.calibration import CalibratedClassifierCV

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(C= 100, gamma= 'scale', kernel='rbf', probability=True)) # set probability to True
])

# Wrap the pipeline in a calibrated classifier
calibrated_pipeline = CalibratedClassifierCV(pipeline, method='sigmoid', cv=5) # you can choose different calibration methods

# Fit the pipeline on the training data
calibrated_pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = calibrated_pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.929


In [None]:
import pandas as pd
import nltk.sentiment.vader as vader

# Load the review data into a DataFrame
reviews_df = 

# Define a function to perform sentiment analysis on the review text
analyzer = vader.SentimentIntensityAnalyzer()
def analyze_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

# Perform sentiment analysis on the review text
reviews_df['sentiment'] = reviews_df['Review'].apply(analyze_sentiment)

# Calculate the average sentiment score for each product
product_sentiment_df = reviews_df.groupby('Product')['sentiment'].mean().reset_index()

# Merge the sentiment scores with the product ratings
product_ratings_df = pd.merge(product_ratings_df, product_sentiment_df, on='Product')

# Calculate the adjustment factor based on the average sentiment score
product_ratings_df['adjustment'] = product_ratings_df['sentiment'].apply(lambda x: 0.5*(x+1) if x>=0 else 0.5*(x+1)/2)

# Adjust the ratings based on the sentiment score
product_ratings_df['adjusted_rating'] = product_ratings_df['Rank'] + product_ratings_df['adjustment']

# Save the adjusted ratings to a CSV file
product_ratings_df.to_csv('adjusted_ratings.csv', index=False)
