In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
import nltk
import spacy
import string
import re
import warnings

from nltk.corpus import stopwords
from nltk.collocations import *
from collections import Counter
from nltk.stem import porter
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('genesis')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))
stemmer = porter.PorterStemmer()

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('brown')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\98936\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
# Importing the dataset
url = 'https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/main/Digital_Music_5.json?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q'
df_raw = pd.read_json(url)
full_df = pd.DataFrame.from_records(df_raw['data'])

In [3]:
#creating Sentiment column with overall rating
full_df['Sentiment'] = np.where((full_df['overall'] > 3), 'Positive', 
                          np.where((full_df['overall'] < 3), 'Negative', 'Neutral'))


In [4]:
#showing that the data is not balanced
full_df['Sentiment'].value_counts() 

Positive    158985
Neutral       6792
Negative      4004
Name: Sentiment, dtype: int64

In [5]:
#creating  'NewReview'  column using  'reviewText' and 'summary' columns
cols = ['reviewText','summary']
full_df['NewReview'] = full_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
full_df['NewReview'] = full_df['NewReview'].astype(str)
full_df['review_length'] = full_df['NewReview'].apply(lambda x: len(x.split()))

In [6]:
music_df_sample = full_df.drop(columns=['vote','image','reviewTime','style','reviewerName','unixReviewTime','summary','reviewText'])

In [7]:
#dropping rows if the review is not verified
music_df_sample = music_df_sample[music_df_sample.verified == True]

#dropping rows if the reviewer generally gives very negative reviews
df_filtered = music_df_sample.groupby('reviewerID').filter(lambda x: not((x['overall'].count() >= 10) and (x['overall'].mean() <= 2)))

#dropping rows if the review is too short
#df_filtered = df_filtered[df_filtered['review_length'] >= 6]


In [8]:
#checking the values of Sentiment to creat a sample data
df_filtered['Sentiment'].value_counts() 

Positive    141104
Neutral       5396
Negative      2093
Name: Sentiment, dtype: int64

In [9]:
#creating sample using equal number of each sentiment
sample_size = min(df_filtered['Sentiment'].value_counts())
df_sample = df_filtered.groupby('Sentiment').apply(lambda x: x.sample(sample_size)).reset_index(drop=True)

In [10]:
df_sample['Sentiment'].value_counts() 

Negative    2093
Neutral     2093
Positive    2093
Name: Sentiment, dtype: int64

In [11]:
df_sample.head()

Unnamed: 0,overall,verified,reviewerID,asin,Sentiment,NewReview,review_length
0,1.0,True,A16X18QH3TK8GV,B000W1W96K,Negative,This was a replacement CD and made the same er...,32
1,1.0,True,A56XVP4BQFWRC,B001KWGTRU,Negative,never ordered it One Star,5
2,2.0,True,AAHGICDEAR5AQ,B00137GGZC,Negative,It was an odd song... Two Stars,7
3,1.0,True,A357H3Y4DBHFJ7,B003Y3XTJO,Negative,Was a gift not my kind of music One Star,10
4,1.0,True,A3DG93E8TXMKZF,B001KOWH0G,Negative,This is NOT a piece of music.\n\nIt is only us...,49


In [12]:
#drop non-English comments
from langdetect import detect

def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False

df_sample = df_sample[df_sample['NewReview'].apply(is_english)]


In [13]:
#checking the values of Sentiment after removing non-English reviews
df_sample['Sentiment'].value_counts() 

Neutral     2029
Positive    1981
Negative    1903
Name: Sentiment, dtype: int64

In [14]:
df_sample = df_sample.drop(columns=['verified','overall','asin','reviewerID'])

In [15]:
df_sample.head()

Unnamed: 0,Sentiment,NewReview,review_length
0,Negative,This was a replacement CD and made the same er...,32
2,Negative,It was an odd song... Two Stars,7
3,Negative,Was a gift not my kind of music One Star,10
4,Negative,This is NOT a piece of music.\n\nIt is only us...,49
5,Negative,GIFT PURCHASE Two Stars,4


In [16]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Initialize stopwords
STOPWORDS = set(stopwords.words('english'))

# Convert NewReview to lowercase
df_sample['NewReview'] = df_sample['NewReview'].str.lower()

# Remove punctuation from NewReview
#df_sample['NewReview'] = df_sample['NewReview'].str.translate(str.maketrans("", "", string.punctuation))

# Remove URLs from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', '', regex=True)

# Remove currency symbols from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'£|\$', '', regex=True)

# Remove phone numbers from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', '', regex=True)

# Remove digits from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'\d+', '', regex=True)

# Remove email addresses from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', '', regex=True)

# Remove special characters from NewReview
df_sample['NewReview'] = df_sample['NewReview'].str.replace(r'\'|“|’', '', regex=True)

# Remove stopwords from NewReview
df_sample['NewReview'] = df_sample['NewReview'].apply(lambda x: " ".join([word for word in x.split() if word.lower() not in STOPWORDS]))

#remove urls
df_sample['NewReview'] = df_sample['NewReview'].apply(lambda x: '' if pd.isna(x) or not isinstance(x, str) else x)
df_sample['NewReview'] = df_sample['NewReview'].apply(lambda x: re.sub(r'http\S+', '', x))

In [17]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# define a function to perform lemmatization, stemming, and tokenization
def process_text(text):
    # tokenize the text into words
    words = word_tokenize(text.lower())
    
    # apply lemmatization and stemming to each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]
    
    # return the list of processed words as a string
    return ' '.join(stemmed_words)

# apply the function to the NewReview column
df_sample['processed_review'] = df_sample['NewReview'].apply(process_text)

In [18]:
df_sample.head()

Unnamed: 0,Sentiment,NewReview,review_length,processed_review
0,Negative,replacement cd made error time. time asking re...,32,replac cd made error time . time ask refund . ...
2,Negative,odd song... two stars,7,odd song ... two star
3,Negative,gift kind music one star,10,gift kind music one star
4,Negative,piece music. used tuning orchestra instrument....,49,piec music . use tune orchestra instrument . u...
5,Negative,gift purchase two stars,4,gift purchas two star


In [19]:
df_sample = df_sample.drop(columns=['NewReview'])

In [20]:
#handeling negation 
import re

# Define a function to handle negation in a sentence
def handle_negation(sentence):
    # Define a list of negation words
    negation_words = ['not', 'no', 'never', 'neither', 'nor']
    # Split the sentence into words
    words = sentence.split()
    # Initialize a flag to keep track of negation
    negation = False
    # Iterate over the words and add "NOT_" prefix to words that follow negation words
    for i in range(len(words)):
        if negation:
            words[i] = "NOT_" + words[i]
        if words[i].lower() in negation_words:
            negation = not negation
        if re.search('[.?!,;:]', words[i]):
            negation = False
    # Join the words back into a sentence
    return ' '.join(words)
    
# Apply the function to the processed_review column
df_sample['processed_review'] = df_sample['processed_review'].apply(handle_negation)

In [21]:
df_sample.head()

Unnamed: 0,Sentiment,review_length,processed_review
0,Negative,32,replac cd made error time . time ask refund . ...
2,Negative,7,odd song ... two star
3,Negative,10,gift kind music one star
4,Negative,49,piec music . use tune orchestra instrument . u...
5,Negative,4,gift purchas two star


In [22]:
X = df_sample.drop(columns=['Sentiment'])
y = df_sample['Sentiment']

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [25]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

TfidfVectorizer / Naive Bayes

In [26]:


# Define the features and target
X = df_sample[['processed_review', 'review_length']]
y = df_sample['Sentiment']

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate the accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")


Accuracy: 0.792


Count Vectorizer / Naive Bayse

In [29]:

# Define CountVectorizer
count_vectorizer = CountVectorizer()

# Define Naive Bayes classifier
nb_classifier = MultinomialNB()

# Combine CountVectorizer and Naive Bayes classifier into a pipeline
pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('classifier', nb_classifier)
])

# Fit the pipeline on the training data
pipeline.fit(X_train['processed_review'], y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test['processed_review'])

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")


Accuracy: 0.7886728655959425


In [30]:
# Define the pipeline
model = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(random_state=42)
)

# Fit the model to the training data
model.fit(X_train['processed_review'], y_train)

# Predict the sentiment of the test data
y_pred = model.predict(X_test['processed_review'])

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7709213863060017


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
