In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns


from sklearn.feature_extraction.text import  TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string


from sklearn.naive_bayes import MultinomialNB


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import warnings
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")
%matplotlib inline

In [None]:
# Read the reviews CSV into a DataFrame and preview the first rows.
reviews_csv = '/kaggle/input/amazon-music-reviews/Musical_instruments_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Remove the columns that are not used, then preview the result.
unused_columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime']
df.drop(columns=unused_columns, inplace=True)
df.head()

In [None]:
# Combine reviewText and summary into a single `text` column.
# Missing values are replaced with '' first: in pandas, str + NaN yields NaN,
# so a review with only one of the two fields present would otherwise lose
# all of its text (and NaN floats break later string operations).
df['text'] = df['reviewText'].fillna('') + ' ' + df['summary'].fillna('')
df.drop(['reviewText', 'summary'],axis=1,inplace=True)
df.head()

In [None]:
# Binarize the rating: overall <= 3 becomes 0, overall > 3 becomes 1.
# The rating is truncated to an integer before the comparison.
df['overall'] = (df['overall'].astype(int) > 3).astype(int)

In [None]:
# Preview the first rows to check the recoded `overall` column.
df.head()

In [None]:
# Separate the review texts by label: 1 -> good, 0 -> bad.
good = df.loc[df['overall'] == 1, 'text']
bad = df.loc[df['overall'] == 0, 'text']

In [None]:
# Word cloud of the review texts with bad ratings
plt.figure(figsize = (20,20))
# Drop missing reviews and coerce to str before joining: str.join raises
# TypeError on NaN floats, and the text column is only cast to str in a later cell.
bad_corpus = " ".join(bad.dropna().astype(str))
worldcould_bad = WordCloud(min_font_size = 3,  max_words = 3000 , width = 1600 , height = 680).generate(bad_corpus)
plt.imshow(worldcould_bad,interpolation = 'bilinear')
# grid(None) toggles the current grid state; False hides it unconditionally.
plt.grid(False)
plt.show()

In [None]:
# Class balance of the binarized rating: printed counts followed by a bar chart.
rating_counts = df['overall'].value_counts()
print(rating_counts)
print('*' * 40)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='overall', data=df, ax=ax)
plt.show()

In [None]:
# Make sure every entry of the text column is a Python string.
df['text'] = df['text'].astype(str)

In [None]:
# Resources for text cleaning (lemmatize, remove stop words, lower-case):
# a WordNet lemmatizer and the set of tokens to discard, made of the English
# stop words plus every single punctuation character.
lemmatizer = WordNetLemmatizer()
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

            
def lemm_fun(text):
    """Return `text` lower-cased, stripped of stop words, and lemmatized.

    The text is split on whitespace. Each token is lower-cased and has
    leading/trailing punctuation removed *before* the stop-word check and
    the lemmatization, because:

    * the WordNet lookup is case-sensitive, so a capitalized token such as
      "Guitars" would otherwise be returned unchanged;
    * a token with attached punctuation ("the," or "strings.") would
      otherwise match neither the stop-word set nor a WordNet entry.

    Tokens that are empty after stripping, or that are in the module-level
    `stop` set, are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The kept, lemmatized tokens joined by single spaces ("" if none).
    """
    final_text = []
    for word in text.split():
        # Normalize first so both checks below see the canonical form.
        token = word.lower().strip(string.punctuation)
        if token and token not in stop:
            final_text.append(lemmatizer.lemmatize(token))
    return " ".join(final_text)


            
# Clean every review in place with lemm_fun.
df['text'] = df['text'].map(lemm_fun)

In [None]:
# Model inputs: cleaned review text as features, binarized rating as target.
X, y = df['text'], df['overall']

In [None]:
# TF-IDF features over unigrams and bigrams.
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df / max_df are absolute document counts when given as integers and
# proportions when given as floats:
#   * max_df=1 (int) would discard every term occurring in more than ONE
#     review, leaving only terms unique to a single document; 1.0 (float)
#     means "up to 100% of documents", i.e. no upper cut-off.
#   * min_df=0 (int) is not a valid count (recent scikit-learn versions reject
#     it); 1 keeps every term that occurs at least once.
tf=TfidfVectorizer(min_df=1,max_df=1.0,use_idf=True,ngram_range=(1,2))

# NOTE(review): the vocabulary and IDF weights are fitted on all reviews,
# including those that later end up in the test split.
x=tf.fit_transform(X)

In [None]:
# Handle dataset imbalance: oversample with SMOTE until both classes have the
# same number of samples (sampling_strategy = 1), then show the new class counts.
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy = 1, random_state = 42)
resampled = smote.fit_resample(x, y)
x_sm, y_sm = resampled

y_sm.value_counts()

In [None]:
# Split data into train and test sets.
# The split is taken from the original TF-IDF matrix and labels (x, y), and
# only the training portion is oversampled afterwards. Splitting the SMOTE
# output instead would put synthetic points interpolated from test reviews
# into the training set and would score the model on synthetic rows, which
# inflates every reported metric.
# stratify keeps the class ratio equal in both parts; random_state makes the
# split reproducible.
x_train,x_test,y_train,y_test = train_test_split(
    x, y, test_size = 0.3, stratify = y, random_state = 42
)

# Balance the training classes only; the test set keeps the real class ratio.
x_train,y_train = smote.fit_resample(x_train, y_train)

In [None]:
# Multinomial Naive Bayes: fit on the training split (fit returns the
# estimator itself, so nb_model and nb are the same object).
nb = MultinomialNB()
nb_model = nb.fit(x_train, y_train)

# Predict the test split and measure accuracy against the true labels.
nb_predict = nb_model.predict(x_test)
nb_acc = accuracy_score(y_true=y_test, y_pred=nb_predict)

print('nb test accuracy:', nb_acc)

In [None]:
# Text report with precision, recall and F1-score for the test predictions.
cr = classification_report(y_true=y_test, y_pred=nb_predict)

In [None]:
# print() is used so the report's line breaks and column alignment render.
print(cr)

In [None]:
# Confusion matrix of the test predictions against the true labels.
cm = confusion_matrix(y_true=y_test, y_pred=nb_predict)

In [None]:
# Show the matrix; in scikit-learn rows are true classes and columns are predicted classes.
print(cm)