In [0]:
import pandas as pd
import string
import re
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer

#initialization
pd.set_option('display.max_colwidth', 100)

# Fetch the NLTK corpora used below (stop words, English word list, WordNet).
nltk.download("stopwords")
nltk.download("words")
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')
englishwords = nltk.corpus.words.words()

# Load the spaCy English model once (the previous extra load discarded its result).
spacy_nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Append spaCy stop words that NLTK does not already list. A set snapshot of the
# NLTK list makes each membership test O(1) instead of scanning the list.
known_stopwords = set(stopwords)
stopwords.extend(word for word in spacy_stopwords if word not in known_stopwords)

#import data
#data = pd.read_csv("amazon_reviews_us_Mobile_Electronics_v1.tsv", sep='\t')

#data = data[['product_title', 'product_category', 'review_headline', 'review_body', 'star_rating']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
#%%negation word list
negationwords = ["no", "not", "none", "no one", "nobody", "nothing", "neither", "nowhere", "never", "hardly", "scarcely", "barely", "doesnt", "isnt", "wasnt", "shouldnt", "wouldnt", "couldnt", "wont", "cant", "dont", "didnt"]
new_stopwords = []

#create unimportant words from the corpus
# NOTE(review): `data` and `create_unimportantwords` are not defined above this
# cell in the visible notebook (the raw-data load is commented out and the helper
# is defined in a later cell) -- confirm the intended execution order.
unimportant_words = []
data['product_title'].apply(lambda x: create_unimportantwords(x))
print(len(unimportant_words))

#load lexicon words
# Strip only the line terminator: slicing off the last character would truncate
# the final word when the file does not end with a newline.
with open('/content/lexiconwords.txt', 'r') as filehandle:
    lexiconwords = [line.rstrip('\n') for line in filehandle]

# Lexicon words must stay in the text, so remove them from the unimportant list.
# A set gives O(1) membership tests while the list order is preserved.
lexicon_lookup = set(lexiconwords)
unimportant_words = [word for word in unimportant_words if word not in lexicon_lookup]
print(len(unimportant_words))

In [0]:
#%% functions

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def tokenize(text):
    """Split ``text`` on runs of non-word characters and return the pieces.

    Leading or trailing separators (and an empty input) produce empty-string
    tokens, exactly as ``re.split`` does; callers filter those out later.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence
    # and triggers a DeprecationWarning / SyntaxWarning on recent Python versions.
    tokens = re.split(r'\W+', text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are neither stop words nor unimportant words.

    Negation words are removed from the stop-word set so they survive the
    filter, unless they also appear in ``unimportant_words``.
    Reads the module-level ``stopwords``, ``negationwords`` and
    ``unimportant_words`` collections.
    """
    # Build the exclusion set with set operations (linear time) instead of a
    # nested list scan, and test tokens against the set in O(1).
    excluded = set(stopwords)
    excluded.difference_update(negationwords)
    # Applied after the negation exemption, matching the original order.
    excluded.update(unimportant_words)
    required_tokens = [word for word in tokens if word not in excluded]
    return required_tokens

def keeponly_alpha(tokens):
    """Return only the tokens for which ``str.isalpha()`` is true (drops '' too)."""
    alphabetic = []
    for token in tokens:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def remove_nonenglish(tokens):
    """Return the tokens that appear in the module-level ``englishwords`` collection."""
    recognised = []
    for token in tokens:
        # Membership is tested directly against ``englishwords``.
        if token in englishwords:
            recognised.append(token)
    return recognised

def lemmatize(tokens):
    """Return a list with each token replaced by its WordNet lemma.

    Uses ``nltk.WordNetLemmatizer().lemmatize`` with its default arguments.
    """
    # Create the lemmatizer once per call rather than once per token.
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

def preprocess_text(text):
    """Run the cleaning pipeline on one review and return its lemmatized tokens.

    Steps, in order: strip punctuation, lower-case, tokenize, drop stop words,
    keep alphabetic tokens only, lemmatize.
    """
    lowered = remove_punctuation(text).lower()
    candidate_tokens = remove_stopwords(tokenize(lowered))
    alpha_tokens = keeponly_alpha(candidate_tokens)
    # remove_nonenglish() is not part of this pipeline (that step is disabled).
    return lemmatize(alpha_tokens)

def generate_label(rating):
    """Map a star rating to a binary class: 1 (positive) for >= 3, else 0 (negative)."""
    return 1 if rating >= 3 else 0
    
def word_frequency(text):
    """Add one count per item of ``text`` to the module-level ``wordfrequency`` dict.

    Returns nothing; the dictionary is updated in place.
    """
    for token in text:
        wordfrequency[token] = wordfrequency.get(token, 0) + 1
            
def create_less_frequent_list(min_count=20):
    """Append every rare word to the module-level ``lessfrequentwords`` list.

    Parameters
    ----------
    min_count : int, default 20
        Words whose count in ``wordfrequency`` is strictly below this value are
        treated as rare. The default matches the previously hard-coded cutoff.

    Note: the list is appended to, not reset, so calling this twice without
    clearing ``lessfrequentwords`` adds duplicate entries.
    """
    lessfrequentwords.extend(
        word for word, count in wordfrequency.items() if count < min_count
    )
            
def remove_lessfrequentwords(tokens):
    """Drop tokens listed in ``lessfrequentwords`` and return the rest as one string.

    The surviving tokens are joined with single spaces, so the result is a
    ``str`` rather than a list.
    """
    survivors = []
    for token in tokens:
        if token in lessfrequentwords:
            continue
        survivors.append(token)
    return " ".join(survivors)

def create_unimportantwords(text):
    """Add each new token of ``text`` to the module-level ``unimportant_words`` list.

    The text has punctuation removed and is lower-cased before tokenizing;
    tokens already in the list are skipped. Returns nothing.
    """
    lowered = remove_punctuation(text).lower()
    for token in tokenize(lowered):
        if token in unimportant_words:
            continue
        unimportant_words.append(token)
    

In [0]:
#%% data preprocessing

# Drop every row containing a missing value (modifies `data` in place).
data.dropna(axis='rows', inplace=True)

# Binary sentiment label derived from the star rating.
data['class'] = data['star_rating'].apply(generate_label)

# Run the full cleaning pipeline over each review body.
data['review_body_cleaned'] = data['review_body'].apply(preprocess_text)


In [0]:
#%% feature reduction
# Fresh containers that word_frequency / create_less_frequent_list fill in place.
wordfrequency = {}
lessfrequentwords = []

# Count every token over the whole corpus, then collect the rare ones.
for cleaned_tokens in data['review_body_cleaned']:
    word_frequency(cleaned_tokens)
create_less_frequent_list()

# Replace each token list by a space-joined string without the rare words.
data['review_body_cleaned'] = data['review_body_cleaned'].apply(remove_lessfrequentwords)

In [0]:
#load preprocessed data

# Read the already-cleaned reviews from the tab-separated file.
data = pd.read_csv("/content/data_WithOutProductTitleWords_WithoutLessFrequentWords.tsv", sep='\t')

# Keep only the columns used downstream, in this order.
wanted_columns = [
    'product_title',
    'product_category',
    'review_headline',
    'review_body',
    'star_rating',
    'review_body_cleaned',
    'class',
]
data = data[wanted_columns]

# Discard rows with any missing value.
data.dropna(axis=0, inplace=True)

# Check: number of missing cleaned reviews left after the drop.
data['review_body_cleaned'].isnull().sum()



0

In [0]:
#split data set before vectorization
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the same class ratio, and fix the
# seed so the split (and every score computed from it) is reproducible.
train, test = train_test_split(data, test_size = 0.2, stratify = data['class'], random_state = 0)

In [0]:
#vectorize
#count vectorization

# lowercase=False: the cleaned text is presumably already lower-cased by the
# preprocessing pipeline -- confirm against how the TSV was produced.
count_vectorizer = CountVectorizer(lowercase=False)
# The vocabulary is learned from the training split only.
count_vectorizer_fit = count_vectorizer.fit(train['review_body_cleaned'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vectorizer_fit, 'get_feature_names_out'):
    unigram_feature_names = count_vectorizer_fit.get_feature_names_out()
else:
    unigram_feature_names = count_vectorizer_fit.get_feature_names()

# Dense frames with one column per vocabulary term.
X_train = pd.DataFrame(
    count_vectorizer_fit.transform(train['review_body_cleaned']).toarray(),
    columns=unigram_feature_names,
)
X_test = pd.DataFrame(
    count_vectorizer_fit.transform(test['review_body_cleaned']).toarray(),
    columns=unigram_feature_names,
)

In [0]:
# Show (rows, columns) of the train and test feature frames, in that order.
for feature_frame in (X_train, X_test):
    print(feature_frame.shape)

(81496, 3700)
(20374, 3700)


In [0]:
#undersampling
from imblearn.under_sampling import RandomUnderSampler

randomundsamplr = RandomUnderSampler(random_state=0, replacement=True)
# `X_df` is not defined anywhere in the visible notebook, so resample the
# training features and labels instead (the test split stays untouched).
# fit_resample replaces fit_sample, which newer imbalanced-learn releases removed.
sampled_X_df, sampled_class = randomundsamplr.fit_resample(X_train, train['class'])

print(sampled_X_df.shape)

In [0]:
#using Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Fit on the training features, then score the predictions for the test split.
classifier_NB = GaussianNB()
classifier_NB.fit(X_train, train['class'])
predicted = classifier_NB.predict(X_test)

accuracy_score(test['class'], predicted)

0.4335918327279866

In [0]:
# Feature names whose count is exactly 1 in the test row labelled 0.
first_row_counts = X_test.loc[0].values
X_test.columns[first_row_counts == 1]


Index(['customer', 'great', 'outstanding', 'perfect', 'perfection', 'problem',
       'protect', 'smooth', 'work'],
      dtype='object')

In [0]:
# Keep the ad-hoc sample in its own variables: reusing the name `test` would
# overwrite the held-out split that later cells still score with test['class'].
sample_counts = count_vectorizer_fit.transform(['like it'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vectorizer_fit, 'get_feature_names_out'):
    sample_columns = count_vectorizer_fit.get_feature_names_out()
else:
    sample_columns = count_vectorizer_fit.get_feature_names()

sample_features = pd.DataFrame(sample_counts.toarray(), columns=sample_columns)

predicted = classifier_NB.predict(sample_features)

print(predicted)



[0]


In [0]:
#using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and therefore the reported accuracy, is reproducible.
classifier_RF = RandomForestClassifier(n_jobs=-1, random_state=0)

# Fit on the training features, then score the predictions for the test split.
classifier_RF.fit(X_train, train['class'])
predicted = classifier_RF.predict(X_test)
accuracy_score(test['class'], predicted)


0.8501030725434378

In [0]:
# Keep the ad-hoc sample in its own variables: reusing the name `test` would
# overwrite the held-out split that later cells still score with test['class'].
sample_counts = count_vectorizer_fit.transform(['product is bad'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vectorizer_fit, 'get_feature_names_out'):
    sample_columns = count_vectorizer_fit.get_feature_names_out()
else:
    sample_columns = count_vectorizer_fit.get_feature_names()

sample_features = pd.DataFrame(sample_counts.toarray(), columns=sample_columns)

predicted = classifier_RF.predict(sample_features)
print(predicted)

[0]


In [0]:
 #using SVM Classifier
from sklearn import svm

# Linear-kernel support vector classifier.
classifier_SVM = svm.SVC(kernel='linear', gamma='scale')

# Fit on the training features, then score the predictions for the test split.
classifier_SVM.fit(X_train, train['class'])
predicted = classifier_SVM.predict(X_test)
accuracy_score(test['class'], predicted)

In [0]:
#ngram vectorization
# Unigrams and bigrams, capped at 13000 features.
count_vectorizer_ngram = CountVectorizer(ngram_range=(1,2), lowercase=False, max_features=13000)
# The vocabulary is learned from the training split only.
count_vectorizer_ngram_fit = count_vectorizer_ngram.fit(train['review_body_cleaned'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(count_vectorizer_ngram_fit, 'get_feature_names_out'):
    ngram_feature_names = count_vectorizer_ngram_fit.get_feature_names_out()
else:
    ngram_feature_names = count_vectorizer_ngram_fit.get_feature_names()

# Dense frames with one column per n-gram; these replace the unigram X_train / X_test.
X_train = pd.DataFrame(
    count_vectorizer_ngram_fit.transform(train['review_body_cleaned']).toarray(),
    columns=ngram_feature_names,
)
X_test = pd.DataFrame(
    count_vectorizer_ngram_fit.transform(test['review_body_cleaned']).toarray(),
    columns=ngram_feature_names,
)

In [0]:
# Display (rows, columns) of the current training feature frame.
X_train.shape

In [0]:
#using Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Fit on the current training features, then score the test-split predictions.
classifier = GaussianNB()
classifier.fit(X_train, train['class'])
predicted = classifier.predict(X_test)

accuracy_score(test['class'], predicted)

In [0]:
#using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the cross-validation scores are reproducible.
classifier_RF = RandomForestClassifier(n_jobs=-1, random_state=0)
# `X_df` is not defined anywhere in the visible notebook, so cross-validate on
# the training features and labels; the test split stays held out.
scores_RF = cross_val_score(classifier_RF, X_train, train['class'], cv=5, scoring='accuracy')

print(scores_RF)


In [0]:
#using SVM Classifier
from sklearn import svm
classifier_SVM = svm.SVC()

# `X_df` is not defined anywhere in the visible notebook, so cross-validate on
# the training features and labels; the test split stays held out.
scores_SVM = cross_val_score(classifier_SVM, X_train, train['class'], cv=5, scoring='accuracy')

print(scores_SVM)