In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import re
import string
import nltk
import spacy
import gensim
from spacy import displacy
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.probability import FreqDist
from sklearn.svm import LinearSVC
from textblob import TextBlob
from gensim.models import Word2Vec

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
reviews_df = pd.read_csv('/kaggle/input/indian-products-on-amazon/amazon_vfl_reviews.csv', encoding="UTF-8")
reviews_df.head()

In [None]:
reviews_df.shape

In [None]:
# any null columns
reviews_df.isnull().sum()

In [None]:
# The review column has four rows without review text. dropna() removes every
# row that contains a null in any column, and the index is then renumbered
# from 0 so that later label-based lookups line up with row positions.
reviews_df = reviews_df.dropna().reset_index(drop=True)

In [None]:
# Replace everything that is not a letter or a digit with a space.
def cleanText(input_string):
    """Return `input_string` with each run of characters outside A-Z, a-z and 0-9 collapsed to one space."""
    non_alphanumeric = re.compile('[^A-Za-z0-9]+')
    return non_alphanumeric.sub(' ', input_string)
reviews_df['review'] = reviews_df.review.apply(cleanText)
reviews_df['review'][150]

In [None]:
# From the name we extract the brand
reviews_df['brandName'] = reviews_df['name'].str.split('-').str[0]
reviews_df.head()

In [None]:
reviews_df['brandName'].value_counts()

In [None]:
reviews_df['brandName'] = reviews_df['brandName'].str.title()
reviews_df.brandName.unique()

In [None]:
# The product is whatever follows the first '-' in the name column
# (an empty string when the name contains no '-').
products = [full_name.partition('-')[2] for full_name in reviews_df['name']]
reviews_df['product'] = products
reviews_df['product'].unique()

In [None]:
reviews_df.head()

Text analysis with NLTK and the VADER sentiment analyzer

In [None]:
#converting to lower case
reviews_df['clean_review_text']=reviews_df['review'].str.lower()

In [None]:
#removing punctuations
reviews_df['clean_review_text']=reviews_df['clean_review_text'].str.translate(str.maketrans('','',string.punctuation))

In [None]:
stopWords=stopwords.words('english')+['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from']
def removeStopWords(stopWords, rvw_txt):
    """Return `rvw_txt` with every word that appears in `stopWords` removed.

    Args:
        stopWords: iterable of words to drop (exact, case-sensitive match).
        rvw_txt: text to filter; it is split on whitespace.

    Returns:
        The surviving words joined by single spaces.
    """
    # A set gives constant-time membership tests; scanning a list costs
    # O(len(stopWords)) for every word of every review.
    stop_set = stopWords if isinstance(stopWords, (set, frozenset)) else set(stopWords)
    return ' '.join(word for word in rvw_txt.split() if word not in stop_set)
reviews_df['clean_review_text'] = [removeStopWords(stopWords,x) for x in reviews_df['clean_review_text']]

In [None]:
# Split each cleaned review into a list of word tokens.
tokenList = [word_tokenize(cleaned) for cleaned in reviews_df['clean_review_text']]
reviews_df['review_tokens'] = tokenList
reviews_df.head()

In [None]:
nltk.download('vader_lexicon')
sentiment_model = SentimentIntensityAnalyzer()


def _sentiment_label(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'.

    Scores at or above 0.05 are positive, scores at or below -0.05 are
    negative, and anything in between is neutral.
    """
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


# Score the 'review' column rather than 'clean_review_text'. The cleaned text
# has had the NLTK stop words removed, and that list includes negations and
# intensifiers ("not", "no", "very", ...) that VADER's rules depend on, so a
# review such as "not good" would otherwise be scored as "good".
# The compound value reflects the overall sentiment, ranging from -1 (very
# negative) to +1 (very positive).
sentiment_scores = [
    sentiment_model.polarity_scores(review_text)["compound"]
    for review_text in reviews_df['review']
]
# Mark each review as positive, negative or neutral.
sentiment_score_flag = [_sentiment_label(score) for score in sentiment_scores]

reviews_df['scores'] = sentiment_scores
reviews_df['scoreStatus'] = sentiment_score_flag

In [None]:
reviews_df.head()

In [None]:
def _wordcloud_text(data):
    """Flatten `data` into one string for WordCloud.generate.

    A string is returned unchanged. Any other iterable has its elements
    converted to text and joined with spaces; None and NaN entries are skipped.
    Non-iterable values fall back to str(data).
    """
    if isinstance(data, str):
        return data
    try:
        items = iter(data)
    except TypeError:
        return str(data)
    # `item == item` is False only for NaN, which marks a missing value.
    return ' '.join(str(item) for item in items if item is not None and item == item)


def show_wordcloud(data, title = None):
    """Render a word cloud for `data` with matplotlib.

    Args:
        data: a string, or an iterable of strings such as a pandas Series of
            review texts. Iterables are joined element by element; calling
            str() on a Series would only yield its truncated repr (index
            labels, "..." and the dtype footer), so most rows would be lost.
        title: optional figure title.
    """
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42
    ).generate(_wordcloud_text(data))

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# print wordcloud
show_wordcloud(reviews_df["clean_review_text"])

In [None]:
# Word cloud restricted to the reviews whose scoreStatus is "positive".
positiveReviews_df = reviews_df[reviews_df['scoreStatus'].eq("positive")]
show_wordcloud(positiveReviews_df["clean_review_text"])

In [None]:
# Word cloud restricted to the reviews whose scoreStatus is "negative".
negativeReviews_df = reviews_df[reviews_df['scoreStatus'].eq("negative")]
show_wordcloud(negativeReviews_df["clean_review_text"])

In [None]:
# Learn the bag-of-words vocabulary from the cleaned reviews, then show its
# size followed by the word -> column-index mapping.
features = CountVectorizer().fit(reviews_df["clean_review_text"])
print(len(features.vocabulary_), features.vocabulary_, sep='\n')

In [None]:
bagofWords = features.transform(reviews_df["clean_review_text"])
print(bagofWords)

In [None]:
print(bagofWords.toarray())

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
print(
    features.get_feature_names_out()
    if hasattr(features, "get_feature_names_out")
    else features.get_feature_names()
)

In [None]:
# Binary classification data: keep only the positive and negative reviews and
# encode the label as 1 (positive) / 0 (negative).
# .assign() builds a new frame, so no column is written into a column-subset
# of another frame (which raises SettingWithCopyWarning, hidden here by the
# warnings filter).
df = (
    pd.concat([positiveReviews_df, negativeReviews_df])
    .loc[:, ["clean_review_text", "scoreStatus"]]
    .assign(scoreStatus=lambda frame: (frame["scoreStatus"] == 'positive') * 1)
)
X = df["clean_review_text"]
y = df["scoreStatus"]
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Re-fit the vectorizer on the training texts only, then encode both splits
# with that vocabulary.
# NOTE(review): X_train / X_test are overwritten with their encoded form, so
# this cell cannot be re-run without first re-running the train/test split cell.
X_train = features.fit_transform(X_train)
X_test = features.transform(X_test)

In [None]:
# 5-fold cross-validation of a default LogisticRegression on the training split;
# the mean of the per-fold scores is reported.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print(scores.mean())

In [None]:
model=LogisticRegression()
model.fit(X_train,y_train)
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

In [None]:
y_pred = model.predict(X_test)
confusion_matrix(y_test,y_pred)

In [None]:
text = "the product great"
model.predict(features.transform([text]))[0]

In [None]:
text = "bad"
model.predict(features.transform([text]))[0]

In [None]:
text = "sucks"
model.predict(features.transform([text]))[0]

In [None]:
text = "very bad"
model.predict(features.transform([text]))[0]

In [None]:
text = "not good"
model.predict(features.transform([text]))[0]

In [None]:
# Tokenise the cleaned reviews as one text. The review strings are joined
# directly: Series.to_string() also renders the index labels, which
# word_tokenize would then count as words.
tokenized_word = word_tokenize(' '.join(reviews_df['clean_review_text']))
# Frequency distribution
fdist = FreqDist(tokenized_word)
# Plot the 30 most frequent tokens
fdist.plot(30, cumulative=False)
plt.show()

Text analytics with spaCy

In [None]:
nlp=spacy.load("en_core_web_sm")

In [None]:
# Run the spaCy pipeline on a single review and inspect the result.
text = reviews_df['review'][120]
doc = nlp(text)
# A bare expression is only displayed when it is the last line of a cell, so
# the type has to be printed explicitly to be seen.
print(type(doc))
print(doc)

In [None]:
#Tokens
for token in doc:
    print(token.text)

In [None]:
#Stopwords
# Keep spaCy's stop-word set under its own name: binding it to `stopwords`
# would shadow the `nltk.corpus.stopwords` import, and re-running the earlier
# cell that calls `stopwords.words('english')` would then fail.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
stopWords = list(spacy_stopwords)
len(stopWords)

In [None]:
# Print only the tokens that spaCy does not flag as stop words.
for tok in doc:
    if not tok.is_stop:
        print(tok)

In [None]:
# Token count before and after dropping stop words.
print(len(doc))
doc2 = [tok for tok in doc if not tok.is_stop]
print(len(doc2))

In [None]:
#lemmatization
for review_text in doc:
    print(review_text.text,review_text.lemma_)

In [None]:
#POS
for token in doc:
   print(token,token.tag_,token.pos_,spacy.explain(token.tag_))

In [None]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':90})

In [None]:
for entity in doc.ents:
    print(entity.text,'---->',entity.label_)
displacy.render(doc,style='ent',jupyter=True)

In [None]:
#word vectors and similarity
#large pre trained model
!python -m spacy download en_core_web_lg

In [None]:
import en_core_web_lg
nlp = en_core_web_lg.load()
doc=nlp(text)
for token in doc:
    print(token.text,'---->',token.has_vector)

In [None]:
for token in doc:
    print(token.text,'',token.vector_norm)

In [None]:
#similarity score
text="eat"
text1="ate"
doc=nlp(text)
doc1=nlp(text1)
doc.similarity(doc1)

In [None]:
#similarity score
text="good"
text1="bad"
doc=nlp(text)
doc1=nlp(text1)
doc.similarity(doc1)

In [None]:
#similarity score
text="hot"
text1="summer"
doc=nlp(text)
doc1=nlp(text1)
doc.similarity(doc1)

In [None]:
#similarity score
text="excellent"
text1="good"
doc=nlp(text)
doc1=nlp(text1)
doc.similarity(doc1)

In [None]:
#similarity score
text="sucks"
text1="bad"
doc=nlp(text)
doc1=nlp(text1)
doc.similarity(doc1)

In [None]:
# Rebuild the binary dataset: positive and negative reviews only, with the
# label encoded as 1 (positive) / 0 (negative).
# .assign() returns a new frame, which avoids writing a column into a
# column-subset of another frame (SettingWithCopyWarning).
df = (
    pd.concat([positiveReviews_df, negativeReviews_df])
    .loc[:, ["clean_review_text", "scoreStatus"]]
    .assign(scoreStatus=lambda frame: (frame["scoreStatus"] == 'positive') * 1)
)

In [None]:
#Tokenization
punct = string.punctuation
print(punct)
# NOTE: this redefines the regex-based cleanText declared earlier in the notebook.
def cleanText(sent):
    """Tokenise `sent` with the spaCy pipeline `nlp` and return its cleaned lemmas.

    Each token is replaced by its lemma, lower-cased and stripped of
    surrounding whitespace (a "-PRON-" lemma is kept verbatim). Lemmas that
    are in `stopWords` or occur in `punct` are then dropped.

    Args:
        sent: the text to tokenise.

    Returns:
        The remaining lemmas as a list of strings, in document order.
    """
    # Set membership is constant-time; `stopWords` itself is a list.
    stop_set = set(stopWords)
    cleanTokens = []
    for token in nlp(sent):
        # Compare the string attribute `lemma_`: `token.lemma` is the integer
        # hash of the lemma and can never equal "-PRON-".
        if token.lemma_ != "-PRON-":
            lemma = token.lemma_.lower().strip()
        else:
            lemma = token.lemma_
        # `in punct` is a substring test on the punctuation string, so the
        # empty string left after stripping a whitespace token is dropped too.
        if lemma not in stop_set and lemma not in punct:
            cleanTokens.append(lemma)
    return cleanTokens

In [None]:
#TFIDF
tfidf = TfidfVectorizer(tokenizer = cleanText)
classifier = LinearSVC()
X = df["clean_review_text"]
y = df["scoreStatus"]
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [None]:
X_train.shape, X_test.shape

In [None]:
clf = Pipeline([('tfidf',tfidf),('clf',classifier)])
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
confusion_matrix(y_test,y_pred)

With TextBlob

In [None]:
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity
reviews_df["polarity"] = reviews_df["review"].apply(pol)
reviews_df["subjectivity"] = reviews_df["review"].apply(sub)

In [None]:
#distribution of rating
sns.countplot(x='rating', data=reviews_df)

In [None]:
reviews_df.head()

In [None]:
print("negative reviews")
most_negative = reviews_df[reviews_df.polarity == -1].review.head()
print(most_negative)
print("positive reviews")
most_positive = reviews_df[reviews_df.polarity == 1].review.head()
print(most_positive)

Text analysis with Gensim and Word2Vec

In [None]:
sentences = reviews_df['review_tokens'][1:10]
sentences

In [None]:
#train model
model = Word2Vec(sentences, min_count=1)
print(model)

In [None]:
#vocab
# gensim 4 replaced `wv.vocab` with `wv.key_to_index`; use whichever this
# installation provides.
vocab = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab
words = list(vocab)
print(words)

In [None]:
#nltk.download('punkt')
reviewsText = reviews_df.clean_review_text.values
reviewsVec = [nltk.word_tokenize(review) for review in reviewsText]
len(reviewsVec)

In [None]:
# gensim 4 renamed the `size` argument to `vector_size`; try the new name
# first and fall back to the old one on gensim 3.x.
try:
    model = Word2Vec(reviewsVec, min_count=1, vector_size=32)
except TypeError:
    model = Word2Vec(reviewsVec, min_count=1, size=32)
# Similarity queries live on the KeyedVectors object (`model.wv`);
# `model.most_similar` no longer exists in gensim 4.
model.wv.most_similar('soothing')

In [None]:
# gensim 4 renamed the `size` argument to `vector_size`; try the new name
# first and fall back to the old one on gensim 3.x.
try:
    model = Word2Vec(reviewsVec, min_count=1, vector_size=32)
except TypeError:
    model = Word2Vec(reviewsVec, min_count=1, size=32)
# Similarity queries live on the KeyedVectors object (`model.wv`);
# `model.most_similar` no longer exists in gensim 4.
model.wv.most_similar('packaging')