In [1]:
# importing the required libraries

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from rake_nltk import Rake
from collections import defaultdict

In [2]:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download("vader_lexicon")

In [3]:
# load the reviews CSV; the 'rating' and 'text' columns are the ones used below
readata = pd.read_csv('reviewsthreaded.csv')

In [4]:
# plain list of [rating, text] rows, one per review
array = readata.loc[:, ['rating', 'text']].values.tolist()

In [5]:
# reading dataset (in 'array')
# communicates with the backend

In [6]:
# ratings are strings of the form '4.0 out of 5 stars'; keep only the first character (the star count) rather than the complete sentence

for row in array:
    # row is [rating, text]; reduce the rating to its first character
    row[0] = row[0][0]

In [7]:
# rebuild a pandas DataFrame from the edited [rating, text] rows,
# naming the two columns explicitly

data=pd.DataFrame(array, columns=['rating','text'])

In [22]:
# calculating the ratings' count

# observed counts per rating value
rating_order = data['rating'].value_counts().to_dict()
# start every star level at zero so levels absent from the data still appear,
# then overlay the observed counts
rating = {**dict.fromkeys('12345', 0), **rating_order}
rating

{'1': 0, '2': 0, '3': 20, '4': 61, '5': 47}

In [9]:
# checking if there are any text rows with null value

# boolean masks: missing text, and text that is not a str
text_is_null = data['text'].isnull()
text_is_not_str = data['text'].apply(lambda value: not isinstance(value, str))

print(data[text_is_null])
print(data[text_is_not_str])

   rating text
22      4  NaN
   rating text
22      4  NaN


In [10]:
#filling the null values

# nulls first, then any remaining non-string entry, become the empty string
data['text'] = (
    data['text']
    .fillna('')
    .apply(lambda value: value if isinstance(value, str) else '')
)

In [20]:
# extracting keywords using RAKE

# shared RAKE extractor: min_length/max_length bound the phrase length,
# and the stopword and punctuation lists below replace the library defaults
rake_stopwords = stopwords.words('english')
rake_punctuations = [')','(',',',':','),',').','.']
r = Rake(
    min_length=3,
    max_length=4,
    stopwords=rake_stopwords,
    punctuations=rake_punctuations,
)


def extract_keywords(text):
    """Run the shared extractor ``r`` on ``text`` and return its ranked phrases."""
    r.extract_keywords_from_text(text)
    ranked_phrases = r.get_ranked_phrases()
    return ranked_phrases


data['keywords'] = data['text'].apply(extract_keywords)

In [21]:
# preview the first rows; the saved output below already lists the
# 'Cleaned Reviews' and 'POS tagged' columns, which are only created by later
# cells, so this cell (In [21]) was last run out of order
data.head()

Unnamed: 0,rating,text,keywords,Cleaned Reviews,POS tagged
0,4,Nice,[],nice,[]
1,4,Nice phone good camera quality,[],nice phone good camera quality,"[nice phone, good camera]"
2,4,Everything ok but camera quality not good,[],everything ok but camera quality not good,[]
3,4,Feel.....niceBuild..... excellentAudio quality...,"[say goodback cam ...., min hick ups ..., soft...",feel nicebuild excellentaudio quality bestnetw...,"[excellentaudio quality, good manner]"
4,4,battery drainage too fast. as par 5000 mah bat...,[3300 mah battery],battery drainage too fast as par 5000 mah batt...,[good otherwise]


In [12]:
# cleaning the data

def clean(text):
    """Normalise a review or phrase.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the text is lowercased, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: the string to normalise.

    Returns:
        The cleaned string ('' for empty or all-punctuation input).
    """
    # raw strings: '\s' inside a plain literal is an invalid escape sequence
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)  # removes special characters
    text = text.lower()  # converts to lowercase
    text = re.sub(r'\s+', ' ', text)  # collapses extra whitespace
    # strip so that text starting/ending with punctuation does not keep a stray
    # space (which made otherwise identical phrases compare as different)
    return text.strip()

# element-wise: one cleaned string per review text
data['Cleaned Reviews'] = data['text'].map(clean)

In [13]:
# tokenization

def tokenize_text(text):
    """Split ``text`` into a list of tokens using ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text)

In [14]:
# POS tagging

def pos_tag(tokens):
    """Tag ``tokens`` with ``nltk.pos_tag`` and return the (token, tag) pairs."""
    return nltk.pos_tag(tokens)

In [15]:
# extracting keywords from the Cleaned Reviews such that the adjective is followed by a noun

# tag prefixes that mark the first word of a pair ('J' = the J* tags,
# which this notebook treats as adjectives)
pos_dict = {'J': 'A'}

def token_stop_adjectives(text):
    """Collect "<adjective> <noun>" pairs from ``text``.

    The text is tokenized and POS-tagged. A token is kept as the first word of
    a pair when it is not an English stopword and its tag starts with a key of
    ``pos_dict``; it is then joined (with a single space) to the following
    token, provided that token's tag starts with 'N'.

    Args:
        text: the (cleaned) review text.

    Returns:
        A list of two-word strings in order of appearance; empty when no such
        pair exists.
    """
    tags = pos_tag(tokenize_text(text))
    # build the stopword set once per call; it was previously rebuilt from the
    # corpus for every token
    stop_words = set(stopwords.words('english'))
    newlist = []
    # walk each token together with its successor; the last token has no
    # successor and therefore can never start a pair
    for (word, tag), (next_word, next_tag) in zip(tags, tags[1:]):
        if word in stop_words or tag[0] not in pos_dict:
            continue
        if next_tag[0] == 'N':
            newlist.append(word + ' ' + next_word)
    return newlist

# element-wise: one list of two-word phrases per cleaned review
data['POS tagged'] = data['Cleaned Reviews'].map(token_stop_adjectives)
data.head()

Unnamed: 0,rating,text,keywords,Cleaned Reviews,POS tagged
0,4,Nice,[],nice,[]
1,4,Nice phone good camera quality,[],nice phone good camera quality,"[nice phone, good camera]"
2,4,Everything ok but camera quality not good,[],everything ok but camera quality not good,[]
3,4,Feel.....niceBuild..... excellentAudio quality...,"[say goodback cam ...., min hick ups ..., soft...",feel nicebuild excellentaudio quality bestnetw...,"[excellentaudio quality, good manner]"
4,4,battery drainage too fast. as par 5000 mah bat...,[3300 mah battery],battery drainage too fast as par 5000 mah batt...,[good otherwise]


In [16]:
# VADER sentiment analyzer instance; used below to score each extracted phrase
sia=SentimentIntensityAnalyzer()

In [17]:
# performing sentiment analysis

word_scores = defaultdict(list)

# score every phrase of both phrase columns ('POS tagged' first, then
# 'keywords'); a phrase that occurs several times collects one compound
# score per occurrence
for column in ('POS tagged', 'keywords'):
    for sentence in data[column]:
        for word in sentence:
            scores = sia.polarity_scores(word)
            word_scores[word].append(scores['compound'])


def _highest_score(w):
    """Largest compound score recorded for phrase ``w``."""
    return max(word_scores[w])


def _lowest_score(w):
    """Smallest compound score recorded for phrase ``w``."""
    return min(word_scores[w])


# top 50 phrases by highest score (descending) and by lowest score (ascending)
positive_words = sorted(word_scores, key=_highest_score, reverse=True)[:50]
negative_words = sorted(word_scores, key=_lowest_score)[:50]

In [25]:
# algorithm to calculate the number of positive and negative words to be returned on the basis of the Ratings

# per-star divisors: the count of reviews at each star level is divided by
# these to get that level's share of positive / negative words
pos={'1': 10, '2': 8, '3': 6, '4': 4,'5': 2}
neg={'1': 2, '2': 4, '3': 6, '4': 8,'5': 10}

# total number of positive / negative words to return, summed over star levels
count_pos = sum(round(rating[star] / pos[star]) for star in rating)
count_neg = sum(round(rating[star] / neg[star]) for star in rating)

print(count_pos)
print(count_neg)

42
16


In [19]:
# generating an array of product's features

p=0
n=0
returnwords=[]

# positive_words is sorted best-first, so stop at the first phrase that is over
# the quota or not clearly positive (highest compound score <= 0.5)
for word in positive_words:
    score = max(word_scores[word])
    if p < count_pos and score > 0.5:
        returnwords.append({'text': clean(word), 'value': round(score*100, 2)})
        p = p + 1
    else:
        break

# negative_words is sorted worst-first; stop at the quota or as soon as the
# lowest score is no longer below zero, so that neutral or positive phrases
# are never reported as negative features
for word in negative_words:
    score = min(word_scores[word])
    if n < count_neg and score < 0:
        returnwords.append({'text': clean(word), 'value': round(score*100, 2)})
        n = n + 1
    else:
        break

returnwords

[{'text': 'great thanks', 'value': 79.06},
 {'text': 'good super', 'value': 77.83},
 {'text': 'good super amoled display', 'value': 77.83},
 {'text': 'best value', 'value': 76.5},
 {'text': 'good hope', 'value': 70.03},
 {'text': 'amazon great india sale', 'value': 70.03},
 {'text': 'still good hope realme', 'value': 70.03},
 {'text': 'good feature good looking', 'value': 70.03},
 {'text': 'efficient good performance', 'value': 69.08},
 {'text': 'good clarity', 'value': 68.08},
 {'text': 'good brightness', 'value': 67.05},
 {'text': 'satisfied thank', 'value': 64.86},
 {'text': 'best phone', 'value': 63.69},
 {'text': 'best colours', 'value': 63.69},
 {'text': 'best stabilization', 'value': 63.69},
 {'text': 'best nahi', 'value': 63.69},
 {'text': 'best mobile', 'value': 63.69},
 {'text': 'best deal', 'value': 63.69},
 {'text': 'best smartphone', 'value': 63.69},
 {'text': 'best camera', 'value': 63.69},
 {'text': 'best multi media experience', 'value': 63.69},
 {'text': 'best nahi ', 