# Data Loading

In [3]:
import pandas as pd

In [4]:
df =  pd.read_csv("C:\\Users\\Shubham Verma\\Project\\TextAnalytics\\amazon_dataset\\Womens Clothing E-Commerce Reviews.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# EDA

In [6]:
df['Review Text'].isna().sum()

845

In [7]:
# Dropping the missing values in the rows
df = df.dropna(subset=['Review Text','Title'], axis=0)
df = df.reset_index(drop=True)

In [8]:
df.isna().sum()

Unnamed: 0                  0
Clothing ID                 0
Age                         0
Title                       0
Review Text                 0
Rating                      0
Recommended IND             0
Positive Feedback Count     0
Division Name              13
Department Name            13
Class Name                 13
dtype: int64

# Data Preprocessing

In [9]:
# taking only one clothing id
df = df[df['Clothing ID']==1078][['Title','Review Text']]

In [10]:
# data cleaning
def data_preprocessing(para):
    import nltk
    from nltk.corpus import stopwords
    import string
    import re
    import spacy
    nlp = spacy.load("en_core_web_sm")
    
    para = str(para)
    stop_words = list(set(stopwords.words('english')))
    stop_words.extend(['say','alright','would','could',"okay","maybe","may","cant"])
    
    def clean(para):
        punct = set(string.punctuation)
        punct = list(set(string.punctuation))
#         punct.remove('.')
#         para = re.sub('[^a-zA-Z.]', ' ',para)
        para = para.lower().split('.')
        punc_free = [i for i in para if i not in punct]
        clean_doc = ". ".join(punc_free)
        return clean_doc
    
    clean_para = clean(para)

    allowed_postags = ['NOUN','VERB','ADV','ADJ','PROPN']
    
    def lemmatization(paragraph, allowed_postags):
        texts_lemmatized = []
        texts_stopfree = []
        doc = nlp(paragraph)
        texts_lemmatized = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_stopfree = [text for text in texts_lemmatized if text not in stop_words]
        texts_stopfree = " ".join(texts_stopfree)
        return texts_stopfree
     
    data_lemmatized = lemmatization(clean_para, allowed_postags)
    return data_lemmatized 

In [130]:
data_preprocessing("I really wanted this to work. alas, it had a strange fit for me. the straps would not stay up, and it had a weird fit under the breast. it worked standing up, but the minute i sat down it fell off my shoulders. the fabric was beautiful! and i loved that it had pockets.")

'really want work strange fit strap stay weird fit breast work stand minute sit fall shoulder fabric beautiful love pocket'

In [28]:
# reviewlist = [sent for sent in df['Review Text']]

In [29]:
# reviewlist[:3]

In [15]:
# reviewlist = [data_preprocessing(para) for para in reviewlist]

In [12]:
# df['reviewlist'] = reviewlist

# Aspect-Opinion Mining

In [11]:
# !python -m spacy download en_core_web_sm

In [19]:
import stanza

In [23]:
stanza.download('en')     

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-03-10 14:03:41 INFO: Downloading default packages for language: en (English)...
2022-03-10 14:03:42 INFO: File exists: C:\Users\Shubham Verma\stanza_resources\en\default.zip.
2022-03-10 14:03:47 INFO: Finished downloading models and saved to C:\Users\Shubham Verma\stanza_resources.


In [24]:
nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English

2022-03-10 14:03:47 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-03-10 14:03:47 INFO: Use device: cpu
2022-03-10 14:03:47 INFO: Loading: tokenize
2022-03-10 14:03:47 INFO: Loading: pos
2022-03-10 14:03:48 INFO: Loading: lemma
2022-03-10 14:03:48 INFO: Loading: depparse
2022-03-10 14:03:48 INFO: Loading: sentiment
2022-03-10 14:03:49 INFO: Loading: constituency
2022-03-10 14:03:49 INFO: Loading: ner
2022-03-10 14:03:50 INFO: Done loading processors!


In [131]:
#testing on cleaned paragraph : not working well as POS tags are changed after cleaning
nlp('really want work strange fit strap stay weird fit breast work stand minute sit fall shoulder fabric beautiful love pocket').sentences[0].print_dependencies()

('really', 2, 'advmod')
('want', 0, 'root')
('work', 6, 'compound')
('strange', 6, 'amod')
('fit', 6, 'compound')
('strap', 2, 'obj')
('stay', 2, 'xcomp')
('weird', 11, 'amod')
('fit', 11, 'compound')
('breast', 11, 'compound')
('work', 13, 'compound')
('stand', 13, 'compound')
('minute', 14, 'compound')
('sit', 20, 'amod')
('fall', 20, 'compound')
('shoulder', 20, 'compound')
('fabric', 20, 'compound')
('beautiful', 20, 'amod')
('love', 20, 'compound')
('pocket', 2, 'parataxis')


In [25]:
#creating reviews list 
reviewText = [review for review in df['Review Text']]

In [179]:
# reviewText

In [122]:
# aspect based opinion mining by splitting each row on '.'
senti_aspects=[]
parag_aspects=[]

def aspect_opinion_mining(reviewText):
    for para in reviewText:
    #     print(para)
        for sen in para.split('.'): 
            important = nlp(sen)
            descriptive_item = ''
            target = ''
            added_terms=''
            for sent in important.sentences:
    #             print(sent)
                for wrd in sent.words:
    #                 print(wrd.text,wrd.pos,wrd.deprel)
                    if ((wrd.deprel == 'nsubj' and wrd.pos == 'NOUN') or(wrd.deprel == 'nsubj' and wrd.pos == 'PROPN')):
                        target = wrd.text
                    if wrd.pos=='ADV' and wrd.deprel =='advmod':
                        added_terms = wrd.text
                    if wrd.pos == 'ADJ':
                        descriptive_item = added_terms +' '+ wrd.text
                    if target!='' and wrd.pos=='NOUN' and wrd.deprel =='root':
                        descriptive_item = added_terms +' '+ wrd.text                    
                    if target!='' and descriptive_item!='':
                        senti_aspects.append({'aspect': target,'opinion': descriptive_item})
                        descriptive_item = ''
                        target = ''
                        added_terms=''
    #     if(senti_aspects!=[])
        parag_aspects.append(senti_aspects)
        senti_aspects=[]
    return parag_aspects

In [181]:
parag_aspects = aspect_opinion_mining(reviewText)

In [166]:
len(parag_aspects)

871

In [144]:
parag_aspects

[[{'aspect': 'straps', 'opinion': ' weird'},
  {'aspect': 'fabric', 'opinion': ' beautiful'}],
 [{'aspect': 'one', 'opinion': ' cute'},
  {'aspect': 'material', 'opinion': 'well difficult'}],
 [{'aspect': 'petite', 'opinion': ' grey'}],
 [{'aspect': 'problem', 'opinion': ' hem'}],
 [{'aspect': 'pattern', 'opinion': 'just fun'},
  {'aspect': 'skirt', 'opinion': ' flattering'},
  {'aspect': 'material', 'opinion': 'too heavy'},
  {'aspect': 'reviews', 'opinion': ' other'},
  {'aspect': 'dress', 'opinion': ' short'}],
 [],
 [{'aspect': 'colors', 'opinion': ' cheerful'}],
 [{'aspect': 'dress', 'opinion': 'really tall'}],
 [{'aspect': 'colors', 'opinion': ' fun'},
  {'aspect': 'top', 'opinion': ' fine'},
  {'aspect': 'material', 'opinion': ' great'}],
 [],
 [{'aspect': 'dress', 'opinion': 'really cute'}],
 [{'aspect': 'dress', 'opinion': 'very comfortable'},
  {'aspect': 'issue', 'opinion': ' only'},
  {'aspect': 'skirt', 'opinion': 'quite short'},
  {'aspect': 'dress', 'opinion': ' sturdy'}

In [124]:
# joining aspects with opinions to create meaningful labels
paragraph,sentence=[],[]
for para in parag_aspects:
    for asp in para:
#         print(asp)
        label = '-'.join(asp.values())
        sentence.append(label)
    paragraph.append(sentence)
    sentence=[]

In [127]:
df['aspect-opinion']=paragraph

In [180]:
df[['Review Text','aspect-opinion']]

Unnamed: 0,Review Text,aspect-opinion
54,"I really wanted this to work. alas, it had a s...","[straps- weird, fabric- beautiful]"
71,"I love cute summer dresses and this one, espec...","[one- cute, material-well difficult]"
374,"Nice fit and flare style, not clingy at all. i...",[petite- grey]
377,When i first opened this dress and tried it on...,[problem- hem]
381,I love this sweater dress and get compliments ...,"[pattern-just fun, skirt- flattering, material..."
...,...,...
19256,I love this dress. i agree with the other revi...,"[embroidery- beautiful, complaint- only]"
19261,"Beautiful, unique design. it's very flattering...","[fabric-very flattering, lining-much smaller, ..."
19266,This is the most beautiful dress i've ever own...,[detail- exquisite]
19651,I purchased this for a very good price and i t...,"[fabric- thin, photo- least]"


In [147]:
#extracting aspects and opinions for each row.
# para_aspects,sent_aspects = [],[]
# para_opinions,sent_opinions = [],[]
# for para in parag_aspects:
#     for asp in para:
#         sent_aspects.append(asp['aspect'])
#         sent_opinions.append(asp['opinion'])
#     para_aspects.append(sent_aspects)
#     para_opinions.append(sent_opinions)
#     sent_aspects=[]
#     sent_opinions=[]

In [150]:
# df['aspects'] = para_aspects

In [151]:
# df['opinions'] = para_opinions

In [163]:
#combining dict's aspect-values and appending opinion-values to gain insights about each aspect.
from collections import defaultdict
d=defaultdict(list)
for para in parag_aspects:
    for dictn in para:
        d[dictn['aspect']].append(dictn['opinion'])

In [164]:
d

defaultdict(list,
            {'straps': [' weird',
              ' front',
              ' adjustable',
              'usually adjustable'],
             'fabric': [' beautiful',
              ' thick',
              ' Comfortable',
              'very soft',
              ' buttery',
              ' heavy',
              'so much',
              'so disappointed',
              ' other',
              'simply gorgeous',
              'quite soft',
              ' lower',
              ' high',
              ' best',
              ' soft',
              ' substantial',
              'very stiff',
              ' thin',
              ' nice',
              ' thin',
              'quite loose',
              'still weird',
              'very soft',
              ' soft',
              ' light',
              ' lovely',
              'too much',
              'absolutely gorgeous',
              'too much',
              ' thin',
              'very stiff',
              ' maroon',
    

In [129]:
#saving the final output in csv
df.to_csv('Womens Clothing E-Commerce Reviews Aspect-Opinion.csv')