In [18]:
import pandas as pd

In [19]:
# Read the comments workbook into a DataFrame.
comments_path = 'Comments (2).xlsx'
df = pd.read_excel(comments_path)

In [20]:
# Preview the first rows to confirm the workbook loaded as expected.
df.head()

Unnamed: 0,comment_id,Comment
0,5,djfjkdfjkjkffdk edited
1,41,Faith has exhibited enthusiasm in taking on th...
2,49,He now has now understood the structure of gra...
3,50,The Intern was oriented on ICT setup and Infra...
4,52,The student was oriented on the organization s...


In [21]:
# Normalise the comment column to plain strings before text cleaning.
# Missing cells are replaced with "" first: casting NaN straight to str would
# produce the literal text "nan", which would then be treated as a real word
# by the cleaning and dictionary-building steps.
df["Comment"] = df["Comment"].fillna("").astype(str)

Cleaning the text data with various operations:

In [22]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    The decision is made on the first letter of ``pos_tag``:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [23]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    Steps, in order: lowercase; split on whitespace and strip leading/trailing
    punctuation from each token; drop empty tokens, tokens containing a digit
    and English stop words; POS-tag the remaining tokens; lemmatize each token
    using its POS tag; drop results that are a single character long.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # A set gives constant-time membership tests; the stop word corpus is
    # returned as a list, which would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines), so multi-line comments do not leave tokens glued together.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    tokens = [
        word for word in tokens
        if word                                   # punctuation-only tokens become ""
        and not any(c.isdigit() for c in word)    # drop tokens containing a digit
        and word not in stop_words
    ]

    # Tag first, then lemmatize with the tag mapped to a WordNet POS.
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # Discard one-letter leftovers and rebuild a single string.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

In [24]:
# Store a cleaned version of every comment in a new column.
df["review_clean"] = df["Comment"].apply(clean_text)

In [25]:
# Compare the raw "Comment" column with the new "review_clean" column.
df.head()

Unnamed: 0,comment_id,Comment,review_clean
0,5,djfjkdfjkjkffdk edited,djfjkdfjkjkffdk edit
1,41,Faith has exhibited enthusiasm in taking on th...,faith exhibit enthusiasm take project hand alr...
2,49,He now has now understood the structure of gra...,understood structure grail different component...
3,50,The Intern was oriented on ICT setup and Infra...,intern orient ict setup infrastructure soroti ...
4,52,The student was oriented on the organization s...,student orient organization structure develop ...


<p>To clean textual data, we call our custom ‘clean_text’ function that performs several transformations:</p>

<ul>
<li>lower the text</li>
<li>tokenize the text (split the text into words) and remove the punctuation</li>
<li>remove useless words that contain numbers</li>
<li>remove useless stop words like ‘the’, ‘a’, ‘this’ etc.</li>
<li>Part-Of-Speech (POS) tagging: assign a tag to every word to define if it corresponds to a noun, a verb etc. using NLTK's <code>pos_tag</code>; each tag is then mapped to the matching WordNet part of speech so the lemmatizer can use it</li>
<li>lemmatize the text: transform every word into its root form (e.g. rooms -> room, slept -> sleep)</li>
</ul>
<p>Now that we have cleaned our data, we can do some feature engineering for the modeling step.</p>

In [29]:
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
import nltk

Tokenize the comments

Let’s tokenize each sentence into a list of words

In [49]:
# One list of tokens per cleaned comment.
result = [sentence.split() for sentence in df["review_clean"]]
    

In [50]:
# Build the token <-> integer id mapping from the tokenized comments.
dictionary = corpora.Dictionary(result)

dictionary

<gensim.corpora.dictionary.Dictionary at 0x24737b16290>

In [51]:
# Sanity-check the dictionary by printing its first six (id, token) pairs.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 5:
        break

0 djfjkdfjkjkffdk
1 edit
2 already
3 concept
4 core
5 discover


In [52]:
# Prune the vocabulary in place: very rare tokens (no_below) and very common
# tokens (no_above, a fraction of the corpus) are removed from the dictionary.
dictionary.filter_extremes(no_above=0.1, no_below=15)

# Encode every tokenized comment as a bag-of-words vector.
bow_corpus = list(map(dictionary.doc2bow, result))

In [53]:
# Display the bag-of-words corpus: one list of (token_id, count) pairs per comment.
bow_corpus

[[],
 [(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)],
 [(12, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(20, 1),
  (26, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(38, 1)],
 [(13, 1), (39, 1)],
 [(2, 1), (24, 1), (40, 1), (41, 1)],
 [(42, 1)],
 [(43, 1), (44, 1), (45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)],
 [(24, 1), (49, 1), (54, 1), (55, 1), (56, 1)],
 [(49, 1)],
 [(24, 1),
  (45, 1),
  (47, 1),
  (50, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1)],
 [(12, 1), (13, 1), (38, 1), (56, 1)],
 [(46, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)],
 [(67, 1), (68, 1), (69, 1), (70, 1), (71, 1)],
 [(25, 1), (46, 1), (64, 1), (72, 1), (73, 1)],
 [(35, 1),
  (64, 1),


Our Corpus is as below:

In [59]:
## check the bow_corpus
# NOTE: despite its name, bow_doc_100 holds the document at index 90.
bow_doc_100 = bow_corpus[90]

# Each entry is a (token_id, count) pair; look the token text up in the dictionary.
for token_id, frequency in bow_doc_100:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     frequency))

Word 110 ("something") appears 1 time.
Word 245 ("asks") appears 1 time.
Word 246 ("sure") appears 1 time.


In [64]:
# Show only the first three documents of the bag-of-words corpus.
bow_corpus[:3]

[[],
 [(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]

In [65]:
# Human readable format of corpus (term-frequency):
# swap each token id for its token text in the first three documents.
[
    [(dictionary[token_id], freq) for token_id, freq in document]
    for document in bow_corpus[:3]
]

[[],
 [('already', 1),
  ('concept', 1),
  ('enthusiasm', 1),
  ('exhibit', 1),
  ('forward', 1),
  ('go', 1),
  ('hand', 1),
  ('look', 1),
  ('project', 1),
  ('show', 1),
  ('sign', 1),
  ('software', 1),
  ('take', 1),
  ('time', 1),
  ('understand', 1)],
 [('component', 1),
  ('different', 1),
  ('get', 1),
  ('process', 1),
  ('relate', 1),
  ('structure', 1),
  ('understood', 1)]]

We now build a model to cluster the comments

Create a Named Entity Recognition (NER) model that takes in a comment as an input and outputs the Entities, if any, belonging to the categories: Person, Organization, Place/Location, Time 

In [84]:
#importing necessary libraries 
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # load the pre-trained English pipeline; `nlp` is called on text below to get a Doc with entities

In [85]:
# Run the pipeline on a sample sentence and list every detected entity
# with its character offsets and label.
sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)

for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [86]:
# Compact view: entity text and label only.
for entity in doc.ents:
    print(entity.text, "|", entity.label_)

Apple | ORG
U.K. | GPE
$1 billion | MONEY
