# Libraries needed

In [1]:
import os
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize, pos_tag, pos_tag_sents, PunktSentenceTokenizer

# Functions

In [2]:
# Load the review dataset. A bare "name.extension" is enough when the workbook
# sits in the same folder as this notebook; otherwise enter the full path.
filename = input("Enter your filename with extension: ")

# Allow up to 100 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 100)

# The sheet is read without a header row and its column is labelled 'Reviews'.
df = pd.read_excel(
    filename,
    header=None,
    names=['Reviews'],
)

Enter your filename with extension: Test Dataset 1.xlsx


In [3]:
# Func to remove Punctuation
def remove_punc(reviews):
    """Return ``reviews`` with every ``string.punctuation`` character removed.

    The remaining characters (letters, digits, whitespace, ...) keep their
    original order and are joined back into a single string.
    """
    kept_chars = []
    for ch in reviews:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [4]:
# Func to Tokenize the sentences to words
def tokenize(reviews):
    """Split ``reviews`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    reviews : str
        Text to split.

    Returns
    -------
    list of str
        The non-empty tokens, in order of appearance. An empty or
        separator-only input yields an empty list.
    """
    # Raw string: in a plain literal '\W' is an invalid escape sequence, which
    # newer Python versions warn about.
    pieces = re.split(r'\W+', reviews)
    # re.split emits '' when the text is empty or starts/ends with a separator;
    # those are not words and must not be handed to the POS tagger.
    return [piece for piece in pieces if piece]

In [5]:
# Func to remove stopwords
def remove_stopwords(Reviews_With_No_Punt_Tokenized):
    """Return the tokens that are not in NLTK's English stopword list.

    Parameters
    ----------
    Reviews_With_No_Punt_Tokenized : iterable of str
        Tokens of one review.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
        Matching is exact, so tokens are compared as given.
    """
    # A frozenset gives hashed membership tests instead of scanning the whole
    # stopword list once per token; the filtered result is the same.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    return [word for word in Reviews_With_No_Punt_Tokenized if word not in stopwords]

In [6]:
# Func to get all the available POS tags in all the reviews of dataset and also add new columns to original dataframe which 
# are named from list of the available Tags
def get_available_POS_and_append_to_dataframe(dataFrame):
    """Add one count column per POS tag found in ``dataFrame['POS Tagged']``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must have a 'POS Tagged' column whose cells are iterables of
        ``(word, tag)`` pairs.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place). For
        every distinct tag, in order of first appearance, a column named after
        the tag holds how many times that tag occurs in each row (0 if absent).

    Notes
    -----
    As a side effect the pandas option ``display.max_columns`` is set to
    ``None`` so that all of the added columns are shown when the frame is
    displayed.
    """
    # Fully qualified key: the bare 'max_columns' pattern matches more than one
    # option on recent pandas versions and raises OptionError there.
    pd.set_option('display.max_columns', None)

    # Count tags once per row; iterating the column itself is positional, so
    # this works for any index, not only the default 0..n-1 one.
    per_row_counts = [
        Counter(tag for _word, tag in tagged_tokens)
        for tagged_tokens in dataFrame['POS Tagged']
    ]

    # Distinct tags in order of first appearance (Counter keeps insertion order).
    possible_POS_Tags = []
    seen_tags = set()
    for row_counts in per_row_counts:
        for tag in row_counts:
            if tag not in seen_tags:
                seen_tags.add(tag)
                possible_POS_Tags.append(tag)

    # A Counter returns 0 for tags a row does not contain.
    for tag in possible_POS_Tags:
        dataFrame[tag] = [row_counts[tag] for row_counts in per_row_counts]
    return dataFrame

In [7]:
# Func to export dataframe to a file
def export_File(df_to_export, source_filename=None):
    """Write ``df_to_export`` to an Excel file named after the source file.

    The output path is the source path with ' POS Tagged' inserted before the
    extension, e.g. 'Test Dataset 1.xlsx' -> 'Test Dataset 1 POS Tagged.xlsx'.

    Parameters
    ----------
    df_to_export : pandas.DataFrame
        Frame to write; its ``to_excel`` method is called with the output path.
    source_filename : str, optional
        Path the output name is derived from. Defaults to the module-level
        ``filename`` entered when the dataset was loaded.
    """
    if source_filename is None:
        source_filename = filename
    # splitext only splits off the final extension, so extra dots in the file
    # name or in directory names stay part of the base path.
    base_path, extension = os.path.splitext(source_filename)
    # With no extension there is nothing to tell to_excel the format; fall
    # back to .xlsx instead of failing.
    output_path = base_path + ' POS Tagged' + (extension or '.xlsx')
    df_to_export.to_excel(output_path)

In [8]:
def POS_tagging(dataframe):
    """Tag every row's tokens with ``nltk.pos_tag`` into a 'POS Tagged' column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a 'Stopwords Removed' column whose cells are token lists.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with a 'POS Tagged' column holding
        the ``nltk.pos_tag`` result for each row.
    """
    tagged_rows = [nltk.pos_tag(tokens) for tokens in dataframe['Stopwords Removed']]
    # Assign the whole column at once. Writing cell by cell through
    # dataframe['POS Tagged'][i] is chained assignment (it may modify a
    # temporary copy) and looks rows up by label rather than by position.
    dataframe['POS Tagged'] = pd.Series(tagged_rows, index=dataframe.index, dtype=object)
    return dataframe

In [9]:
# Func to tag POS on realtime, it does punctuation removal, tokenization, stopword removal and then POS Tagging
def POS_for_Realtime_string(realtime_POS_Tagging):
    """Print the POS tags found in a single piece of text and their counts.

    The text is lower-cased, stripped of ``string.punctuation`` characters,
    split into tokens, filtered against NLTK's English stopword list and
    tagged with ``nltk.pos_tag``.

    Parameters
    ----------
    realtime_POS_Tagging : str
        The text to analyse.

    Returns
    -------
    None
        Two lists are printed: the distinct tags in order of first
        appearance, then the matching counts in the same order.
    """
    review_lowered = realtime_POS_Tagging.lower()
    without_punc = "".join(ch for ch in review_lowered if ch not in string.punctuation)
    # Raw-string pattern (a plain '\W' is an invalid escape sequence), and drop
    # the '' pieces re.split yields for leading/trailing whitespace so they are
    # not tagged and counted as words.
    tokens = [piece for piece in re.split(r'\W+', without_punc) if piece]
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in stopwords]
    counts = Counter(tag for _word, tag in nltk.pos_tag(content_tokens))
    POS_Tags = list(counts.keys())
    POS_Tags_Counts = list(counts.values())
    print(POS_Tags)
    print(POS_Tags_Counts)
    
    
    

# Applying Pre-Processing & POS Tagging

In [10]:
# Strip punctuation from every raw review.
df['Reviews With No Punt'] = df['Reviews'].apply(remove_punc)

# Lower-case each punctuation-free review, then split it into tokens.
df['Reviews With No Punt Tokenized'] = df['Reviews With No Punt'].apply(
    lambda review: tokenize(review.lower())
)

# Filter the stopwords out of each token list.
df['Stopwords Removed'] = df['Reviews With No Punt Tokenized'].apply(remove_stopwords)

# Tag the remaining tokens, then add one count column per POS tag.
df = POS_tagging(df)

df = get_available_POS_and_append_to_dataframe(df)

# Save the enriched frame via export_File.
export_File(df)

# Final Result

In [11]:
# Display the processed frame: the original reviews, each intermediate
# pre-processing column, the POS-tagged tokens and the per-tag count columns.
df

Unnamed: 0,Reviews,Reviews With No Punt,Reviews With No Punt Tokenized,Stopwords Removed,POS Tagged,VBN,NN,VBZ,JJ,NNS,RB,VBD,IN,VBP,MD,VB,JJR,VBG,JJS,CD,RBR,WDT,FW,RP,PRP
0,What I liked was the quality of the lens and the built in light. Then lens had no discernable d...,What I liked was the quality of the lens and the built in light Then lens had no discernable di...,"[what, i, liked, was, the, quality, of, the, lens, and, the, built, in, light, then, lens, had, ...","[liked, quality, lens, built, light, lens, discernable, distortion, anywhere, magnified, everyth...","[(liked, VBN), (quality, NN), (lens, VBZ), (built, VBN), (light, JJ), (lens, NNS), (discernable,...",5,30,3,24,11,10,5,2,6,1,4,1,2,1,0,0,0,0,0,0
1,"Love the Great point light pocket magnifier! works great, especially if you forget your glasses...",Love the Great point light pocket magnifier works great especially if you forget your glasses a...,"[love, the, great, point, light, pocket, magnifier, works, great, especially, if, you, forget, y...","[love, great, point, light, pocket, magnifier, works, great, especially, forget, glasses, cant, ...","[(love, RB), (great, JJ), (point, NN), (light, NN), (pocket, NN), (magnifier, NN), (works, VBZ),...",2,17,3,9,3,2,1,0,2,1,0,0,0,0,1,0,0,0,0,0
2,"I only gave this 4 stars instead of 5, because of the packaging it comes in. The plastic is sha...",I only gave this 4 stars instead of 5 because of the packaging it comes in The plastic is sharp...,"[i, only, gave, this, 4, stars, instead, of, 5, because, of, the, packaging, it, comes, in, the,...","[gave, 4, stars, instead, 5, packaging, comes, plastic, sharp, isnt, easy, carry, bag, without, ...","[(gave, VBD), (4, CD), (stars, NNS), (instead, RB), (5, CD), (packaging, NN), (comes, VBZ), (pla...",1,10,2,7,1,2,1,3,0,2,3,1,1,0,2,1,0,0,0,0
3,purchased this for someone who has macular degeneration and she actually cried when she used it ...,purchased this for someone who has macular degeneration and she actually cried when she used it ...,"[purchased, this, for, someone, who, has, macular, degeneration, and, she, actually, cried, when...","[purchased, someone, macular, degeneration, actually, cried, used, able, read, newspaper, couple...","[(purchased, VBN), (someone, NN), (macular, JJ), (degeneration, NN), (actually, RB), (cried, VBD...",1,15,2,10,4,5,3,3,1,0,1,1,3,0,0,0,0,0,0,0
4,I recently saw this at a local AC Moore store. They had one open so you could try it out. I on...,I recently saw this at a local AC Moore store They had one open so you could try it out I only...,"[i, recently, saw, this, at, a, local, ac, moore, store, they, had, one, open, so, you, could, t...","[recently, saw, local, ac, moore, store, one, open, could, try, tried, less, 10, minutes, enough...","[(recently, RB), (saw, VBD), (local, JJ), (ac, NN), (moore, NN), (store, NN), (one, CD), (open, ...",3,59,5,30,15,21,6,2,5,1,10,8,8,2,7,2,1,1,0,0
5,"ONE STAR:The Maxell LR44 10-pack photo shows the new hologram packaging, but I received the old ...",ONE STARThe Maxell LR44 10pack photo shows the new hologram packaging but I received the old ora...,"[one, starthe, maxell, lr44, 10pack, photo, shows, the, new, hologram, packaging, but, i, receiv...","[one, starthe, maxell, lr44, 10pack, photo, shows, new, hologram, packaging, received, old, oran...","[(one, CD), (starthe, NN), (maxell, NN), (lr44, VBZ), (10pack, CD), (photo, NN), (shows, VBZ), (...",1,16,2,14,6,1,5,0,1,0,1,1,0,0,4,0,0,0,0,0
6,"Bought 3 packs, prepared to live with a high percentage of bad ones considering that locally the...",Bought 3 packs prepared to live with a high percentage of bad ones considering that locally the ...,"[bought, 3, packs, prepared, to, live, with, a, high, percentage, of, bad, ones, considering, th...","[bought, 3, packs, prepared, live, high, percentage, bad, ones, considering, locally, batteries,...","[(bought, VBD), (3, CD), (packs, NNS), (prepared, VBD), (live, JJ), (high, JJ), (percentage, NN)...",3,9,0,10,7,3,4,1,1,0,1,1,3,0,3,0,0,0,0,0
7,"At 2500ah, these rechargeable NiMh batteries provide more power fully charged that regular alkal...",At 2500ah these rechargeable NiMh batteries provide more power fully charged that regular alkali...,"[at, 2500ah, these, rechargeable, nimh, batteries, provide, more, power, fully, charged, that, r...","[2500ah, rechargeable, nimh, batteries, provide, power, fully, charged, regular, alkalines, bicy...","[(2500ah, CD), (rechargeable, JJ), (nimh, JJ), (batteries, NNS), (provide, VBP), (power, NN), (f...",1,8,1,10,5,2,2,1,2,0,1,0,0,0,2,0,0,0,0,0
8,This product is advertised as the NEW Energizer Rechargeable NiMH AA Batteries. But once you rec...,This product is advertised as the NEW Energizer Rechargeable NiMH AA Batteries But once you rece...,"[this, product, is, advertised, as, the, new, energizer, rechargeable, nimh, aa, batteries, but,...","[product, advertised, new, energizer, rechargeable, nimh, aa, batteries, received, may, shocked,...","[(product, NN), (advertised, VBD), (new, JJ), (energizer, NN), (rechargeable, JJ), (nimh, JJ), (...",5,54,1,40,20,10,8,0,7,1,2,4,4,0,15,2,0,0,1,1


In [12]:


# Ad-hoc check on a single hand-written review (multi-line text with
# punctuation and mixed case) instead of a whole dataset.
realtime_POS_Tagging="""At 2500ah, these rechargeable NiMh batteries provide more power fully 
charged that regular alkalines.  
My bicycle headlight shines brighter when I use these.  
In my digital camera it makes it able to shoot 3 frames per second.  
Get the Sony charger to go with these.  They will last a long time.       

I am sold on NiMh rechargeable batteries!"""


# Prints the list of POS tags found, followed by the list of their counts.
POS_for_Realtime_string(realtime_POS_Tagging)
    


['CD', 'JJ', 'NNS', 'VBP', 'NN', 'RB', 'VBD', 'VBZ', 'IN', 'VB', 'VBN']
[2, 10, 5, 2, 8, 2, 2, 1, 1, 1, 1]
