In [54]:
from bs4 import BeautifulSoup as bs
import re,csv, os, itertools, pandas as pd,docx2txt
from tqdm import tqdm
from pattern.web import PDF
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from spacy.lang.id import Indonesian
from html import unescape
from unidecode import unidecode
from bz2 import BZ2File as bz2
from textblob import TextBlob
import spacy

def LoadStopWords(lang):
    L = lang.lower().strip()
    if L == 'en' or L == 'english' or L == 'inggris':
        lemmatizer = WordNetLemmatizer()
        stops =  set([t.strip() for t in LoadDocuments(file = 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/stopwords_eng.txt')[0]])
    elif L == 'id' or L == 'indonesia' or L=='indonesian':
        lemmatizer = Indonesian() 
        stops = set([t.strip() for t in LoadDocuments(file = 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/stopwords_id.txt')[0]])
    else:
        print('Warning, language not recognized. Empty StopWords Given')
        stops = set(); lemmatizer = None
    return stops, lemmatizer

def fixTags(T):
    getHashtags = re.compile(r"#(\w+)")
    pisahtags = re.compile(r'[A-Z][^A-Z]*')
    t = T
    tagS = re.findall(getHashtags, T)
    for tag in tagS:
        proper_words = ' '.join(re.findall(pisahtags, tag))
        t = t.replace('#'+tag,proper_words)
    return t

def readBz2(file):
    with bz2(file, "r") as bzData:
        txt = []
        for line in bzData:
            try:
                txt.append(line.strip().decode('utf-8','replace'))
            except:
                pass
    return ' '.join(txt)

def LoadDocuments(dPath=None,types=None, file = None): # types = ['pdf','doc','docx','txt','bz2']
    Files, Docs = [], []
    if types:
        for tipe in types:
            Files += crawlFiles(dPath,tipe)
    if file:
        Files = [file]
    if not types and not file: # get all files regardless of their extensions
        Files += crawlFiles(dPath)
    for f in Files:
        if f[-3:].lower()=='pdf':
            try:
                Docs.append(PDF(f).string)
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='txt' or f[-3:].lower()=='dic':
            try:
                df=open(f,"r",encoding="utf-8", errors='replace')
                Docs.append(df.readlines());df.close()
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='bz2':
            try:
                Docs.append(readBz2(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-4:].lower()=='docx':
            try:
                Docs.append(docx2txt.process(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='csv':
            Docs.append(pd.read_csv(f))
        else:
            print('Unsupported format {0}'.format(f))
    if file:
        Docs = Docs[0]
    return Docs, Files

def DelPic(text): #untuk menghilangkan informasi gambar
    D = text.split()
    D = [d for d in D if 'pic.twitter.com' not in d]
    return ' ' .join(D)

def LoadSlang(DirSlang):
    Slangs =LoadDocuments(file = DirSlang)
    SlangDict={}
    for slang in Slangs[0]:
        try:
            key, value = slang.split(':')
            SlangDict[key.strip()] = value.strip()
        except:
            pass
    return SlangDict

#POS Tagging
from nltk.tag import CRFTagger
def postag(text):
    #Tokenisasi Data
    tokenized_sents = word_tokenize(text)
    #pemberian Tag tiap token
    ct = CRFTagger()
    ct.set_model_file('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/CRFTagger-1.0/CRFTagger/model/model.txt') 
    #directorynya disesuaikan meletakan file crfnya, harus download dlu file crfnya
    pt = ct.tag(tokenized_sents)
    ptN = []
    noun = set(['NN','NNP', 'NNS','NNPS'])
    tmp = []
    for w in pt:
        if w[1] in noun:
            tmp.append(w[0])
    if len(tmp)>0:
        ptN.append(' '.join(tmp))
    return ' '.join(ptN)

def cleanText(T, fix={}, lang = 'id', lemma=None, stops = set(), symbols_remove = False, min_charLen = 0): 
    # lang & stopS only 2 options : 'en' atau 'id'
    # symbols ASCII atau alnum
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern,' ',T) #remove urls if any
    t = DelPic(t)
    t = unescape(t) # html entities fix
    t = fixTags(t) # fix abcDef
    t = t.lower().strip() # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t)) # remove repetition
    t = sent_tokenize(t) # sentence segmentation. String to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,a-zA-Z0-9 \n\.]',' ',K)
        
        cleanList = []
        if lang =='en':
            listKata = word_tokenize(K) # word tokenize
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemma:
                    token = lemma.lemmatize(token)
                if stops:
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
        else:
            if lemma:
                K = lemma(K)
                listKata = [token.text for token in K]
            else:
                listKata = TextBlob(K).words
                
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                
                if lemma:
                    token = lemma(token)[0].lemma_
                if stops:    
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
    return ' '.join(t) 

In [55]:
data = pd.read_csv('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/review_tokped.csv')
listReview=data['Review']

In [56]:
#Tanpa POS Tag
import pickle
from tqdm import tqdm_notebook as tqdm
import nltk
from nltk import word_tokenize

Slangs=LoadSlang( 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/slang.txt')
listRev = [d for d in data['Review']]

stops, lemmatizer = LoadStopWords(lang='en')
for i,d in tqdm(enumerate(listRev)):
    try:
        #i=i.translate(to='en')
        #i= nltk.word_tokenize(i)
        listRev[i] = cleanText(d,Slangs, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)
    except:
        print(i)
#print("done!!! {0} review".format(i+1))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [57]:
stops, lemmatizer = LoadStopWords(lang='en')
for i,d in tqdm(enumerate(listRev)):
    try:
        #i=i.translate(to='en')
        #i= nltk.word_tokenize(i)
        listRev[i] = cleanText(d,fix={}, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)
    except:
        print(i)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [58]:
listRev

['recommended',
 'relied cheap mundane item pricy gadget extra care handling shipping delivered expected satisfied choice store courier user friendly design comprehensive',
 'ovo shovel treatment dissapointed narrow money expect happy use point platform awful asinine use cashback turn cannae ongoing payment purchase item put..full',
 'feature failed much like chat feature crashed wouldn load improvement couldn load chat experience time month uninstall install still..full',
 'good functioned well minimum crash',
 'fly brother great work proudness heart',
 'favourite update waiting long enjoy online',
 'simple market place great job workin',
 'instant delivery not adjustment easier understand',
 'stupid order like month order package',
 'wonderful indonesia',
 'good',
 'best',
 'nice uninstall change tokocash ovo',
 'apps stable secure online transaction',
 'good useful',
 'great complete feature product',
 'good aplication..easy use',
 'server error annoying good',
 'mantaplah tampilan 

listRev

In [59]:
#cek masih ada stopword gak di listRev
splitdata = []
for idx, i in enumerate(listRev):
    try:
        i = i.split()
        splitdata.append(i)
        for d in i:
            if d in stopwords[0]:
                print("{},{}".format(d,idx))
    except:
        None

udah gak ada stopword, tapi masih ada simbol ..
coba tanpa dihapus dulu bisa diinterpretasiin gak nanti

In [60]:
#coba hapus stopword lang='id'
stops, lemmatizer = LoadStopWords(lang='id')
for i,d in tqdm(enumerate(listRev)):
    try:
        #i=i.translate(to='en')
        #i= nltk.word_tokenize(i)
        listRev[i] = cleanText(d,fix={}, lemma=lemmatizer,lang='id', stops = stops, symbols_remove = True, min_charLen =2)
    except:
        print(i)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [61]:
#save to csv file 
data_clean=pd.DataFrame(listRev,columns=['Cleaned_review'])
Clean=pd.concat([data['Date'],data['Rating'],data['Review'],data_clean], axis=1)

In [62]:
def data_to_csv(filename, data):
    """Export entire DataFrame to csv."""
    output = data
    output.to_csv(filename, index=True)
data_to_csv('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/tokped_bersih3.csv', Clean)

In [63]:
Clean

Unnamed: 0,Date,Rating,Review,Cleaned_review
0,17-Oct-18,5,Recomended app,recommended
1,12-Nov-18,5,I have relied on Tokopedia to obtain from chea...,relied cheap mundane item pricy gadget extra c...
2,13-Nov-18,2,What's with OVO shovel treatment? Dissapointed...,ovo shovel treatment dissapointed narrow money...
3,13-Nov-18,2,The features failed way too much (like the cha...,feature failed much like chat feature crashed ...
4,17-Nov-18,5,"good app, functioned well minimum crash",good functioned well minimum crash
5,21-Nov-18,5,"Fly brethren, keep up the great work. Proudnes...",fly brother great work proudness heart
6,22-Nov-18,5,"Still favourite one! Even if there's update, t...",favourite update waiting long enjoy online
7,22-Nov-18,5,Most Simple UI Marketplace i've ever used. Gre...,simple market place great job workin
8,22-Nov-18,5,"Instant delivery is not shown in application ,...",instant delivery not adjustment easier understand
9,22-Nov-18,1,Heck this app is so stupid where's my order it...,stupid order like month order package


In [64]:
#cek data nan
text = pd.read_csv('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/tokped_bersih3.csv')
nan=text['Cleaned_review']
splitting = []

for idx, i in enumerate(nan):
    try:
        i = i.split()
        splitting.append(i)
    except:
        print("{},{}".format(i,idx))

nan,75
nan,883
nan,938
nan,958
nan,1040
nan,1076
nan,1163
nan,1225
nan,1348
nan,1397
nan,1578
nan,1593
nan,1769
nan,1841
nan,1859
nan,1993
nan,2004
nan,2055
nan,2067
nan,2091
nan,2154
nan,2193
nan,2199
nan,2231
nan,2277
nan,2364
nan,2437
nan,2530
nan,2789
nan,2871
nan,2873
nan,2982


In [65]:
nan.isna().sum() #hitung jumlah nan/data kosong

32

In [66]:
# coba lihat VSM nya
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

df = pd.read_csv('tokped_bersih3.csv')
cleanreview = df['Cleaned_review']

listdf=cleanreview.values.astype('U')
listdf = [d for d in listdf]

Tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=5)
tfidf = Tfidf_vectorizer.fit_transform(listdf)
tfidf_term = Tfidf_vectorizer.get_feature_names()
print(tfidf.shape)

(3077, 325)


In [67]:
print(tfidf)

  (0, 237)	1.0
  (1, 40)	0.3278717567461708
  (1, 151)	0.2686588594347588
  (1, 34)	0.3753775377636099
  (1, 261)	0.3245257086571764
  (1, 74)	0.36046244470293454
  (1, 248)	0.2919350207004126
  (1, 43)	0.29935419139037467
  (1, 277)	0.2586576240172321
  (1, 65)	0.3488933997312536
  (1, 312)	0.19846701856963056
  (1, 113)	0.20323780746883618
  (2, 151)	0.2567704039599487
  (2, 201)	0.2806964423753798
  (2, 177)	0.2806964423753798
  (2, 127)	0.29910798075131645
  (2, 308)	0.30537537981461055
  (2, 216)	0.3043287542738001
  (2, 213)	0.29910798075131645
  (2, 36)	0.2567704039599487
  (2, 301)	0.351128260992732
  (2, 206)	0.24808877533178522
  (2, 227)	0.29910798075131645
  (2, 114)	0.26730314614786016
  (3, 114)	0.19394886524032037
  :	:
  (3059, 87)	0.5871416965134022
  (3060, 122)	1.0
  (3061, 184)	0.4870642322615265
  (3061, 268)	0.8733661509650404
  (3062, 131)	1.0
  (3063, 305)	0.657517860496867
  (3063, 204)	0.7534389577979246
  (3064, 140)	1.0
  (3065, 184)	1.0
  (3066, 122)	0.6031

In [68]:
# coba lihat VSM nya

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

df = pd.read_csv('tokped_bersih3.csv')
cleanreview = df['Cleaned_review']

listdf=cleanreview.values.astype('U')
listdf = [d for d in listdf]

Tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1) #nilai default max_df dan min_df
tfidf = Tfidf_vectorizer.fit_transform(listdf)
tfidf_term = Tfidf_vectorizer.get_feature_names()
print(tfidf.shape)

(3077, 2097)


max_df = 0.5 means "ignore terms that appear in more than 50% of the documents".

max_df = 25 means "ignore terms that appear in more than 25 documents".

The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.


min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".

min_df = 5 means "ignore terms that appear in less than 5 documents".

The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [69]:
print(tfidf)

  (0, 1479)	1.0
  (1, 1504)	0.27082555401379893
  (1, 317)	0.20538670676412082
  (1, 1186)	0.27082555401379893
  (1, 965)	0.16829433230208962
  (1, 1382)	0.27082555401379893
  (1, 756)	0.27082555401379893
  (1, 656)	0.24106679707706485
  (1, 287)	0.23514546370085493
  (1, 821)	0.24831392342524583
  (1, 1662)	0.2032906622481396
  (1, 486)	0.22580229283669273
  (1, 646)	0.25765709428940803
  (1, 1598)	0.18287507617556772
  (1, 329)	0.18752262206379705
  (1, 1780)	0.16202931933981507
  (1, 431)	0.21855516648851175
  (1, 1992)	0.12432448512747997
  (1, 742)	0.12731302134785727
  (1, 494)	0.24831392342524583
  (1, 389)	0.25765709428940803
  (2, 965)	0.15684235033934846
  (2, 1284)	0.17145702571280422
  (2, 1673)	0.2523965949562342
  (2, 1924)	0.2523965949562342
  :	:
  (3067, 782)	0.5162188952022756
  (3067, 1211)	0.8564566843899005
  (3068, 1211)	0.7267494443511859
  (3068, 1664)	0.6869026460389
  (3069, 1687)	0.45729068391371397
  (3069, 574)	0.2819767042360588
  (3069, 86)	0.843430120800

In [70]:
#baris kosong di vsm
baris_kosong = []
nBaris, nKolom = tfidf.shape
for i in range(nBaris):
    if sum(tfidf[i].data)==0:
        baris_kosong.append(i)
print('Jumlah baris kosong di VSM tfidf = ', len(baris_kosong))
print(baris_kosong)

Jumlah baris kosong di VSM tfidf =  0
[]


In [71]:
#ignore nan data

tfidf_nonZeroRows = tfidf[tfidf.getnnz(1)>0] # Remove Zero Rows
tfidf_nonZeroCols = tfidf[:,tfidf.getnnz(0)>0] # Remove Zero Columns. Becareful, it "might" change the VSM interpretation (word index)
tfidf_nonZeroRC = tfidf[tfidf.getnnz(1)>0][:,tfidf.getnnz(0)>0] # Remove Zero Rows and Columns
print(tfidf_nonZeroRows.shape, tfidf_nonZeroCols.shape, tfidf_nonZeroRC.shape)
# Jika jumlah kolom tidak berubah, maka interpretasi (index kata) VSM tidak berubah (aman :) ).

(3077, 2097) (3077, 2097) (3077, 2097)


In [72]:
tfidf[0]

<1x2097 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [73]:
print(tfidf[0])

  (0, 1479)	1.0


## ini pake tf

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tf_vectorizer = CountVectorizer(binary = False, lowercase=True, stop_words='english')
tf = tf_vectorizer.fit_transform(listdf)
print(tf.shape)

(3077, 2029)


In [75]:
print(tf)

  (0, 1432)	1
  (1, 374)	1
  (1, 477)	1
  (1, 721)	1
  (1, 1932)	1
  (1, 415)	1
  (1, 1730)	1
  (1, 315)	1
  (1, 1551)	1
  (1, 625)	1
  (1, 470)	1
  (1, 1615)	1
  (1, 797)	1
  (1, 273)	1
  (1, 635)	1
  (1, 734)	1
  (1, 1336)	1
  (1, 938)	1
  (1, 1151)	1
  (1, 303)	1
  (1, 1457)	1
  (2, 1385)	1
  (2, 1260)	1
  (2, 1214)	1
  (2, 269)	1
  :	:
  (3067, 1175)	1
  (3067, 758)	1
  (3068, 1617)	1
  (3068, 1175)	1
  (3069, 82)	1
  (3069, 557)	1
  (3069, 1640)	1
  (3070, 1928)	1
  (3071, 1599)	1
  (3071, 758)	1
  (3072, 758)	3
  (3073, 939)	1
  (3073, 948)	1
  (3073, 1033)	1
  (3073, 758)	1
  (3074, 187)	2
  (3075, 86)	1
  (3075, 1518)	1
  (3075, 758)	1
  (3075, 1012)	1
  (3076, 1650)	1
  (3076, 207)	1
  (3076, 177)	1
  (3076, 241)	2
  (3076, 969)	1
