In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.svm import SVC
import pandas as pd
import pickle
from collections import Counter
from bs4 import BeautifulSoup as bs
import re,csv, os, itertools, pandas as pd,docx2txt
from tqdm import tqdm
from pattern.web import PDF
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from spacy.lang.id import Indonesian
from html import unescape
from unidecode import unidecode
from bz2 import BZ2File as bz2
from textblob import TextBlob
import spacy
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from imblearn.metrics import classification_report_imbalanced


def LoadStopWords(lang):
    L = lang.lower().strip()
    if L == 'en' or L == 'english' or L == 'inggris':
        lemmatizer = WordNetLemmatizer()
        stops =  set([t.strip() for t in LoadDocuments(file = 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/stopwords_eng.txt')[0]])
    elif L == 'id' or L == 'indonesia' or L=='indonesian':
        lemmatizer = Indonesian() 
        stops = set([t.strip() for t in LoadDocuments(file = 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/stopwords_id.txt')[0]])
    else:
        print('Warning, language not recognized. Empty StopWords Given')
        stops = set(); lemmatizer = None
    return stops, lemmatizer

def fixTags(T):
    getHashtags = re.compile(r"#(\w+)")
    pisahtags = re.compile(r'[A-Z][^A-Z]*')
    t = T
    tagS = re.findall(getHashtags, T)
    for tag in tagS:
        proper_words = ' '.join(re.findall(pisahtags, tag))
        t = t.replace('#'+tag,proper_words)
    return t

def readBz2(file):
    with bz2(file, "r") as bzData:
        txt = []
        for line in bzData:
            try:
                txt.append(line.strip().decode('utf-8','replace'))
            except:
                pass
    return ' '.join(txt)

def LoadDocuments(dPath=None,types=None, file = None): # types = ['pdf','doc','docx','txt','bz2']
    Files, Docs = [], []
    if types:
        for tipe in types:
            Files += crawlFiles(dPath,tipe)
    if file:
        Files = [file]
    if not types and not file: # get all files regardless of their extensions
        Files += crawlFiles(dPath)
    for f in Files:
        if f[-3:].lower()=='pdf':
            try:
                Docs.append(PDF(f).string)
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='txt' or f[-3:].lower()=='dic':
            try:
                df=open(f,"r",encoding="utf-8", errors='replace')
                Docs.append(df.readlines());df.close()
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='bz2':
            try:
                Docs.append(readBz2(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-4:].lower()=='docx':
            try:
                Docs.append(docx2txt.process(f))
            except:
                print('error reading{0}'.format(f))
        elif f[-3:].lower()=='csv':
            Docs.append(pd.read_csv(f))
        else:
            print('Unsupported format {0}'.format(f))
    if file:
        Docs = Docs[0]
    return Docs, Files

def DelPic(text): #untuk menghilangkan informasi gambar
    D = text.split()
    D = [d for d in D if 'pic.twitter.com' not in d]
    return ' ' .join(D)

def LoadSlang(DirSlang):
    Slangs =LoadDocuments(file = DirSlang)
    SlangDict={}
    for slang in Slangs[0]:
        try:
            key, value = slang.split(':')
            SlangDict[key.strip()] = value.strip()
        except:
            pass
    return SlangDict

#POS Tagging
from nltk.tag import CRFTagger
def postag(text):
    #Tokenisasi Data
    tokenized_sents = word_tokenize(text)
    #pemberian Tag tiap token
    ct = CRFTagger()
    ct.set_model_file('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/CRFTagger-1.0/CRFTagger/model/model.txt') 
    #directorynya disesuaikan meletakan file crfnya, harus download dlu file crfnya
    pt = ct.tag(tokenized_sents)
    ptN = []
    noun = set(['NN','NNP', 'NNS','NNPS'])
    tmp = []
    for w in pt:
        if w[1] in noun:
            tmp.append(w[0])
    if len(tmp)>0:
        ptN.append(' '.join(tmp))
    return ' '.join(ptN)

def cleanText(T, fix={}, lang = 'id', lemma=None, stops = set(), symbols_remove = False, min_charLen = 0): 
    # lang & stopS only 2 options : 'en' atau 'id'
    # symbols ASCII atau alnum
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    t = re.sub(pattern,' ',T) #remove urls if any
    t = DelPic(t)
    t = unescape(t) # html entities fix
    t = fixTags(t) # fix abcDef
    t = t.lower().strip() # lowercase
    t = unidecode(t)
    t = ''.join(''.join(s)[:2] for _, s in itertools.groupby(t)) # remove repetition
    t = sent_tokenize(t) # sentence segmentation. String to list
    for i, K in enumerate(t):
        if symbols_remove:
            K = re.sub(r'[^.,a-zA-Z0-9 \n\.]',' ',K)
        
        cleanList = []
        if lang =='en':
            listKata = word_tokenize(K) # word tokenize
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                if lemma:
                    token = lemma.lemmatize(token)
                if stops:
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
        else:
            if lemma:
                K = lemma(K)
                listKata = [token.text for token in K]
            else:
                listKata = TextBlob(K).words
                
            for token in listKata:
                if token in fix.keys():
                    token = fix[token]
                
                if lemma:
                    token = lemma(token)[0].lemma_
                if stops:    
                    if len(token)>=min_charLen and token not in stops:
                        cleanList.append(token)
                else:
                    if len(token)>=min_charLen:
                        cleanList.append(token)
            t[i] = ' '.join(cleanList)
    return ' '.join(t) 

stops, lemmatizer = LoadStopWords(lang='en')
Slangs=LoadSlang( 'C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/slang.txt')

def predictAspek(text):
      ##Preprocessing
      text = cleanText(text,Slangs, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)
      text = cleanText(text,fix={}, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)

      ##Load Vectorized
      tfidf = pickle.load(open("tfidf.pkl", "rb"))
#       print(tfidf)
      test = tfidf.transform([text,''])

      ##Predict
      filename = 'OvR_SVM.pkl'
      clf = pickle.load(open(filename, "rb"))
      label = clf.predict(test[0])
      return label[0]
    
def predictSent(text): # I give label to all data review,= before splitting, and then I saved the tfidf words and model for both aspect and sentiment classification, so after splitting later, label for sentence that has been splitting are given by model 
      ##Preprocessing
      text = cleanText(text,Slangs, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)
      text = cleanText(text,fix={}, lemma=lemmatizer,lang='en', stops = stops, symbols_remove = True, min_charLen =3)

      ##Load Vectorized
      tfidf = pickle.load(open("tfidf.pkl", "rb"))
#       print(tfidf)
      test = tfidf.transform([text,''])

      ##Predict
      filename = 'bnb_sentimen.pkl'
      clf = pickle.load(open(filename, "rb"))
      label = clf.predict(test[0])
      return label[0]

In [40]:
data = pd.read_excel('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/tokped_bersih3_label aspek 3.xlsx')
listReview=data['Review']

In [41]:
data.head()

Unnamed: 0,Date,Rating,Sentimen,aspek,Review,Cleaned_review
0,2018-10-17,5,1,helpful,Recomended app,recommended
1,2018-11-12,5,1,pelayanan,I have relied on Tokopedia to obtain from chea...,relied cheap mundane item pricy gadget extra c...
2,2018-11-13,2,-1,pelayanan,What's with OVO shovel treatment? Dissapointed...,ovo shovel treatment dissapointed narrow money...
3,2018-11-13,2,-1,tampilan,The features failed way too much (like the cha...,feature failed much like chat feature crashed ...
4,2018-11-17,5,1,tampilan,"good app, functioned well minimum crash",good functioned well minimum crash


In [42]:
nama_kolom = list(data.columns)

In [43]:
dates =[]
rating=[]
sentimen=[]
aspek=[]
review=[]
clean_rv=[]
for idx,dt in enumerate(data['Review']):
    l_dt = dt.split('.')
    for dts in l_dt:
        if dts!='':
            try:
                dates.append(data[nama_kolom[0]][idx]) 
                rating.append(data[nama_kolom[1]][idx])
                sentimen.append(data[nama_kolom[2]][idx])
                aspek.append(data[nama_kolom[3]][idx])
                review.append(dts)
                clean_rv.append(data[nama_kolom[5]][idx])
            except Exception as err:
                print(err)

In [44]:
dicti = {'Date':dates,'Rating':rating,'Sentimen':sentimen,'aspek':aspek,'Review':review,'clean_review':clean_rv} 
new_data = pd.DataFrame(dicti)

In [45]:
new_data.head()

Unnamed: 0,Date,Rating,Sentimen,aspek,Review,clean_review
0,2018-10-17,5,1,helpful,Recomended app,recommended
1,2018-11-12,5,1,pelayanan,I have relied on Tokopedia to obtain from chea...,relied cheap mundane item pricy gadget extra c...
2,2018-11-12,5,1,pelayanan,Everything were delivered as expected and I a...,relied cheap mundane item pricy gadget extra c...
3,2018-11-12,5,1,pelayanan,"Sometimes, Tokopedi",relied cheap mundane item pricy gadget extra c...
4,2018-11-12,5,1,pelayanan,Full Review,relied cheap mundane item pricy gadget extra c...


In [46]:
new_data.to_excel('split_kalimat.xlsx')
#file ini udah ke-split tapi kolom sentimen dan aspek blm kosong, jd ngikut kalimat awal isinya

### isi kolom aspek dan sentimen

#### aslinya yang dilabelin ada 3067, setelah displit kalimat (jumlah baris/data jadi 4425), aspek dan sentimen menyesuaikan label yang sudah diberikan jadi tidak sesuai dengan kalimatnya, makanya diberikan label aspek dan sentimen dari kalimat yang sudah displit dengan model yang sudah dibuat

In [4]:
df = pd.read_excel('C:/WinPython_64bit/notebooks/Google-Play-Store-Review-Extractor-master/split_kalimat 2.xlsx')
rev=df['Review']

In [48]:
rev.shape

(4425,)

In [50]:
from tqdm import tqdm
for i in tqdm(range(len(df))):
    if pd.isna(df['Sentimen'][i]):
        df['Sentimen'][i] = predictSent(rev[i])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.

  0%|                                                                                 | 2/4425 [00:00<06:50, 10.78it/s]
  0%|                                                                                 | 4/4425 [00:00<06:30, 11.33it/s]
  0%|                                                                                 | 6/4425 [00:00<06:26, 11.44it/s]
  0%|▏                                                                                | 8/4425 [00:00<06:24, 11.49it/s]
  0%|▏                                                                               | 10/4425 [00:00<06:20, 11.62it/s]
  0%|▏                                                                               | 12/4425 [00:01<06:19, 11.62it/s]
  0%|▎                                           

In [51]:
from tqdm import tqdm
for i in tqdm(range(len(df))):
    if pd.isna(df['aspek'][i]):
        df['aspek'][i] = predictAspek(rev[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
100%|██████████████████████████████████████████████████████████████████████████████| 4425/4425 [05:18<00:00, 13.90it/s]


In [52]:
df.to_excel('hasil.xlsx')

In [59]:
from sklearn.multiclass import OneVsRestClassifier

Tfidf_vectorizer = TfidfVectorizer(max_df=0.75, min_df=5)

listdf=listReview.values.astype('U')
listdf = [d for d in listdf]

tfidf = Tfidf_vectorizer.fit_transform(listdf)
tfidf_term = Tfidf_vectorizer.get_feature_names()

Pkl_Filename = 'tfidf2.pkl'
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(Tfidf_vectorizer, file)
print(Tfidf_vectorizer)

with open(Pkl_Filename, 'rb') as file:  
    vsm = pickle.load(file)
print(vsm)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


In [73]:
X = tfidf
y = data['Sentimen'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

In [78]:
negatif = []
positif = []

ros = RandomOverSampler(random_state=1)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
bnb = BernoulliNB()
NB = bnb.fit(X_ros, y_ros)
y_bnb = bnb.predict(X_test); del bnb

array([ 1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
       -1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1, -1,  1,  1,  1

In [96]:
for i in range(len(y_bnb)):
    if y_bnb[i] == -1: 
        rev[i]="negatif"
    else:
        rev[i]="positif"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### test input data

In [17]:
test_reviews = [
    "Good, fast service.",
    "get more cashback with many coupons.",
    "I can not use my coupon while i have right in term and condition for that."]

for i in range(3):
    print("Review " + str(i) + " memiliki aspek " + predictAspek(rev[i+1]) + " memiliki sentimen", predictSent(rev[i])) 

Review 0 memiliki aspek helpful memiliki sentimen 1
Review 1 memiliki aspek pengalaman belanja memiliki sentimen -1
Review 2 memiliki aspek helpful memiliki sentimen -1
