In [1]:
import urllib.request as rq
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sys,regex as rgx,json, time
import pprint

class FBImporter:
    'Importing comments based on Facebook User'
    fbPrefixUrl = r'https://graph.facebook.com/v2.12/'
    fbCommentUrl = r'/comments?fields=message%2Cmessage_tags%2Ccreated_time%2Ccomment_count%2Cid'
    fbPostsUrl = r'/posts?'

    def __init__(self , pageUrl, access_token):
        self.pageUrl = pageUrl
        self.access_token = '&access_token=' + access_token

    def createPostURL(self):
        url = self.fbPrefixUrl + self.entityId + self.fbPostsUrl + self.access_token
        return url

    def createCommentURL(self, post_id):
        url = self.fbPrefixUrl + post_id + self.fbCommentUrl + self.access_token
        return url

    def setGroupId(self):
        """Get Facebook group id based on group link"""
        response = rq.urlopen(self.pageUrl)
        if response.getcode() == 200:
            html = response.read()
            soup = BeautifulSoup(html, 'html.parser')
            scripts = soup.find_all('script')
            rgxSearch = rgx.compile(r'entity_id(?:\'|"):(?:\'|")([\d]+)(?:\'|")')
            entityIds = [rgxSearch.search(x.text).group(1) for x in scripts if rgxSearch.search(x.text) is not None]
            self.entityId = entityIds[0]
            print(self.entityId)
        else:
            self.entityId = ""

    def getPostList(self):
        postUrl = self.createPostURL()
        print(postUrl)
        result = self.getItemPerPage(postUrl)
        return result

    def getCommentList(self, post_id):
        comment_url = self.createCommentURL(post_id)
        result = self.getItemPerPage(comment_url)
        return result

    def getItemPerPage(self, url, include_sub = False):
        """Import data from Facebook according to the Graph API URL"""
        next_link = None
        response_comment = rq.urlopen(url)
        if response_comment.getcode() == 200:
            decoded_json_comment = json.loads(response_comment.read().decode('utf-8'))
            if 'error' in decoded_json_comment:
                return None
            else:
                data = decoded_json_comment['data']                
                if 'paging' in decoded_json_comment and 'next' in decoded_json_comment['paging']:
                    next_link = decoded_json_comment['paging']['next']
                return (data, next_link)
        return None

    def getAllItems(self, start_url, item_type='post'):
        all_comments = []
        current_comments, next_link = self.getItemPerPage(start_url)
        all_comments += current_comments
        while next_link is not None:
            time.sleep(2)
            result = self.getItemPerPage(next_link)
            if result == None:
                next_link = None
                break
                
            current_comments, next_link = result    
            all_comments += current_comments
        
        # If comments, need to alter the result
        if item_type == 'comment':
            for comment in all_comments:
                comment['parent_id'] = comment['id']
                if 'message_tags' in comment:
                    msg, names = self.removetaggedName(comment['message'], comment['message_tags'])
                    comment['message'] = msg
                    comment['tagged_names'] = names
                    comment.pop('message_tags', None) ##remove tag information
                else:
                    msg = comment['message']
                    
        return all_comments
    
    def getSubItems(self, record_array):
        sub_array = []
        for item in record_array:
            if item['comment_count'] > 0:
                url = self.createCommentURL(item['id'])
                current_sub = self.getAllItems(url, item_type='comment')
                
                #save parent_id for reference             
                for current_sub_item in current_sub:
                    current_sub_item['parent_id'] = item['id'] 
                
                sub_array += current_sub  
                    
        return sub_array
    
    def removetaggedName(self, text, tags):
        names = '|'.join([ x['name'] for x in tags])
        msg_wo_name = rgx.sub(names, '', text).strip()
        return msg_wo_name, names

    def convertToWords(self, series):
        all_set=set([])
        for curr_tuple in series.iteritems():
            curr_line = rgx.sub(r'[\.]+', ' ', curr_tuple[1].lower())
            clean_line = curr_line.split(' ')
            all_set = all_set.union(set(clean_line))

        return pd.Series(list(all_set))

In [15]:
fb = FBImporter(r'https://www.facebook.com/menatesteakhub/', r'EAACEdEose0cBAJss0WC82ZB07ZB2UdpakxUevu1ZBhEVX7TnWJKmqcUdZC8A6RJijDy4q1ry7Fjh6UZCzohDc12lohyoc9s1aV4KzJAiBxBQ1ObDDFQ61HionmgSfrUMpswO2KSYOGPtVLcZCWe97L5jJmzZBBGeK8mHsQ7rZAxipCfx246l5oxcc0tMFCxPuOTvKCkBjm4IpwZDZD')
fb.setGroupId()
test = fb.getPostList()
ids = [ x['id'] for x in test[0] ]

242409295818219
https://graph.facebook.com/v2.12/242409295818219/posts?&access_token=EAACEdEose0cBAJss0WC82ZB07ZB2UdpakxUevu1ZBhEVX7TnWJKmqcUdZC8A6RJijDy4q1ry7Fjh6UZCzohDc12lohyoc9s1aV4KzJAiBxBQ1ObDDFQ61HionmgSfrUMpswO2KSYOGPtVLcZCWe97L5jJmzZBBGeK8mHsQ7rZAxipCfx246l5oxcc0tMFCxPuOTvKCkBjm4IpwZDZD


In [16]:
comment_url = fb.createCommentURL('242409295818219_1801246449934488')
comment_url

'https://graph.facebook.com/v2.12/242409295818219_1801246449934488/comments?fields=message%2Cmessage_tags%2Ccreated_time%2Ccomment_count%2Cid&access_token=EAACEdEose0cBAJss0WC82ZB07ZB2UdpakxUevu1ZBhEVX7TnWJKmqcUdZC8A6RJijDy4q1ry7Fjh6UZCzohDc12lohyoc9s1aV4KzJAiBxBQ1ObDDFQ61HionmgSfrUMpswO2KSYOGPtVLcZCWe97L5jJmzZBBGeK8mHsQ7rZAxipCfx246l5oxcc0tMFCxPuOTvKCkBjm4IpwZDZD'

In [17]:
arr = fb.getAllItems(comment_url, item_type='comment')
dfx = pd.DataFrame(arr)

In [18]:
dfx.head(10)

Unnamed: 0,comment_count,created_time,id,message,parent_id,tagged_names
0,7,2018-05-24T14:33:44+0000,1801246449934488_173808776616717,Alhamdulillah... hari ini berpeluang berbuka b...,1801246449934488_173808776616717,
1,3,2018-05-20T12:13:22+0000,1801246449934488_1993896347288475,Recommended 👍 Macam² pilihan ada especially we...,1801246449934488_1993896347288475,
2,8,2018-05-17T13:39:50+0000,1801246449934488_1807619769297156,Baby below 2 years kene charge jugak ke?,1801246449934488_1807619769297156,
3,2,2018-05-21T11:01:56+0000,1801246449934488_229417101157961,Nak kena booking or boleh walk in terus,1801246449934488_229417101157961,
4,2,2018-05-20T13:47:00+0000,1801246449934488_617377048614914,Seorang boleh tak? Atau seorang tidak dibenark...,1801246449934488_617377048614914,
5,3,2018-05-20T03:46:02+0000,1801246449934488_1715180305233788,Ada tempat untuk solat di Cawangan Bangi?,1801246449934488_1715180305233788,
6,3,2018-05-22T04:52:48+0000,1801246449934488_174392336601122,Tmpt solat cawangan setapak ada?,1801246449934488_174392336601122,
7,3,2018-05-13T03:54:55+0000,1801246449934488_1803119599747173,Kalau gi sorang2 boleh ke?,1801246449934488_1803119599747173,
8,1,2018-05-21T12:04:26+0000,1801246449934488_225612748202444,Dah pi Ahad lepas. Makanan sedap. Terbaik,1801246449934488_225612748202444,
9,15,2018-05-12T06:37:20+0000,1801246449934488_1802088483183618,Ada ruangan khas untuk sembahyang tarawikh tak...,1801246449934488_1802088483183618,


In [19]:
dfx.shape

(763, 6)

In [7]:
dfx.to_csv('D:\Qayyuum\menate_buka_puasa4.csv')

In [8]:
dfx.id[0]

'1801246449934488_173808776616717'

In [9]:
fb.createCommentURL(dfx.id[0])

'https://graph.facebook.com/v2.12/1801246449934488_173808776616717/comments?fields=message%2Cmessage_tags%2Ccreated_time%2Ccomment_count%2Cid&access_token=EAACEdEose0cBAJss0WC82ZB07ZB2UdpakxUevu1ZBhEVX7TnWJKmqcUdZC8A6RJijDy4q1ry7Fjh6UZCzohDc12lohyoc9s1aV4KzJAiBxBQ1ObDDFQ61HionmgSfrUMpswO2KSYOGPtVLcZCWe97L5jJmzZBBGeK8mHsQ7rZAxipCfx246l5oxcc0tMFCxPuOTvKCkBjm4IpwZDZD'

In [10]:
current_sub = fb.getAllItems(fb.createCommentURL(dfx.id[0]), item_type='comment')

In [11]:
current_sub

[{'comment_count': 0,
  'created_time': '2018-05-25T02:09:37+0000',
  'id': '1801246449934488_1950494531928859',
  'message': 'Terima kasih  . Jemput datang lagi :)',
  'parent_id': '1801246449934488_1950494531928859',
  'tagged_names': 'Hayati Yatt'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:35:55+0000',
  'id': '1801246449934488_2062756094045024',
  'message': 'Ruang solat besar ker??',
  'parent_id': '1801246449934488_2062756094045024'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:39:00+0000',
  'id': '1801246449934488_1285185234958592',
  'message': 'ye selesaaaaaa',
  'parent_id': '1801246449934488_1285185234958592',
  'tagged_names': 'Amy Lia'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:40:03+0000',
  'id': '1801246449934488_1285185688291880',
  'message': 'selalu mesti org akan serbu surau kn lg2 maghrib pendek..kalau sy bw mak yg solat duduk ok ker',
  'parent_id': '1801246449934488_1285185688291880',
  'tagged_names': 'Hayati Yatt'},
 {'co

In [13]:
for item in current_sub:
    item['parent_id'] = dfx.id[0]
current_sub

[{'comment_count': 0,
  'created_time': '2018-05-25T02:09:37+0000',
  'id': '1801246449934488_1950494531928859',
  'message': 'Terima kasih  . Jemput datang lagi :)',
  'parent_id': '1801246449934488_173808776616717',
  'tagged_names': 'Hayati Yatt'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:35:55+0000',
  'id': '1801246449934488_2062756094045024',
  'message': 'Ruang solat besar ker??',
  'parent_id': '1801246449934488_173808776616717'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:39:00+0000',
  'id': '1801246449934488_1285185234958592',
  'message': 'ye selesaaaaaa',
  'parent_id': '1801246449934488_173808776616717',
  'tagged_names': 'Amy Lia'},
 {'comment_count': 0,
  'created_time': '2018-05-25T06:40:03+0000',
  'id': '1801246449934488_1285185688291880',
  'message': 'selalu mesti org akan serbu surau kn lg2 maghrib pendek..kalau sy bw mak yg solat duduk ok ker',
  'parent_id': '1801246449934488_173808776616717',
  'tagged_names': 'Hayati Yatt'},
 {'commen

In [20]:
sub_comment = fb.getSubItems(arr)

In [21]:
dfsub = pd.DataFrame(sub_comment)
dfsub

Unnamed: 0,comment_count,created_time,id,message,parent_id,tagged_names
0,0,2018-05-25T02:09:37+0000,1801246449934488_1950494531928859,Terima kasih . Jemput datang lagi :),1801246449934488_173808776616717,Hayati Yatt
1,0,2018-05-25T06:35:55+0000,1801246449934488_2062756094045024,Ruang solat besar ker??,1801246449934488_173808776616717,
2,0,2018-05-25T06:39:00+0000,1801246449934488_1285185234958592,ye selesaaaaaa,1801246449934488_173808776616717,Amy Lia
3,0,2018-05-25T06:40:03+0000,1801246449934488_1285185688291880,selalu mesti org akan serbu surau kn lg2 maghr...,1801246449934488_173808776616717,Hayati Yatt
4,0,2018-05-25T06:41:14+0000,1801246449934488_2062758627378104,bole.... sgt2 bole sis...,1801246449934488_173808776616717,Amy Lia
5,0,2018-05-25T06:41:34+0000,1801246449934488_2062758737378093,Saya pon solat duduk,1801246449934488_173808776616717,
6,0,2018-05-25T06:42:03+0000,1801246449934488_1437658463046695,ooo ok2 tq..,1801246449934488_173808776616717,Hayati Yatt
7,0,2018-05-23T01:59:31+0000,1801246449934488_271842083358077,Terima kasih. Jemput datang lagi ya 😊,1801246449934488_1993896347288475,
8,0,2018-05-24T00:42:13+0000,1801246449934488_122554675287614,review ni ok plk,1801246449934488_1993896347288475,Suhana Mat Rasu
9,0,2018-05-24T00:43:29+0000,1801246449934488_419562051843940,ikut branch kot,1801246449934488_1993896347288475,Fyda Ali


In [22]:
all_comments= arr + sub_comment

In [23]:
len(all_comments)

1747

In [24]:
df = pd.DataFrame(all_comments)
df.columns

Index(['comment_count', 'created_time', 'id', 'message', 'parent_id',
       'tagged_names'],
      dtype='object')

In [25]:
df = df[['parent_id', 'id', 'message', 'tagged_names','created_time','comment_count']]
df.head()

Unnamed: 0,parent_id,id,message,tagged_names,created_time,comment_count
0,1801246449934488_173808776616717,1801246449934488_173808776616717,Alhamdulillah... hari ini berpeluang berbuka b...,,2018-05-24T14:33:44+0000,7
1,1801246449934488_1993896347288475,1801246449934488_1993896347288475,Recommended 👍 Macam² pilihan ada especially we...,,2018-05-20T12:13:22+0000,3
2,1801246449934488_1807619769297156,1801246449934488_1807619769297156,Baby below 2 years kene charge jugak ke?,,2018-05-17T13:39:50+0000,8
3,1801246449934488_229417101157961,1801246449934488_229417101157961,Nak kena booking or boleh walk in terus,,2018-05-21T11:01:56+0000,2
4,1801246449934488_617377048614914,1801246449934488_617377048614914,Seorang boleh tak? Atau seorang tidak dibenark...,,2018-05-20T13:47:00+0000,2


In [29]:
df2 = df.sort_values(axis=0, by=['parent_id','id'], ascending=True, inplace=False).reset_index()
df2.head()

Unnamed: 0,index,parent_id,id,message,tagged_names,created_time,comment_count
0,573,1801246449934488_1012345632247146,1801246449934488_1012345632247146,Fieda,Fieda Danisya,2018-05-23T08:51:45+0000,7
1,1613,1801246449934488_1012345632247146,1801246449934488_102880223934688,Alolooo.. ok2 t i roger.. x de geng ni.. 31,,2018-05-23T10:06:26+0000,0
2,1615,1801246449934488_1012345632247146,1801246449934488_102882320601145,"Hahaaa, tau da. Ok t i roger",,2018-05-23T10:08:49+0000,0
3,1612,1801246449934488_1012345632247146,1801246449934488_170760513765993,hahahaha.. meme tak la. lepas selai baju raya ...,ZulFahmi Qarier Kamal,2018-05-23T10:04:51+0000,0
4,1614,1801246449934488_1012345632247146,1801246449934488_170761187099259,okeh.. klu u lanjer i anytime 😁😁,ZulFahmi Qarier Kamal,2018-05-23T10:07:39+0000,0


In [31]:
df2.loc[df2['message'] == ''] = np.nan

In [33]:
df2.dropna(axis=0, subset=['message'], inplace=True)

In [34]:
df2.shape

(1324, 7)

In [36]:
df2['message'].to_excel(r'D:\Qayyuum\menate_ramadan_offer2.xlsx')

In [None]:
only_word = fb.convertToWords(df['message'])

In [None]:
only_word

In [None]:
srs = pd.read_csv(r'D:\Qayyuum\burger.csv', index_col=0)

In [None]:
srs

In [None]:
fb = FBImporter(r'https://www.facebook.com/khairul.ezuwan.1/', r'643368762522881|c4bf0124fa9dffe8b824e9b4e4e50ff8')
fb.setGroupId()
test = fb.getPostList()
ids = [ x['id'] for x in test[0] ]

In [None]:
ids

In [None]:
df = pd.read_csv("D:\Qayyuum\kezuwan.csv", index_col=0)
df.head(20)

In [None]:
urlStopWords = 'http://blog.kerul.net/2014/01/list-of-malay-stop-words.html'
response = rq.urlopen(urlStopWords)

In [None]:
response.getcode()

In [None]:
html = response.read()
soup = BeautifulSoup(html, 'html.parser')

In [None]:
tables = soup.find_all('table')

In [None]:
divs = tables[0].find_all('div')

In [None]:
word = [x.text.replace('\n','') for x in divs]

In [None]:
word

In [None]:
dfStopWords = pd.DataFrame(word,columns=['Stop_Word'])

In [None]:
dfStopWords.head()

In [None]:
dfStopWords.to_csv('D:\Qayyuum\stopword.csv')

In [59]:
def levenshtein(s, t):
        ''' From Wikipedia article; Iterative with two matrix rows. '''
        if s == t: return 0
        elif len(s) == 0: return len(t)
        elif len(t) == 0: return len(s)
        v0 = [None] * (len(t) + 1)
        v1 = [None] * (len(t) + 1)
        for i in range(len(v0)):
            v0[i] = i
        for i in range(len(s)):
            v1[0] = i + 1
            for j in range(len(t)):
                cost = 0 if s[i] == t[j] else 1
                v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
            for j in range(len(v0)):
                v0[j] = v1[j]
                
        return v1[len(t)]

In [12]:
levenshtein('yang', 'yg')

2

SoupStrainer helps to parse specific element only if you know what to extract beforehand

In [21]:
import requests
import regex as rgx
from bs4 import BeautifulSoup, SoupStrainer

def parseWord(line):
    if ',' in line:
        curr_line = line.split(',')
        for word in curr_line:
            if '\n' in word:                
                for split_word in word.split('\n'):
                    yield split_word
            else:
                yield word
    else:
        if '\n' not in line:                
            yield line

def getAlphabetLinks():
    url = r'https://ms.wiktionary.org/wiki/Wiktionary:Senarai_perkataan_mengikut_susunan_abjad'
    rs = requests.get(url)
    soup_all_alpha = BeautifulSoup(rs.content, 'html.parser', parse_only=SoupStrainer("a"))

    for x in soup_all_alpha:
        if x.has_attr('href') and rgx.match(r'^.*:Senarai_perkataan_[A-Z]$', x['href']):
            yield x['href']
    
def getWordPerAlphabet(link):
    rs = requests.get(link)
    soup = BeautifulSoup(rs.content, 'html.parser')
    tab_of_words = soup.find_all("table")

    ## get all words
    words = []
    for item in tab_of_words:
        word = [y.text for y in item.find_all("li")]
        for i, curr_word in enumerate(word):
            if '\n' in curr_word:
                word[i] = curr_word.split('\n')[0]
        words += word

    clean_list = [item.strip() for x in words for item in parseWord(x)]
    return clean_list

In [22]:
full_link = list(map(lambda x: r'https://ms.wiktionary.org' + x, getAlphabetLinks()))

In [24]:
all_words = []
for link in full_link:
    all_words += getWordPerAlphabet(link)

In [25]:
len(all_words)

28302

In [48]:
sample_str = "Sy tidak gembira, sy menginap di hotel 1 mlm utk 2 bilik, manager security sombong dan berlagak." + \
             " Sy terpaksa tanggung kos repair akibat bonet kereta sy dicalar ditempat parking kereta yg disediakan oleh pihak hotel...." + \
             "ini lah kali pertama, sy menginap di hotel yg tidak selamat..." + \
             "sy harap pihak agoda, tidak bekerjasama dgn hotel ini kerana amat merbahayakan pelanggan2 & pelanggan " + \
             "pula terpaksa tanggung risiko jika terjadi ape2."


In [49]:
sample_str = rgx.sub(r'\.{2,}','. ', sample_str) 

In [50]:
from nltk.tokenize import word_tokenize, sent_tokenize
sentences =  sent_tokenize(sample_str)

In [51]:
sentences

['Sy tidak gembira, sy menginap di hotel 1 mlm utk 2 bilik, manager security sombong dan berlagak.',
 'Sy terpaksa tanggung kos repair akibat bonet kereta sy dicalar ditempat parking kereta yg disediakan oleh pihak hotel.',
 'ini lah kali pertama, sy menginap di hotel yg tidak selamat.',
 'sy harap pihak agoda, tidak bekerjasama dgn hotel ini kerana amat merbahayakan pelanggan2 & pelanggan pula terpaksa tanggung risiko jika terjadi ape2.']

In [55]:
alpha_word = [x for x in word_tokenize(sentences[0]) if x.isalpha()]

In [57]:
[a for a in alpha_word if a not in all_words]

['Sy', 'sy', 'mlm', 'utk', 'manager', 'security']

In [58]:
from collections import OrderedDict
items = [a for a in alpha_word if a not in all_words]
list(OrderedDict.fromkeys(items))

['Sy', 'sy', 'mlm', 'utk', 'manager', 'security']

In [60]:
duplicate = rgx.compile(r'(.)\1{2,}')
duplicate.search('baaaikkkk')

<regex.Match object; span=(1, 4), match='aaa'>

In [70]:
duplicate.sub(r'\1\1','baaaikkkk')

'baaikk'

In [68]:
with open(r'D:\Data Science\Hotel-Reviews-BM\words_alpha.txt','r', encoding='utf-8') as eng_file:
    eng_dict = eng_file.readlines()

eng_file.close()

eng_dict = [rgx.sub(r'\n','', x) for x in eng_dict]

In [69]:
eng_dict[150]

'abater'

## TODO

### Methods for text preprocessing

1. Remove repeatable non-characters (to avoid false sentence tokenization).
2. Tokenize the sentence.
3. Iterate each word and validates:
   1. Check for multiple characters - give exception for character such as a,g,n,k
   2. Compare with Malay dictionary
   3. Compare with short form database, if available, then replace.

In [9]:
import regex as rgx
from nltk.tokenize import word_tokenize, sent_tokenize

# remove repeatable non-characters
def remove_rep_non_chars(txt):
    pattern = rgx.compile(r'([\W_])\1{2,}')
    return pattern.sub(r'\1', txt)

# pad fullstop with space, since tokenizer cannot detect dot if no space after it
def remove_multi_dots(txt):
    pattern = rgx.compile(r'(\w)(\.{2,})')
    return pattern.sub(r'\1. ', txt)

def pad_dot_with_space(txt):
    pattern = rgx.compile(r'(\w)\.(\w)')
    return pattern.sub(r'\1. \2', txt)

def remove_new_line(txt):
    pattern = rgx.compile(r'(\r|\n)?')
    return pattern.sub('', txt).strip()

def tokenize_sentence(paragraph_str):
    paragraph = pad_dot_with_space(remove_rep_non_chars(paragraph_str))
    sentences = sent_tokenize(paragraph)
    return sentences

def load_dictionary(path):
    with open(path, 'r', encoding='utf-8') as dict_file:
        dict_array = dict_file.readlines()
    dict_file.close()
    return [rgx.sub(r'\n', '', word) for word in dict_array]
    
def validate_sentence(sentence):
    # initialize BM dictionary
    BM_dict = load_dictionary(r'D:\Data Science\Hotel-Reviews-BM\dict-ms-wiki.txt')
    
    # initialize English dictionary
    ENG_dict = load_dictionary(r'D:\Data Science\Hotel-Reviews-BM\words_alpha.txt')
    
    # break sentence into array of words
    tokenized_sent = word_tokenize(sentence.lower())
    
    # check for BM words except punctuation
    validated_BM = list(map(lambda x: x in all_words or not x.isalpha(), tokenized_sent))
    
    for i,x in enumerate(validated_BM):
        if x is False:
            print(tokenized_sent[i] in eng_dict)

In [7]:
import regex as rgx
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = rgx.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    
    def replace(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        #print(repl_word, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

In [8]:
replacer = RepeatReplacer()
replacer.replace('looooove')

loooove looooove
looove loooove
loove looove
love loove
love love


'love'

In [14]:
sample_str2 = 'TMJ bg rm1 juta utk shopping..1 family max rm3k blh belanja..total 333 family blh scan bayar kt kaunter..bila kau dh busy kehulu kehilir cari brg2,org beratur kt kaunter dh berbaris panjang..bayangkan 1 family 3k pny barang nk scan..ambil masa..nk packing lg..bila tgk org membazir mcm ni,rasa rugi sgt walaupn org tlg bayarkan..mgkn ada org miskin tgh gaul nasi dgn garam pn kita x tau..'

paragraph = pad_dot_with_space(remove_rep_non_chars(remove_multi_dots(sample_str2)))

In [15]:
sentence = sent_tokenize(paragraph)
sentence

['TMJ bg rm1 juta utk shopping.',
 '1 family max rm3k blh belanja.',
 'total 333 family blh scan bayar kt kaunter.',
 'bila kau dh busy kehulu kehilir cari brg2,org beratur kt kaunter dh berbaris panjang.',
 'bayangkan 1 family 3k pny barang nk scan.',
 'ambil masa.',
 'nk packing lg.',
 'bila tgk org membazir mcm ni,rasa rugi sgt walaupn org tlg bayarkan.',
 'mgkn ada org miskin tgh gaul nasi dgn garam pn kita x tau.']

In [107]:
tokenized_sent = word_tokenize(sentence[2].lower())

In [125]:
tokenized_sent

['tampa',
 'ada',
 'breakfast',
 'dan',
 'keadaan',
 'bilik',
 'sempit',
 'dan',
 'juga',
 'wifi',
 'sangat',
 'perlahan',
 'kadang2',
 'terputus',
 'sambungan',
 '.']

In [121]:
# pass tokenized sentence, and will check against BM dictionary
# def checkBasicDict(word):
#     if word in all_words:
#         return True
BM_dict = load_dictionary(r'D:\Data Science\Hotel-Reviews-BM\dict-ms-wiki.txt')
validated_BM = list(map(lambda x: x in BM_dict or not x.isalpha(), tokenized_sent))

In [126]:
# check for english word
for i,x in enumerate(validated_BM):
    print(tokenized_sent[i], x)
#     if x is False:
#         print(tokenized_sent[i], tokenized_sent[i] in eng_dict)

tampa False
ada True
breakfast False
dan True
keadaan True
bilik True
sempit True
dan True
juga True
wifi False
sangat True
perlahan True
kadang2 True
terputus True
sambungan True
. True


In [2]:
df_hotel_review = pd.read_excel(r'D:\Data Science\Hotel-Reviews-BM\all-reviews.xlsx')

In [3]:
df_hotel_review.shape

(1206, 4)

In [4]:
df_hotel_review.columns

Index(['title', 'rating', 'reviews', 'Polarity'], dtype='object')

In [5]:
df_review = df_hotel_review['reviews']

In [87]:
df_review

0       Sy tidak gembira, sy menginap di hotel 1 mlm u...
1                                                   oklah
2       Pengalaman saya tinggal di hotel grand seasons...
3       Bilik kecil, toilet tak bersih.. Nak angkut ba...
4       1st-CHECK IN\r\r\ntidak suka dgn receptionist ...
5                                Menyesal pilih hotel ini
6       kalau saya ke malaka saya akan kembali ke hatt...
7       Penyambut tetamu yang tidak mesra, staf y berl...
8       Menguntungkan dan paling baik untuk bawa kelua...
9       semasa daftar masuk layanan staff kurang memua...
10                                                 Bagus!
11                             sinki tersumbat..ada patut
12      Sangat mengembirkan dan menyoronokkan di mana ...
13      Sarapan 1 kamar hanya untuk 1 org saja, seharu...
14      Katil rasa spring, selimut cadar bantal berbau...
15      Perjalanan yang agak sesak sepanjang jalan men...
16      bathroom kotor dan facilities teruk ( kepala p...
17      kekura

In [90]:
paragraph = remove_new_line(pad_dot_with_space(remove_multi_dots(df_review[4])))
sent_tokenize(paragraph)

['1st-CHECK INtidak suka dgn receptionist dia yg sombong.',
 'Mcm pndg rendah pada org mcm i. Ye la pakai myvi buruk je.',
 'mesti dia dh nmpk I keluar drp myvi buruk I tu.',
 'I pula baru check in around 12am sbb I tidak blh balik to my house in Bentong regarding tanah runtuh di Karak Highway.',
 'anyway masa I register tu, I biarkan aje.',
 'always positif kan. then dia bagi I kunci dgn jeling mata shj.',
 'ok ok fine lah.',
 'I ambil bilik+breakfast, and of course I tny dia what is the time for breakfast.',
 'dia ckp " erm, u ada ambil sekali breakfast ke?"',
 'With jeling mata lagi.',
 'then I say "Yes(senyum sambil bersabar)" sambil tunjuk my voucher thru my phone.',
 'then dia kata"ok, esok pagi u tunjukkan that voucher je"dgn jelingan yg \'cute\' tu.',
 'ok then.',
 '2nd-BILIKonce dh masuk bilik.',
 'nmpk malap je.',
 'bnyk lampu tidak berfungsi.',
 'Toilet pula tidak boleh digunakan dgn baik.',
 'Tiada brg2 keperluan SEPERTI Berus Gigi disediakan.',
 '3rd-BREAKFASTbila kami nak

In [20]:
patt = rgx.compile(r'(\w)\.{2,}(\w)')
patt.search(paragraph)

In [80]:
sentences = []
for idx, val in df_review.iteritems():
    paragraph = remove_new_line(pad_dot_with_space(remove_multi_dots(val)))
    sentence = sent_tokenize(paragraph)
    sentences += sentence

In [81]:
srs_review = pd.Series(sentences)

In [83]:
srs_review.to_excel(r'D:\Data Science\Hotel-Reviews-BM\all-reviews-sent-only.xlsx')

In [72]:
remove_dots = rgx.sub(r'(\w)(\.{2,})', r'\1. ' , df_review[25])
clean_sent = rgx.sub(r'(\r|\n)?', '' , remove_dots).strip()

In [73]:
clean_sent = rgx.sub(r'(\r|\n)?', '' , remove_dots).strip()

In [74]:
sent_tokenize(clean_sent)

['Memandangkan kami sekeluarga (2 dewasa,1kanak2&1 bayi) check in pada hari Ahad maka masa untuk beratur di kaunter sahaja mengambil masa lebih 1 jam.',
 'dengan keadaan yang tidak selesa.',
 'kerusi untuk orang yang menunggu untuk masuk ke bilik juga agak terhad.',
 'servis dari hotel staf yang mengambil masa sangat lama membuatkan ramai orang gelisah.',
 'tambahan pula yang datang berkeluarga dengan anak2 kecil.',
 'kolam mandi agak kecil, tp orang begitu ramai.',
 'beratur panjang utk mkn pagi.',
 'tp pihak hotel buka meja kat ballroom.',
 'takde bathtub dalam bilik, bilik shower ja.']

In [31]:
df_review[25]

'Memandangkan kami sekeluarga (2 dewasa,1kanak2&1 bayi) check in pada hari Ahad maka masa untuk beratur di kaunter sahaja mengambil masa lebih 1 jam..dengan keadaan yang tidak selesa..kerusi untuk orang yang menunggu untuk masuk ke bilik juga agak terhad..servis dari hotel staf yang mengambil masa sangat lama membuatkan ramai orang gelisah..tambahan pula yang datang berkeluarga dengan anak2 kecil..\r\r\nkolam mandi agak kecil, tp orang begitu ramai..\r\r\nberatur panjang utk mkn pagi..tp pihak hotel buka meja kat ballroom..\r\r\ntakde bathtub dalam bilik, bilik shower ja..'