In [1]:
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
說明：
    做資料篩選

注意事項：
    1. 先把mongoDB建好，資料匯好
    2. 資料庫的名字可以自己取，如果不一樣程式記得改
    3. 我的做法是Yelp_Original用來存原始yelp資料、Yelp_New是我篩選後的資料
    4. 資料表建議一樣用business、user、review
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

# 先import會用到的庫
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import time
from tqdm import tqdm

In [2]:
### 1. Select PA-state POIs whose review count is between 15 and 200
client = MongoClient('127.0.0.1', 27017)

# Source database (raw Yelp data) and the two collections read in this step.
db = client['Yelp_Original']
business = db['business']
review = db['review']

# Target database that receives the selected businesses.
db_new = client['Yelp_New']
business_new = db_new['business']

def getIds(collection=None):
    """Return the ids of PA businesses that occur exactly once.

    Groups the documents whose ``state`` is ``'PA'`` by ``business_id`` and
    keeps the ids whose group size is exactly 1.

    Args:
        collection: Object exposing ``aggregate(pipeline)``. Defaults to the
            module-level ``business`` collection.

    Returns:
        list: The selected business ids, excluding ``None`` and ``''``.
    """
    if collection is None:
        collection = business

    pipeline = [
        {'$match': {'state': 'PA'}},
        {'$group': {'_id': '$business_id', 'count': {'$sum': 1}}},
    ]
    # The id itself is validated here. Comparing the whole result row against
    # None / '' (after it has already been indexed) can never reject anything.
    return [
        row['_id']
        for row in collection.aggregate(pipeline)
        if row['count'] == 1 and row['_id'] is not None and row['_id'] != ''
    ]

def checkReview(ids, min_reviews=15, max_reviews=200):
    """Copy every business whose review count lies within the given bounds.

    For each id, counts the documents in ``review`` with that ``business_id``
    and calls ``insert_business`` when the count is in
    ``[min_reviews, max_reviews]`` (both inclusive).

    Args:
        ids: Sized iterable of business ids to check.
        min_reviews: Smallest accepted review count (default 15).
        max_reviews: Largest accepted review count (default 200).

    Returns:
        int: Number of ids for which ``insert_business`` was called.
    """
    count = 0
    # The context manager closes the progress bar even if a query raises.
    with tqdm(total=len(ids), desc='Check Review') as bar:
        for business_id in ids:
            # To count only reviews with at least 3 stars, add
            # 'stars': {'$gte': 3} to this filter.
            # NOTE(review): this issues one count query per id; presumably an
            # index on review.business_id is needed for it to be fast - verify.
            review_count = review.count_documents({'business_id': business_id})
            if min_reviews <= review_count <= max_reviews:
                insert_business(business_id)
                count += 1
            bar.update(1)
    return count
        
def insert_business(_id):
    """Copy the source business document(s) with this ``business_id``.

    Every document in ``business`` whose ``business_id`` equals ``_id`` is
    written to ``business_new``.

    Args:
        _id: The ``business_id`` value to look up.
    """
    cursor = business.find({'business_id': _id})
    try:
        for doc in cursor:
            # Upsert keyed on the copied Mongo `_id`: the first run inserts,
            # a re-run overwrites instead of raising DuplicateKeyError.
            business_new.replace_one({'_id': doc['_id']}, doc, upsert=True)
    finally:
        # Release the cursor even when a write fails part-way through.
        cursor.close()

# Collect the candidate PA business ids, copy those whose review count is in
# range, and report how many businesses were copied.
tempIds = getIds()
total = checkReview(tempIds)
tqdm.write('Total Record: ' + str(total))

Check Review: 100%|██████████| 34039/34039 [20:35:13<00:00,  2.18s/it]   

Total Record: 16476





In [13]:
### 2. Copy the reviews of the POIs selected in step 1 into the new review collection
client = MongoClient('127.0.0.1', 27017)

# Source: raw reviews.
db = client['Yelp_Original']
review = db['review']

# Target: the filtered businesses from step 1 and the new review collection.
db_new = client['Yelp_New']
business_new = db_new['business']
review_new = db_new['review']

# Number of selected businesses; used as the progress-bar total.
businessSize = business_new.count_documents({})

def getIds():
    """Call ``insert_review`` for every business in ``business_new``.

    Iterates the whole ``business_new`` collection and passes each document's
    ``business_id`` to ``insert_review``, updating a progress bar sized by the
    module-level ``businessSize``.
    """
    # Only business_id is read, so project just that field.
    cursor = business_new.find(
        {}, {'business_id': 1}, no_cursor_timeout=True, batch_size=10
    )
    try:
        with tqdm(total=businessSize, desc='Processing') as bar:
            for doc in cursor:
                insert_review(doc['business_id'])
                bar.update(1)
    finally:
        # The cursor was opened with no_cursor_timeout=True, so it must be
        # closed explicitly, including when an insert raises.
        cursor.close()

def insert_review(_id):
    """Copy all source reviews of one business into ``review_new``.

    Args:
        _id: The ``business_id`` whose reviews are copied from ``review``.
    """
    # A plain filtered find returns the same documents as a single-stage
    # $match aggregation.
    cursor = review.find({'business_id': _id})
    try:
        for doc in cursor:
            # Upsert keyed on the copied Mongo `_id`, so re-running the cell
            # after a partial run does not raise DuplicateKeyError.
            review_new.replace_one({'_id': doc['_id']}, doc, upsert=True)
    finally:
        cursor.close()

# Copy the reviews of every selected business, then report how many documents
# the new review collection holds.
getIds()
tqdm.write('Total Record: ' + str(review_new.count_documents({})))

Processing: 100%|██████████| 16476/16476 [14:50:49<00:00,  3.24s/it]  




[A[A[A[A                                                    


[A[A[A                                                       


[A[A[A

[A[A

Total Record: 831070


In [2]:
### 3. Filter the review collection: delete reviews shorter than 60 characters or dated before 2017
client = MongoClient('localhost', 27017)
review_new = client['Yelp_New']['review']
# The module-level handle to the target database is kept for later use.
db_new = client['Yelp_New']

# Number of reviews before filtering; used as the progress-bar total.
reviewSize = review_new.count_documents({})

def getIds():
    """Run ``checkNeedRemove`` on every document in ``review_new``.

    Passes each review's ``review_id``, ``text`` and ``date`` to
    ``checkNeedRemove`` and advances a progress bar sized by the module-level
    ``reviewSize``.
    """
    # Fetch only the three fields that checkNeedRemove consumes.
    wanted = {'review_id': 1, 'text': 1, 'date': 1}
    cursor = review_new.find({}, wanted, no_cursor_timeout=True, batch_size=10)
    try:
        with tqdm(total=reviewSize, desc='Delete Review') as bar:
            for doc in cursor:
                checkNeedRemove(doc['review_id'], doc['text'], doc['date'])
                bar.update(1)
    finally:
        # no_cursor_timeout cursors need an explicit close; make sure it
        # happens even if a date fails to parse.
        cursor.close()
    
def checkNeedRemove(id, text, date):
    """Delete one review when it is too short or too old.

    Args:
        id: ``review_id`` of the review.
        text: Review text; its length is measured with ``len`` (characters).
        date: Date string in ``'%d/%m/%Y %H:%M:%S'`` format.

    The review is deleted from ``review_new`` when ``len(text) < 60`` or the
    parsed date is earlier than 2017-01-01.
    """
    parsed = datetime.strptime(date, '%d/%m/%Y %H:%M:%S')
    too_short = len(text) < 60
    too_old = parsed < datetime(2017, 1, 1)
    if too_short or too_old:
        review_new.delete_one({'review_id': id})

# Apply the length/date filter to every review in the new review collection.
getIds()

  return Cursor(self, *args, **kwargs)
Delete Review: 100%|██████████| 461171/461171 [3:10:51<00:00, 40.27it/s]  


In [2]:
### 4. Build the user collection from the new review collection
client = MongoClient('localhost', 27017)

# Source: raw users.
db = client['Yelp_Original']
user = db['user']

# Target database: filtered reviews (input) and the new user collection (output).
db_new = client['Yelp_New']
review_new = db_new['review']
user_new = db_new['user']

def getIds(collection=None):
    """Return the ``user_id`` of every review, duplicates included.

    Args:
        collection: Object exposing ``find(...)`` that returns an iterable
            cursor with ``close()``. Defaults to the module-level
            ``review_new`` collection.

    Returns:
        list: One ``user_id`` per review document, in cursor order.
    """
    if collection is None:
        collection = review_new

    # Only user_id is read, so project just that field.
    cursor = collection.find(
        {}, {'user_id': 1}, no_cursor_timeout=True, batch_size=10
    )
    try:
        return [doc['user_id'] for doc in cursor]
    finally:
        # no_cursor_timeout cursors need an explicit close, also on error.
        cursor.close()

def checkReviewCount(id):
    """Copy a user when they have between 15 and 200 reviews (inclusive).

    Args:
        id: The ``user_id`` to check; its reviews are counted in
            ``review_new`` and ``insert_user`` is called when in range.
    """
    n_reviews = review_new.count_documents({'user_id': id})
    if not (15 <= n_reviews <= 200):
        return
    insert_user(id)

def insert_user(id):
    """Copy the source user document(s) with this ``user_id`` into ``user_new``.

    Args:
        id: The ``user_id`` value to look up in ``user``.
    """
    # A plain filtered find returns the same documents as a single-stage
    # $match aggregation.
    cursor = user.find({'user_id': id})
    try:
        for doc in cursor:
            # Upsert keyed on the copied Mongo `_id`, so re-running the cell
            # does not raise DuplicateKeyError.
            user_new.replace_one({'_id': doc['_id']}, doc, upsert=True)
    finally:
        cursor.close()

# Gather the author of every remaining review, then check each distinct user once.
userIds = getIds()
# De-duplicate once; the same array feeds both the bar total and the loop.
uniqueUserIds = np.unique(userIds)
# The context manager closes the bar even if a query raises.
with tqdm(total=len(uniqueUserIds), desc='Insert User') as bar:
    # Named `user_id` so the builtin `id` is not shadowed at module level.
    for user_id in uniqueUserIds:
        checkReviewCount(user_id)
        bar.update(1)

  return Cursor(self, *args, **kwargs)
Insert User: 100%|██████████| 181661/181661 [5:53:25<00:00,  8.57it/s]   


In [3]:
### 5. Re-filter the review collection against the new user collection
client = MongoClient('localhost', 27017)

# Both collections used in this step live in the filtered database.
db_new = client['Yelp_New']
review_new = db_new['review']
user_new = db_new['user']

def getIds():
    """Run ``checkNeedRemove`` on every document in ``review_new``.

    Passes each review's ``review_id`` and ``user_id`` to ``checkNeedRemove``
    and advances a progress bar sized by the current review count.
    """
    # Fetch only the two fields that checkNeedRemove consumes.
    cursor = review_new.find(
        {}, {'review_id': 1, 'user_id': 1}, no_cursor_timeout=True, batch_size=10
    )
    try:
        with tqdm(total=review_new.count_documents({}), desc='Delete Review') as bar:
            for doc in cursor:
                checkNeedRemove(doc['review_id'], doc['user_id'])
                bar.update(1)
    finally:
        # no_cursor_timeout cursors need an explicit close, also on error.
        cursor.close()

def checkNeedRemove(id, user_id):
    """Delete a review whose author is not in ``user_new``.

    Args:
        id: ``review_id`` of the review.
        user_id: ``user_id`` of the review's author.
    """
    author_known = user_new.count_documents({'user_id': user_id}) != 0
    if not author_known:
        review_new.delete_one({'review_id': id})

# Drop every review whose author is absent from the new user collection.
getIds()

Delete Review: 100%|██████████| 411448/411448 [2:06:43<00:00, 54.12it/s]  


In [4]:
### 6. Re-filter the business collection against the new review collection
client = MongoClient('localhost', 27017)

# Both collections used in this step live in the filtered database.
db_new = client['Yelp_New']
business_new = db_new['business']
review_new = db_new['review']

def getIds():
    """Run ``checkNeedRemove`` on every document in ``business_new``.

    Passes each business's ``business_id`` to ``checkNeedRemove`` and advances
    a progress bar sized by the current business count.
    """
    # Only business_id is read, so project just that field.
    cursor = business_new.find(
        {}, {'business_id': 1}, no_cursor_timeout=True, batch_size=10
    )
    try:
        with tqdm(total=business_new.count_documents({}), desc='Delete Business') as bar:
            for doc in cursor:
                checkNeedRemove(doc['business_id'])
                bar.update(1)
    finally:
        # no_cursor_timeout cursors need an explicit close, also on error.
        cursor.close()

def checkNeedRemove(id):
    """Delete a business that has no review left in ``review_new``.

    Args:
        id: ``business_id`` of the business to check.
    """
    has_reviews = review_new.count_documents({'business_id': id}) != 0
    if not has_reviews:
        business_new.delete_one({'business_id': id})

# Drop every business that has no review left in the new review collection.
getIds()

Delete Business: 100%|██████████| 14783/14783 [08:52<00:00, 27.75it/s]


In [3]:
### Check how many unique user_id and business_id values the review collection contains
client = MongoClient('localhost', 27017)
db_new = client['Yelp_New']
review_new = db_new['review']

# Accumulators filled by getIds(); reset each time this cell runs.
userIds = []
businessIds = []

def getIds():
    """Collect the ``user_id`` and ``business_id`` of every review.

    Appends to the module-level ``userIds`` and ``businessIds`` lists, one
    entry per review document.

    Returns:
        list: The module-level ``userIds`` list.
    """
    # Fetch only the two fields that are recorded.
    cursor = review_new.find(
        {}, {'user_id': 1, 'business_id': 1}, no_cursor_timeout=True, batch_size=10
    )
    try:
        for doc in cursor:
            userIds.append(doc['user_id'])
            businessIds.append(doc['business_id'])
    finally:
        # no_cursor_timeout cursors need an explicit close, also on error.
        cursor.close()
    return userIds

# Scan the reviews once, then report the number of distinct users and businesses.
userIds = getIds()
user_total = len(np.unique(userIds))
print('User: ' + str(user_total))
business_total = len(np.unique(businessIds))
print('Business: ' + str(business_total))

  return Cursor(self, *args, **kwargs)


User: 2533
Business: 12360
