In [1]:
# Get article information from database
# Get contents of each article from ptt
# Process contents into a list of sentences
# Read sentences based on "，", not " " or "\n"
# Break sentences in words
# Then remove stop words
# Pool all this sentences in a list
# Save processed word list into database

import re
import jieba
import sqlite3
import requests
import csv
from bs4 import BeautifulSoup


# Input an article id, generate a url for the article and return article content
def soup(a,b):
    articleid = a
    name = b
    print("Run soup, get article content from:",a)
    # Send http request to server
    text_list=[]
    url = "https://www.ptt.cc/bbs/{name}/".format(name=name)
    url= url+articleid
    response = requests.get(url, cookies={'over18': '1'})
    result = response.text
    soup = BeautifulSoup(result, 'html.parser')
    soup = soup.find(id="main-content")
    if soup != None:
        for span in soup('span'):
            span.decompose()
        for div in soup('div'):
            div.decompose()
        for a in soup('a'):
            a.decompose()
        text_list.append(soup.text)
    return(text_list)


def sentence_check(a):
    # The faster way is creating a list, then checking any unwanted characters before replacing it.
    # However, this way produces multiple output, which increase some complexity
    print("run sentence check")
    input_list=a
    temp_list=[]
    return_list=[]
    pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】|·|！| |…|（|）'

    for element in input_list:
        element = re.split(pattern,element)
        for t in element:
            if t != '':
                temp_list.append(t)

    # Remove \n in list
    temp_list = [x for x in temp_list if x != '\n']
    for item in temp_list:
        item = item.replace("\n","")
        return_list.append(item)
    return(return_list)


# Read sentences of an article and process them as a bag of words
# Use jieba module to recognize chinese words
# Need a dictionary of common tradition Chinese words
# Need a dictionary of stop words
# Return a list of lists of words (still keep the sentence information)
def word(a):
    print("Run word, powered by JieBa")
    input_list = a
    jieba.set_dictionary('dict.txt')
    stops = open('stop.txt', 'r', encoding='utf8')
    stopword = stops.read().split('\n')
    stops.close()

    # Update stop words
    stoplist=['\n',' ','']
    for element in stoplist:
        stopword.append(element)

    # Remove any stop words from list
    new_list=[]
    i = 0
    while i < len(input_list):
        element = input_list[i]
        new_list.append([t for t in jieba.cut(element) if t not in stopword])
        i+=1

    # Return a bag of words
    return_list=[]
    j = 0
    while j < len(new_list):
        return_list += new_list[j]
        j += 1
    return(return_list)


# Read a list of words and pool all words in a string
# Separate each word by ','
# Return a string
def pool(a):
    print("Run Pool")
    input_list = a

    return_string = ','.join(input_list)
    return(return_string)


# Update database
def update_database(a,b):
    print("Update database")
    articleid = a
    articlecontent = b
    conn = sqlite3.connect('ptt.db')
    c = conn.cursor()
    c.execute("UPDATE article_large SET content = ? where id=?",(articlecontent,articleid))
    conn.commit()
    print("Update database done!")


# Enter the subforum which you want to scrape data from
# Subforum name is case sensitive
def main():
    conn = sqlite3.connect('ptt.db')
    c = conn.cursor()
    cursor = c.execute("SELECT id from article_large WHERE content IS NULL ")
    articleid_list=[]
    for row in cursor:
        articleid_list.append(row[0])
    cursor.close()
    print("Total number of article:",len(articleid_list))

    name = input("Enter which sub forum you want to scrape data from: ")
    print("get data from {a}".format(a = name))

    i = 0
    while i < len(articleid_list):
        page_content = soup(articleid_list[i], name)

        if page_content != '':
            # Get clean sentence from each article
            sentence_list = sentence_check(page_content)

            # Get a bag of words from each article
            word_list=word(sentence_list)

            # Pool all word in a string
            word_pool = pool(word_list)

            # Update database
            update_database(articleid_list[i],word_pool)
            i+=1
            k = i / len(articleid_list)
            if k < 0.1 and k < 0.3:
                print("Progress <10%")
            if k > 0.3 and k < 0.5:
                print("Progress < 30%")
            if k < 0.5 and k > 0.7:
                print("Progress < 30%")
            if k > 0.7:
                print("Progress < 70%")
        print("Progress done")


if __name__ == "__main__":
    print("initialize")
    main()

initialize
Total number of article: 2
Enter which sub forum you want to scrape data from: Japan_Travel
get data from Japan_Travel
Run soup, get article content from: M.1552791579.A.86D.html
run sentence check
Run word, powered by JieBa

Building prefix dict from /Users/hsieh/Documents/ptt/dict.txt ...
Loading model from cache /var/folders/54/w85qh3vd4xv5v6qz8w4t70940000gn/T/jieba.ubaa545e936adb4bd7ef290058a1bf422.cache
Loading model cost 0.747 seconds.
Prefix dict has been built succesfully.
Building prefix dict from /Users/hsieh/Documents/ptt/dict.txt ...
Loading model from cache /var/folders/54/w85qh3vd4xv5v6qz8w4t70940000gn/T/jieba.ubaa545e936adb4bd7ef290058a1bf422.cache



Run Pool
Update database
Update database done!
Progress < 70%
Run soup, get article content from: M.1552742037.A.E2D.html
run sentence check
Run word, powered by JieBa


Loading model cost 0.736 seconds.
Prefix dict has been built succesfully.


Run Pool
Update database
Update database done!
Progress done
