In [1]:
import urllib.request as urllib2 
from bs4 import BeautifulSoup
import re
import requests
from datetime import datetime
import pandas as pd

In [2]:
# HTTP headers sent with every scraping request; they mimic a desktop Chrome browser.
# NOTE(review): advertising 'br' lets the server answer with Brotli, which requests
# only decodes automatically when a Brotli library is installed -- confirm the
# environment has one, or drop 'br' from Accept-Encoding.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # The HTTP header is spelled "Referer"; 'referrer' is not a standard request header.
    'Referer': 'https://google.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Pragma': 'no-cache'}

In [3]:
def get_thread_links(url, timeout=30):
    '''Retrieves all the thread links in ONE page of the forum.

    Parameters
    ----------
    url : str
        Address of a single forum index page.
    timeout : float, optional
        Seconds to wait for the server before requests gives up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` tag carrying the ``title`` class, in
        document order. Anchors without an ``href`` are skipped.
    '''
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    title_anchors = soup.find_all('a', attrs={'class': 'title'})

    # An anchor lacking an href yields None, which later string handling
    # (e.g. link.find('.html')) cannot process, so drop those here.
    hrefs = (anchor.get('href') for anchor in title_anchors)
    return [href for href in hrefs if href]

In [26]:
def get_all_thread_links(forum_link, start_page, end_page):
    '''Extracts the thread links from a RANGE of index pages of the forum.

    The page addresses are built as ``forum_link + 'index' + <n> + '.html'``
    for every ``n`` in ``range(start_page, end_page)``; ``end_page`` itself is
    therefore NOT visited.

    Parameters
    ----------
    forum_link : str
        Base address of the forum; the page file name is appended to it as-is.
    start_page : int
        First index page to scrape (inclusive).
    end_page : int
        Page number at which to stop (exclusive).

    Returns
    -------
    list
        Links of all visited pages, concatenated in page order.
    '''
    collected = []

    for page_number in range(start_page, end_page):
        page_url = forum_link + 'index' + str(page_number) + '.html'
        collected.extend(get_thread_links(page_url))

    return collected

In [28]:
def get_mbti(string):
    '''Pulls the MBTI tag out of a user-info text.

    The text is searched for the first occurrence of ``'MBTI'``. The label
    itself and the single character following it are skipped, and the next
    (up to) four characters are returned.

    Returns the integer 0 when the text contains no ``'MBTI'`` label.
    '''
    _, label, remainder = string.partition('MBTI')

    # partition() leaves the separator slot empty when the label is absent
    if not label:
        return 0

    # remainder[0] is the separator after the label; the tag is the 4 chars after it
    return remainder[1:5]

In [29]:
def get_one_page_comments(link, timeout=30):
    '''Scrapes the MBTI tag and the comment text of every post on ONE thread page.

    Parameters
    ----------
    link : str
        Address of a single page of a thread.
    timeout : float, optional
        Seconds to wait for the server before requests gives up (default 30).

    Returns
    -------
    (page_mbti, page_comments) : tuple of two lists
        Parallel lists with one entry per post. ``page_mbti`` holds the value
        returned by ``get_mbti`` (or 0 when the post has no user-info block);
        ``page_comments`` holds the post text. For posts containing a quoted
        reply box only the loose text that follows the box is kept, with the
        individual text fragments joined by a single space.
    '''
    page_comments = []
    page_mbti = []

    response = requests.get(link, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    for block in soup.find_all('div', attrs={'class': 'postdetails'}):
        user_info = block.find('dl', attrs={'class': 'userinfo_extra'})
        comment = block.find('blockquote', attrs={'class': 'postcontent restore'})

        # Some 'postdetails' blocks are other parts of the page, not posts.
        if comment is None:
            continue

        # Retrieve the MBTI tag; 0 marks "no tag available".
        if user_info is not None:
            mbti = get_mbti(user_info.get_text())
        else:
            mbti = 0
        page_mbti.append(mbti)

        quote_box = block.find('div', attrs={'class': 'bbcode_container'})
        if quote_box is None:
            # Comment does not come with a reply box: take the whole text.
            page_comments.append(comment.get_text().strip())
            continue

        # Comment quotes another post: keep only the bare text nodes that
        # follow the reply box, skipping tags (<br/>, nested markup, ...).
        fragments = []
        for sibling in quote_box.next_siblings:
            if str(sibling) == '<br/>' or sibling.name is not None:
                continue
            fragment = sibling.strip()
            if fragment:
                fragments.append(fragment)

        # Join with a space: gluing the stripped fragments together directly
        # would merge the last word of one line with the first of the next.
        page_comments.append(' '.join(fragments))

    return page_mbti, page_comments

In [30]:
def get_subpages(link, timeout=30):
    '''Finds out the number of the last page of one thread.

    The first ``paginationList`` block of the page is inspected and the page
    number is read from the ``-<number>.html`` ending of its last link.

    Parameters
    ----------
    link : str
        Address of the thread.
    timeout : float, optional
        Seconds to wait for the server before requests gives up (default 30).

    Returns
    -------
    int
        Page number found in the last pagination link, or 1 when the page has
        no pagination block, the block has no links, or the last link does not
        end in ``-<number>.html``.
    '''
    response = requests.get(link, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    page_navigator = soup.find('div', attrs={'class': 'paginationList'})
    if page_navigator is None:
        # No pagination block found on the page: treat it as a single page
        # instead of falling through and returning None.
        return 1

    link_tags = page_navigator.find_all('a')
    if not link_tags:
        return 1

    last_link = link_tags[-1].get('href') or ''

    # A regex handles page numbers of any length; a fixed-width search for the
    # last hyphen breaks down once the number has four or more digits.
    match = re.search(r'-(\d+)\.html', last_link)
    if match is None:
        return 1

    return int(match.group(1))

In [31]:
def get_all_mbti_comments(all_thread_links):
    '''Collects the MBTI tags and comments of every page of every given thread.

    Parameters
    ----------
    all_thread_links : iterable of str
        Thread addresses; empty or None entries are skipped.

    Returns
    -------
    (personality, post) : tuple of two lists
        Parallel lists with one entry per scraped post, in the order the
        threads and their pages were visited.
    '''
    personality = []
    post = []

    for link in all_thread_links:
        if not link:
            continue

        # get_subpages may yield None for a thread without pagination;
        # treat that as a single page.
        num_subpages = get_subpages(link) or 1
        html_index = link.find('.html')

        # Page 1 is the thread link itself. Pages 2..N insert '-<page>' in
        # front of '.html'; N (the last page) is included.
        subpage_links = [link]
        if html_index != -1:
            subpage_links.extend(
                link[:html_index] + '-' + str(page_number) + link[html_index:]
                for page_number in range(2, num_subpages + 1)
            )

        for subpage_link in subpage_links:
            mbti, comments = get_one_page_comments(subpage_link)
            personality.extend(mbti)
            post.extend(comments)

    return personality, post

In [50]:
# Collect the thread links from the forum's index pages.
# NOTE: get_all_thread_links iterates range(start_page, end_page), so this call
# visits index1.html through index9.html; page 10 is not fetched.
forum_link = 'https://www.typologycentral.com/forums/forum22/'
scrapped_links = get_all_thread_links(forum_link, 1, 10)

In [None]:
# Visit every collected thread and gather the per-post MBTI tags and comment texts
# (one HTTP request per thread page, so this cell can take a long time).
mbti, comment = get_all_mbti_comments(scrapped_links)

In [None]:
# Number of MBTI entries collected.
len(mbti)

In [None]:
# Number of comments collected.
len(comment)

In [48]:
# Put the two parallel lists side by side: one row per post.
data = {"MBTI": mbti, "Post": comment}
df = pd.DataFrame(data)

In [49]:
# Persist the scraped posts. With default arguments to_csv also writes the
# DataFrame index as an unnamed first column.
# NOTE(review): the file name says Forum23 / 1_20, but the cells in this notebook
# scrape forum22 with page arguments 1 and 10 -- confirm which is intended.
df.to_csv("Forum23_1_20.csv")