In [24]:
import os
import pprint
import re

import numpy as np
import pandas as pd
import torch
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pp = pprint.PrettyPrinter(indent=2)

# Networking
import requests
from bs4 import BeautifulSoup

from threading import Thread
from typing import List

In [25]:
def orcas_to_raw_dataframe(path_file, encoding="utf-8"):
    """
    Read an ORCAS file into a DataFrame with one row per input line.

    Each line is expected to hold four fields separated by a tab or a single
    space: a numeric query id, the query text, a document id of the form
    ``D<digits>`` and the document URL.

    Parameters
    ----------
    path_file : str or path-like
        Location of the ORCAS file.
    encoding : str, default "utf-8"
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``q_id``, ``q``, ``url_id`` and ``url``; every value is a
        string. An empty file yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a non-blank line does not match the expected layout.
    """
    line_pattern = re.compile(r'(\d+)[\t ](.*)[\t ](D\d+)[\t ](.*)')
    records = []
    # The context manager closes the file even when a malformed line raises.
    with open(path_file, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            match = line_pattern.search(line)
            if match is None:
                if line.strip():
                    raise ValueError(
                        f"{path_file}: line {line_no} is not a valid ORCAS record: {line!r}"
                    )
                continue  # blank line (e.g. trailing newline at end of file)
            q_id, q, url_id, url = match.groups()
            # str.strip returns a new string; the result has to be kept.
            records.append((q_id, q.strip("\t "), url_id, url.strip("\t ")))
    # Build the frame once instead of appending row by row (quadratic).
    return pd.DataFrame(records, columns=['q_id', 'q', 'url_id', 'url'])


In [26]:
def orcas_to_agg_df(path, encoding="utf-8"):
    """
    Read an ORCAS file and aggregate it to one row per query.

    Each line is expected to hold four fields separated by a tab or a single
    space: a numeric query id, the query text, a document id of the form
    ``D<digits>`` and the document URL. Lines sharing the same
    ``(q_id, q)`` pair are merged into a single row.

    Parameters
    ----------
    path : str or path-like
        Location of the ORCAS file.
    encoding : str, default "utf-8"
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``q_id``, ``q`` and ``[url_id, url]``. The last column holds,
        for each query, the list of ``[url_id, url]`` pairs in file order.
        Rows appear in order of first occurrence of the query.

    Raises
    ------
    ValueError
        If a non-blank line does not match the expected layout.
    """
    line_pattern = re.compile(r'(\d+)[\t ](.*)[\t ](D\d+)[\t ](.*)')
    urls_by_query = {}  # (q_id, q) -> [[url_id, url], ...]; dict keeps insertion order
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            match = line_pattern.search(line)
            if match is None:
                if line.strip():
                    raise ValueError(
                        f"{path}: line {line_no} is not a valid ORCAS record: {line!r}"
                    )
                continue  # blank line (e.g. trailing newline at end of file)
            q_id, q, url_id, url = match.groups()
            # str.strip returns a new string; the result has to be kept so that
            # queries differing only in padding end up in the same group.
            q = q.strip("\t ")
            url = url.strip("\t ")
            urls_by_query.setdefault((q_id, q), []).append([url_id, url])

    # Build the frame once instead of appending row by row (quadratic).
    records = [(q_id, q, urls) for (q_id, q), urls in urls_by_query.items()]
    return pd.DataFrame(records, columns=['q_id', 'q', '[url_id, url]'])

In [27]:
# Load the ORCAS subset, aggregated to one row per (q_id, q) pair.
orcas_path = "./orcas_subset.tsv"
orcas_df = orcas_to_agg_df(orcas_path)
# NOTE(review): this preview is not the cell's last expression, so the
# notebook does not render it; only the count below is shown.
orcas_df.head(10)
# Number of distinct query ids in the aggregated frame.
orcas_df["q_id"].nunique()

133

## Extract data from URLs

In [28]:
class WebContentDownloader(Thread):
    """
    Thread that downloads the pages listed for each query row and stores the
    text of their ``<p>`` elements in ``<output_dir>/<q_id>.txt``.

    Parameters
    ----------
    explore_list : List
        Rows to process. Each row must support ``row['q_id']`` and
        ``row['[url_id, url]']``; the latter is an iterable of
        ``[url_id, url]`` pairs.
    output_dir : str, default "./content"
        Directory receiving one text file per row. Created if missing.
    timeout : float or None, default 10.0
        Per-request timeout in seconds passed to ``requests.get``. ``None``
        waits indefinitely.
    """

    def __init__(self, explore_list: List, output_dir: str = "./content",
                 timeout: float = 10.0) -> None:
        Thread.__init__(self)
        self.explore_list = explore_list
        self.output_dir = output_dir
        self.timeout = timeout

    def run(self):
        """
        Download every URL of every row and write one file per row.

        Downloading is best effort: a URL that fails is reported on stdout and
        skipped, and responses with a status other than 200 contribute no
        text. The row's file is always written, possibly empty.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        for row in self.explore_list:
            paragraphs = []
            for e in row['[url_id, url]']:
                url = e[1]
                try:
                    # Without a timeout a single stalled server blocks the
                    # thread forever.
                    response = requests.get(url, timeout=self.timeout)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        for p in soup.find_all("p"):
                            paragraphs.append(p.text + "\n")
                except Exception as exc:
                    # Report the failing URL and keep going; `Exception`
                    # (unlike a bare except) lets KeyboardInterrupt and
                    # SystemExit propagate.
                    print(f"Error downloading: {url} ({exc})")

            # Write once per row, after all of its URLs have been tried.
            out_path = os.path.join(self.output_dir, f"{row['q_id']}.txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write("".join(paragraphs))

In [29]:
# Sequentially download every URL of every query and store the text of the
# pages' <p> elements in ./content/<q_id>.txt (one file per query).
os.makedirs("./content", exist_ok=True)
for idx, row in orcas_df.iterrows():
    print(row['q'])
    content = ""
    for e in row['[url_id, url]']:
        url = e[1]
        try:
            # A timeout keeps one stalled server from hanging the whole run.
            r = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Best effort: report the failing URL and move on to the next one
            # instead of aborting the remaining queries.
            print(f"Error downloading: {url} ({exc})")
            continue
        print(r.status_code)

        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for p in soup.find_all("p"):
                content += p.text
                content += "\n"

    # Write content to file; `with` closes it even if the write fails.
    with open(f"./content/{row['q_id']}.txt", "w", encoding="utf-8") as f:
        f.write(content)

github
200
200
youtube
200
!
200
406
200
404
200
200
200
200
200
200
403
200
200
200
200
! c
200
! c++
200
! calculator
200
! definition
200
! excel
200
200
200
200
! icon
200
! in c
200
200
! in excel
200
200
200
200
200
200
! in excel formula
200
200
200
200
! in java
200
200
! in math
200
200
404
200
200
403
! in maths
403
200
403
! in python
200
200
200
! in statistics
200
200
! java
200
! javascript
200
! kc
200
! math
200
! math symbol
200
! meaning
200
200
! means
200
200
! spanish
200
! symbol
200
!! in javascript
200
200
!! javascript
200
!!! band
200
200
!!!lakshman
200
!!!m
200
!!'dio
200
!!-+- n
200
!!mm
200
!/rry
200
200
!0
200
!1
200
!5
451
!6
200
!888 poundpoundcentscents33
200
!=
200
403
200
!= arduino
200
!= c
200
200
!= c++
200
200
!= in c
200
200
!= in c++
200
200
!= in java
200
!= in python
200
200
200
!= java
200
200
!= javascript
200
!= meaning
200
!= meaning in python
200
200
!= null
200
!= operator
200
!= python
200
200
200
200
!= r
200
!= sql
200
!==
200
!== js