In [3]:
from bs4 import BeautifulSoup
import re 
import urllib
import numpy as np
import time
import requests
import nltk 
import string
import pandas as pd
from nltk.corpus import stopwords
import heapq
import csv

In [4]:
#Helper Functions
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,} 

STOPWORDS = set(stopwords.words('english'))

#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

def GetDataFromURL(url_link):
    time.sleep(0.01)
    response = requests.get(url_link)
    soup = BeautifulSoup(response.text, "html.parser")
    remove_script(soup)
    text = soup.get_text()
    preprocessed_text = text
    #preprocessed_text = preprocess_text(text)
    return preprocessed_text

#Checks if bio_url is a valid faculty homepage
def is_valid_url(url_check):
    ret_url = 'NA'
    if url_check == 'NA':
        return ret_url
    if url_check.endswith('.pdf'): #we're not parsing pdfs
        return ret_url
    try:
        #sometimes the homepage url points to the same page as the faculty profile page
        #which should be treated differently from an actual homepage
        request= urllib.request.Request(url_check,None,headers)
        ret_url = urllib.request.urlopen(request).geturl() 
    except:
        return ret_url      #unable to access bio_url
    return ret_url


def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup

def scrapeURL(iStart,iEnd):
    for i in range(iStart,iEnd):
        url_check = is_valid_url(urls[i])
        print(f"Iteration: {i} url: {urls[i]}")
        if url_check != 'NA':
            ProcessedText[i] = GetDataFromURL(url_check)
        else:
            ProcessedText[i] = "NA"



def preprocess_text(ExtractedText):
    ExtractedText = " ".join((re.sub(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', "EmailAddress", ExtractedText)).split())
    ExtractedText = " ".join((re.sub(r'^https?:\/\/.*[\r\n]*',"WebAddress", ExtractedText)).split())
    ExtractedText = ExtractedText.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    ExtractedText = re.sub('\s+',' ',ExtractedText)       #repalces repeated whitespace characters with single space
    ExtractedText = re.sub(r'\W',' ',ExtractedText) 
    ExtractedText = ExtractedText.replace("\n"," ")
    
    ExtractedText = ExtractedText.lower()
    ExtractedText = ' '.join(word for word in ExtractedText.split() if word not in STOPWORDS) # delete stopwors from text
    return ExtractedText

In [5]:
#Load DataSheet
data = pd.read_csv("LoadSheet.csv", encoding = "latin1")
data.isna().sum()

Faculty Directory Homepage    6
FacultyPage                   0
Label                         0
dtype: int64

In [6]:
#Replace nulls with NA
data.replace(np.nan, "NA", inplace=True)

In [7]:
data.head()

Unnamed: 0,Faculty Directory Homepage,FacultyPage,Label
0,https://www.csd.cs.cmu.edu/directory/faculty,1,FacultyDirectoryPage
1,https://www.cs.stanford.edu/directory/faculty,1,FacultyDirectoryPage
2,https://cs.uic.edu/faculty-staff/faculty/,1,FacultyDirectoryPage
3,https://www.cs.uchicago.edu/people/faculty/,1,FacultyDirectoryPage
4,https://cs.vt.edu/People/Faculty.html,1,FacultyDirectoryPage


In [8]:
#Prepare for web scraping
urls = []
for index, row in data.iterrows():
    urls.append(row["Faculty Directory Homepage"])
ProcessedText = ['a']*len(urls)

In [9]:
len(ProcessedText)

2815

In [10]:
#Scraping Function
scrapeURL(0,len(ProcessedText))

Iteration: 0 url: https://www.csd.cs.cmu.edu/directory/faculty
Iteration: 1 url: https://www.cs.stanford.edu/directory/faculty
Iteration: 2 url: https://cs.uic.edu/faculty-staff/faculty/


KeyboardInterrupt: 

In [11]:
#add scraped data to datafram
data.insert(3, 'extractedText', ProcessedText)

In [17]:
#Check n remove nulls, remove values having less than 10 characters
data.isna().sum()

Faculty Directory Homepage      0
FacultyPage                     0
Label                           0
extractedText                 301
dtype: int64

In [18]:
data = data.dropna()

In [20]:
data = data[data['extractedText'].str.len()>= 10]

In [21]:
#Save to harddisk
data.to_csv('extracted_data_unprocessed_latest.csv')

In [24]:
#optional
data = pd.read_csv("extracted_data_unprocessed_latest.csv",index_col=[0])
data
data.reset_index(inplace=True)

In [26]:
# apply preprocessing
for i in range(len(data["extractedText"])):
    data["extractedText"][i] = preprocess_text(data["extractedText"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
data.head()

Unnamed: 0,index,Faculty Directory Homepage,FacultyPage,Label,extractedText
0,0,https://www.csd.cs.cmu.edu/directory/faculty,1,FacultyDirectoryPage,csd faculty carnegie mellon university compute...
1,1,https://www.cs.stanford.edu/directory/faculty,1,FacultyDirectoryPage,faculty stanford computer science skip skip co...
2,2,https://cs.uic.edu/faculty-staff/faculty/,1,FacultyDirectoryPage,faculty computer science university illinois c...
3,3,https://www.cs.uchicago.edu/people/faculty/,1,FacultyDirectoryPage,computer science university chicago department...
4,5,https://www.cs.purdue.edu/people/faculty/index...,1,FacultyDirectoryPage,purdue university department computer science ...


In [33]:
#save to hard disk
data.to_csv('extracted_data_processed.csv')