In [1]:
import pandas as pd
import numpy as np
from numpy import random
import nltk
nltk.download('wordnet')
import urllib
import requests
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to /Users/sameen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#Helper functions
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,} 

STOPWORDS = set(stopwords.words('english'))
ProcessedText = None

def GetDataFromURL(url_link):
    time.sleep(0.01)
    response = requests.get(url_link)
    soup = BeautifulSoup(response.text, "html.parser")
    remove_script(soup)
    text = soup.get_text()
    preprocessed_text = text
    #preprocessed_text = preprocess_text(text)
    return preprocessed_text

#Checks if bio_url is a valid faculty homepage
def is_valid_url(url_check):
    ret_url = 'NA'
    if url_check == 'NA':
        return ret_url
    if url_check.endswith('.pdf'): #we're not parsing pdfs
        return ret_url
    try:
        #sometimes the homepage url points to the same page as the faculty profile page
        #which should be treated differently from an actual homepage
        request=urllib.request.Request(url_check,None,headers)
        ret_url = urllib.request.urlopen(request).geturl() 
    except:
        return ret_url      #unable to access bio_url
    return ret_url


def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup

def scrapeURL(url_in):
    url_check = is_valid_url(url_in)
    if url_check != 'NA':
        ProcessedText = GetDataFromURL(url_check)
    else:
        ProcessedText = "NA"
    return ProcessedText



def preprocess_text(ExtractedText):
    ExtractedText = " ".join((re.sub(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', "EmailAddress", ExtractedText)).split())
    ExtractedText = " ".join((re.sub(r'^https?:\/\/.*[\r\n]*',"WebAddress", ExtractedText)).split())
    ExtractedText = ExtractedText.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    ExtractedText = re.sub('\s+',' ',ExtractedText)       #repalces repeated whitespace characters with single space
    ExtractedText = re.sub(r'\W',' ',ExtractedText) 
    ExtractedText = ExtractedText.replace("\n"," ")
    ExtractedText = ExtractedText.lower()
    ExtractedText = ' '.join(word for word in ExtractedText.split() if word not in STOPWORDS) # delete stopwors from text
    return ExtractedText

In [3]:
# faculty bios data provided in MP2.3
f = open("bios.txt", "r")
content = f.read()
extractedText = content.splitlines()
f.close()


In [4]:
for i in range(len(extractedText)):
    extractedText[i] = preprocess_text(extractedText[i])

In [5]:
Label = np.full(len(extractedText), "FacultyPage")

In [6]:
# test inputs
url = "https://docs.python.org/3/library/multiprocessing.html"
url1 = "https://cs.illinois.edu/about/people/department-faculty"
url2 = "https://cs.illinois.edu/about/people/department-faculty/sadve"

ProcessedText = scrapeURL(url)
ProcessedText1 = scrapeURL(url1)
ProcessedText2 = scrapeURL(url2)

ProcessedText = preprocess_text(ProcessedText)
ProcessedText1 = preprocess_text(ProcessedText1)
ProcessedText2 = preprocess_text(ProcessedText2)

ProcessedText = [ProcessedText]
ProcessedText1 = [ProcessedText1]
ProcessedText2 = [ProcessedText2]

In [7]:
df = pd.DataFrame(dict(Label=Label,extractedText=extractedText))
df.head()

Unnamed: 0,Label,extractedText
0,FacultyPage,govind p agrawal james c wyant professor optic...
1,FacultyPage,segev benzvi associate professor physics phd 4...
2,FacultyPage,dan bergstralh assistant professor biology phy...
3,FacultyPage,riccardo betti robert l mccrory professor mech...
4,FacultyPage,nicholas p bigelow lee dubridge professor phys...


In [8]:
# load scraped data
data = pd.read_csv("extracted_data_processed.csv",index_col=[0])
data.isna().sum()

index                         0
Faculty Directory Homepage    0
FacultyPage                   0
Label                         0
extractedText                 3
dtype: int64

In [9]:
# Drop clutter
data = data.dropna()
data = data.drop('Faculty Directory Homepage', 1)
data = data.drop('FacultyPage', 1)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
data = data[data['Label'] != "FacultyPage"]

In [11]:
df1 = df[1:1000]

In [12]:
data = data.append(df1)

In [13]:
dataFacDir = data[data['Label'] != "FacultyPage"]
dataFac = data[data['Label'] != "FacultyDirectoryPage"]

In [14]:
#dataFacDir = dataFacDir.drop('level_0', 1)
dataFacDir = dataFacDir.drop('index', 1)
#dataFac = dataFac.drop('level_0', 1)
dataFac = dataFac.drop('index', 1)

  
  after removing the cwd from sys.path.


In [15]:
dataFacDir

Unnamed: 0,Label,extractedText
0,FacultyDirectoryPage,csd faculty carnegie mellon university compute...
1,FacultyDirectoryPage,faculty stanford computer science skip skip co...
2,FacultyDirectoryPage,faculty computer science university illinois c...
3,FacultyDirectoryPage,computer science university chicago department...
4,FacultyDirectoryPage,purdue university department computer science ...
...,...,...
2627,NonFacultyPage,dominion news grassroots jump content network ...
2628,NonFacultyPage,altweeklies com aan association alternative ne...
2629,NonFacultyPage,dominion stories http www dominionpaper ca all...
2630,NonFacultyPage,acceptable acceptable appropriate representati...


In [16]:
dataFac.reset_index(inplace=True)
dataFacDir.reset_index(inplace=True)

In [17]:
dataFac

Unnamed: 0,index,Label,extractedText
0,1756,NonFacultyPage,machine learning classify urls urls features s...
1,1757,NonFacultyPage,multi class text classification model comparis...
2,1758,NonFacultyPage,dallas news breaking news dfw texas world news...
3,1759,NonFacultyPage,creme 2 0 cis regulatory module explorer creme...
4,1760,NonFacultyPage,data software tools broad institute skip main ...
...,...,...,...
1867,995,FacultyPage,lawrence brown miers busch professor professor...
1868,996,FacultyPage,larry shepp professor statistics contact infor...
1869,997,FacultyPage,chemical biomolecular engineering assistant pr...
1870,998,FacultyPage,chemical biomolecular engineering associate pr...


In [18]:
dataFac = dataFac.drop('index', 1)
dataFacDir = dataFacDir.drop('index', 1)

  """Entry point for launching an IPython kernel.
  


In [19]:
#Prepare for classification
X = dataFac.extractedText
y = dataFac.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
my_tags = ["NonFacultyPage","FacultyPage"]

In [20]:
# Prepare pipleine of variable for SGD Classifier to predit Faculty vs Non Faculty
sgdFac = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l1',alpha=1e-3, random_state=42, max_iter=10, tol=None)),
               ])
sgdFac.fit(X_train, y_train)

%time

y_pred = sgdFac.predict(X_test)

# Classification Report
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
report = classification_report(y_test, y_pred ,target_names=my_tags, output_dict=True)
clf_rpt = pd.DataFrame(report).transpose()
clf_rpt.to_csv('clf_rptFac.csv')

CPU times: user 3 µs, sys: 7 µs, total: 10 µs
Wall time: 4.29 µs
accuracy 0.9733096085409253
                precision    recall  f1-score   support

NonFacultyPage       0.98      0.97      0.97       300
   FacultyPage       0.97      0.98      0.97       262

      accuracy                           0.97       562
     macro avg       0.97      0.97      0.97       562
  weighted avg       0.97      0.97      0.97       562



In [21]:
#test predictions
y_pred = sgdFac.predict(ProcessedText)
y_pred1 = sgdFac.predict(ProcessedText1)


In [22]:
y_pred, y_pred1

(array(['NonFacultyPage'], dtype='<U14'), array(['FacultyPage'], dtype='<U14'))

In [23]:
X = dataFacDir.extractedText
y = dataFacDir.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
my_tags = ["NonFacultyPage","FacultyDirectoryPage"]

In [24]:
# Prepare pipleine of variable for SGD Classifier to predit Faculty directory vs Non Faculty
sgdFacDir = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l1',alpha=1e-3, random_state=42, max_iter=10, tol=None)),
               ])
sgdFacDir.fit(X_train, y_train)

%time

y_pred = sgdFacDir.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))
report = classification_report(y_test, y_pred ,target_names=my_tags, output_dict=True)
clf_rpt = pd.DataFrame(report).transpose()
clf_rpt.to_csv('clf_rptFacDir.csv')

CPU times: user 2 µs, sys: 7 µs, total: 9 µs
Wall time: 4.05 µs
accuracy 0.9592833876221498
                      precision    recall  f1-score   support

      NonFacultyPage       0.97      0.95      0.96       353
FacultyDirectoryPage       0.94      0.97      0.95       261

            accuracy                           0.96       614
           macro avg       0.96      0.96      0.96       614
        weighted avg       0.96      0.96      0.96       614



In [25]:
y_pred = sgdFac.predict(ProcessedText)
y_pred1 = sgdFac.predict(ProcessedText2)


In [26]:
y_pred, y_pred1

(array(['NonFacultyPage'], dtype='<U14'), array(['FacultyPage'], dtype='<U14'))

In [63]:
#Save models to harddisk
import pickle
with open('FacDir_classifier', 'wb') as picklefile:
    pickle.dump(sgdFacDir,picklefile)
with open('Fac_classifier', 'wb') as picklefile:
    pickle.dump(sgdFac,picklefile)