In [None]:
import os
import re
import sys
#sys.path.append("..")

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Project root: the directory containing 'url_extraction.ipynb', with the
# relative name resolved against the current working directory.
proj_path = os.path.dirname(os.path.realpath('url_extraction.ipynb'))

# Sub-directories of the project root used by the notebook.
data_path, results_path, model_path = (
    os.path.join(proj_path, subdir) for subdir in ('data/', 'results/', 'models/')
)

# Input CSV locations, then the frames themselves (train is read first).
train_path, test_path = (
    os.path.join(data_path, filename) for filename in ('train.csv', 'test.csv')
)
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# The 'category' column of the training frame as a plain Python list.
train_labels = train_data['category'].tolist()

In [None]:
# Optional: uncomment the two lines below to restrict both frames to their
# first 10 rows (e.g. for a quick trial run of the cells that follow).
#train_data = train_data.head(10)
#test_data = test_data.head(10)

In [None]:
# Download every training URL and keep the visible page text and HTTP status.
#
# Adds two columns to train_data:
#   url_status - HTTP status code of the response, or 1 when the request failed
#   url_text   - page text with <script>/<style> content removed ('' on failure)

# Browser-like User-Agent sent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
# Seconds to wait on a server before giving up. Without a timeout a single
# unresponsive host blocks the whole loop indefinitely.
request_timeout = 30

url_text = list()
url_status = list()
total_rows = len(train_data.index)
p_count = 0   # rows processed so far
e_count = 0   # rows whose request raised

# enumerate() supplies the progress counter so it stays correct even when the
# frame's index is not the default 0..n-1 range.
for position, (row_label, article) in enumerate(train_data.iterrows(), start=1):
    url = article['url']
    try:
        page = requests.get(url, headers=headers, timeout=request_timeout)
        pagetext = page.text
        pagecode = page.status_code
    except Exception:
        # Best effort: any request failure is recorded as status 1 with empty
        # text. Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so the crawl can still be stopped by hand.
        pagetext = ""
        pagecode = 1
        e_count += 1
    p_count += 1
    print('{} of {} Processed: {}'.format(position, total_rows, url))

    # Drop script and style elements so only the readable text remains.
    soup = BeautifulSoup(pagetext, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    url_text.append(soup.get_text())
    url_status.append(pagecode)


train_data['url_status'] = url_status
train_data['url_text'] = url_text

train_data.head(10)

In [None]:
# Download every test URL and keep the visible page text and HTTP status.
# Uses the `headers` dict defined in the training-download cell.
#
# Adds two columns to test_data:
#   url_status - HTTP status code of the response, or 1 when the request failed
#   url_text   - page text with <script>/<style> content removed ('' on failure)

# Seconds to wait on a server before giving up. Without a timeout a single
# unresponsive host blocks the whole loop indefinitely.
request_timeout = 30

url_text = list()
url_status = list()
total_rows = len(test_data.index)
p_count = 0   # rows processed so far
e_count = 0   # rows whose request raised

# enumerate() supplies the progress counter so it stays correct even when the
# frame's index is not the default 0..n-1 range.
for position, (row_label, article) in enumerate(test_data.iterrows(), start=1):
    url = article['url']
    try:
        page = requests.get(url, headers=headers, timeout=request_timeout)
        pagetext = page.text
        pagecode = page.status_code
    except Exception:
        # Best effort: any request failure is recorded as status 1 with empty
        # text. Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so the crawl can still be stopped by hand.
        pagetext = ""
        pagecode = 1
        e_count += 1
    p_count += 1
    print('{} of {} Processed: {}'.format(position, total_rows, url))

    # Drop script and style elements so only the readable text remains.
    soup = BeautifulSoup(pagetext, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    url_text.append(soup.get_text())
    url_status.append(pagecode)


test_data['url_status'] = url_status
test_data['url_text'] = url_text

test_data.head(10)

In [None]:
import nltk
import time
import string

In [None]:
#Process Training Data
#
# For every training row this builds:
#   processed_titles - title tokens, lower-cased, stop words and single
#                      punctuation characters removed, Porter-stemmed
#   processed_texts  - the same treatment of url_text, or '' when the page
#                      was not fetched with status 200
#   year_of_publish  - calendar year (UTC) derived from the timestamp column
# The frame is then narrowed to the modelling columns and written to CSV.

total_rows = len(train_data.index)

# Build the stemmer and the stop list once instead of twice per row; a set
# gives constant-time membership tests.
porter = nltk.PorterStemmer()
stop = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)

processed_titles = list()
processed_texts = list()
year_of_publish = list()
for position, (row_label, article) in enumerate(train_data.iterrows(), start=1):

    #process title
    # A missing title arrives as NaN (a float); treat it as empty text.
    title = article['title'] if isinstance(article['title'], str) else ''
    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # filtering first would let capitalised stop words such as "The" through.
    tokens = [t.lower() for t in nltk.wordpunct_tokenize(title)]
    tokens = [porter.stem(t) for t in tokens if t not in stop]
    processed_titles.append(' '.join(tokens))

    #process text
    # Only pages fetched successfully are tokenised; anything else yields ''.
    if article['url_status'] == 200 and isinstance(article['url_text'], str):
        tokens = [t.lower() for t in nltk.wordpunct_tokenize(article['url_text'])]
        tokens = [porter.stem(t) for t in tokens if t not in stop]
        processed_text = ' '.join(tokens)
    else:
        processed_text = ''
    processed_texts.append(processed_text)

    #process year
    # The timestamp is divided by 1000 before conversion - presumably it is
    # in epoch milliseconds; confirm against the data source.
    published = time.gmtime(article['timestamp'] / 1000.)
    year_of_publish.append(published.tm_year)

    print('{} of {} Processed: {}'.format(position, total_rows, article['url']))

train_data['processed_titles'] = processed_titles
train_data['processed_texts'] = processed_texts
train_data['year_of_publish'] = year_of_publish

# NOTE: train_data is rebound to the reduced frame, so this cell cannot be
# re-run without first re-running the cells that load and download the data.
col_names = ['article_id','publisher','year_of_publish','processed_titles','url_status','processed_texts','category']
train_data = train_data[col_names]
# Write next to the input CSVs, using the same data_path they were read from.
train_data.to_csv(os.path.join(data_path, 'train_url_extracted_and_processed.csv'))
train_data.head()

In [None]:
#Process Test Data
#
# For every test row this builds:
#   processed_titles - title tokens, lower-cased, stop words and single
#                      punctuation characters removed, Porter-stemmed
#   processed_texts  - the same treatment of url_text, or '' when the page
#                      was not fetched with status 200
#   year_of_publish  - calendar year (UTC) derived from the timestamp column
# The frame is then narrowed to the modelling columns and written to CSV.

total_rows = len(test_data.index)

# Build the stemmer and the stop list once instead of twice per row; a set
# gives constant-time membership tests.
porter = nltk.PorterStemmer()
stop = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)

processed_titles = list()
processed_texts = list()
year_of_publish = list()
for position, (row_label, article) in enumerate(test_data.iterrows(), start=1):

    #process title
    # A missing title arrives as NaN (a float); treat it as empty text.
    title = article['title'] if isinstance(article['title'], str) else ''
    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # filtering first would let capitalised stop words such as "The" through.
    tokens = [t.lower() for t in nltk.wordpunct_tokenize(title)]
    tokens = [porter.stem(t) for t in tokens if t not in stop]
    processed_titles.append(' '.join(tokens))

    #process text
    # Only pages fetched successfully are tokenised; anything else yields ''.
    if article['url_status'] == 200 and isinstance(article['url_text'], str):
        tokens = [t.lower() for t in nltk.wordpunct_tokenize(article['url_text'])]
        tokens = [porter.stem(t) for t in tokens if t not in stop]
        processed_text = ' '.join(tokens)
    else:
        processed_text = ''
    processed_texts.append(processed_text)

    #process year
    # The timestamp is divided by 1000 before conversion - presumably it is
    # in epoch milliseconds; confirm against the data source.
    published = time.gmtime(article['timestamp'] / 1000.)
    year_of_publish.append(published.tm_year)

    print('{} of {} Processed: {}'.format(position, total_rows, article['url']))

test_data['processed_titles'] = processed_titles
test_data['processed_texts'] = processed_texts
test_data['year_of_publish'] = year_of_publish

# NOTE: test_data is rebound to the reduced frame, so this cell cannot be
# re-run without first re-running the cells that load and download the data.
col_names = ['article_id','publisher','year_of_publish','processed_titles','url_status','processed_texts']
test_data = test_data[col_names]
# Write next to the input CSVs, using the same data_path they were read from.
test_data.to_csv(os.path.join(data_path, 'test_url_extracted_and_processed.csv'))
test_data.head()