In [1]:
import numpy as np
import pandas as pd
import os, itertools, csv
from bs4 import BeautifulSoup
import re
from pytrends.request import TrendReq
from datetime import *
import time

In [2]:
# Load the training set; the relative path assumes the notebook runs next to ./data.
df = pd.read_csv("./data/train.csv")

# Preview the first rows.
df.head()

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


In [19]:
def get_title(text: str) -> str:
    """Return the headline of an article page.

    The headline is the text of the first ``h1`` element whose class is
    ``title``.

    Args:
        text: Raw HTML of the page.

    Returns:
        The headline text, or an empty string when the page has no such
        element.
    """
    soup = BeautifulSoup(text, "html.parser")
    title = soup.find('h1', {'class': 'title'})
    # find() yields None when nothing matches; calling get_text() on it
    # would raise AttributeError, so report "no title" as an empty string.
    if title is None:
        return ''
    return title.get_text()
# test: headline of the first article
first_title = get_title(df['Page content'][0])
print(first_title)

NASA's Grand Challenge: Stop Asteroids From Destroying Earth


In [4]:
def get_category(text: str) -> list:
    """Collect the lower-cased link text of every category anchor in a page.

    An anchor is treated as a category link when its ``href`` attribute
    matches the regular expression ``/category/*``.

    Args:
        text: Raw HTML of the page.

    Returns:
        The lower-cased anchor texts, in the order the parser returns them.
    """
    soup = BeautifulSoup(text, "html.parser")
    category_href = re.compile('/category/*')
    anchors = soup.find_all('a', {'href': category_href})
    return [anchor.get_text().lower() for anchor in anchors]
        
# test: categories of article 2000
sample_categories = get_category(df['Page content'][2000])
print(sample_categories)

['climate', 'u.s.', 'world', 'warmest year']


In [5]:
def get_date(text: str) -> datetime:
    """Parse the publication timestamp from the first ``time`` element.

    The element text must start with a date and a clock time separated by
    whitespace (``YYYY-MM-DD HH:MM:SS``). Anything after those two tokens is
    ignored, so the returned datetime is naive.

    Args:
        text: Raw HTML of the page.

    Returns:
        The parsed timestamp.

    Raises:
        ValueError: If the page has no ``time`` element, or its text does not
            start with a ``YYYY-MM-DD HH:MM:SS`` timestamp.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(text, "html.parser")
    time_tag = soup.find("time")
    if time_tag is None:
        raise ValueError("no time element found in page")

    parts = time_tag.get_text().split()
    if len(parts) < 2:
        raise ValueError("time element does not contain 'YYYY-MM-DD HH:MM:SS'")

    return datetime.strptime(parts[0] + ' ' + parts[1], '%Y-%m-%d %H:%M:%S')

# test: shift the first article's timestamp by one day, then split date and time
next_day = get_date(df['Page content'][0]) + timedelta(days=1)
print(str(next_day).split(' '))

['2013-06-20', '15:04:30']


In [31]:
def preprocessor(text: str) -> str:
    """Turn an HTML document into a lower-cased, punctuation-free string.

    Steps:
      1. Strip the HTML markup and keep only the text.
      2. Pull emoticons such as ``:)``, ``:-P`` or ``=D`` out of the text.
      3. Lower-case the rest and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with their ``-`` nose removed) at the end,
         separated by spaces.

    Args:
        text: Raw HTML (or plain text) to clean.

    Returns:
        The cleaned text followed by a space and the space-joined emoticons.
        The separator is always present, so the result ends with a space
        when no emoticon was found.
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Raw strings: `\)`, `\(` and `\W` are regex escapes, not Python string
    # escapes; in a normal string literal they are invalid escape sequences.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-', '') removes the nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

# test: cleaned text of the first article
cleaned_first_page = preprocessor(df['Page content'][0])
print(cleaned_first_page)

 clara moskowitz for space com 2013 06 19 15 04 30 utc nasa s grand challenge stop asteroids from destroying earth there may be killer asteroids headed for earth and nasa has decided to do something about it the space agency announced a new grand challenge on june 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet the new mission builds on projects already underway at nasa including a plan to capture an asteroid pull it in toward the moon and send astronauts to visit it as part of the grand challenge the agency issued a request for information today aiming to solicit ideas from industry academia and the public on how to improve the asteroid mission plan we re asking for you to think about concepts and different approaches for what we ve described here william gerstenmaier nasa s associate administrator for human explorations and operations said yesterday during a nasa event announcing the initiative we want you to think about other ways of e

In [33]:
import string
import nltk

# Fetch the NLTK corpora needed by the stop-word list and the WordNet lemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop words; tokenizer_stem drops every token found in this list.
stop = nltk.corpus.stopwords.words('english')

def tokenizer_stem(text: str) -> list:
    """Tokenize ``text``, drop stop words and lemmatize what is left.

    Tokens come from NLTK's Treebank tokenizer. Every token that is not in
    the module-level ``stop`` list is passed through the WordNet lemmatizer.

    Args:
        text: The text to tokenize.

    Returns:
        The lemmatized tokens, in their original order.
    """
    words = nltk.tokenize.TreebankWordTokenizer().tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for word in words:
        if word in stop:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# test: tokens of the first article's lower-cased headline
first_title_lower = get_title(df['Page content'][0]).lower()
print(tokenizer_stem(first_title_lower))

['nasa', "'s", 'grand', 'challenge', ':', 'stop', 'asteroid', 'destroying', 'earth']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuwei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuwei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer

# Demo: build a bag-of-words model on two toy sentences, using the
# preprocessor and tokenizer_stem functions of this notebook.
doc_dummy = ["Study hard, then you will be happy and I will be happy", 
           "\"I'm not happy :(\" \", because you don't study hard"]
print('[example documents]\n{}\n'.format('\n'.join(doc_dummy)))

# ngram_range=(min,max), default: 1-gram => (1,1)
# Here (1, 2) is used, so both single tokens and token pairs become features.
count = CountVectorizer(ngram_range=(1, 2),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem)

count.fit(doc_dummy)
# dictionary is stored in vocabulary_
BoW = count.vocabulary_
print('[vocabulary]\n{}'.format(BoW))

# get matrix (doc_id, vocabulary_id) --> tf
doc_bag = count.transform(doc_dummy)
print('(did, vid)\ttf')
print(doc_bag)

# Check the container type before the conversion below.
print('\nIs document-term matrix a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

# NOTE: doc_bag is rebound here to the result of .toarray(); the statement
# order matters because the prints above and below inspect different objects.
doc_bag = doc_bag.toarray()
print(doc_bag)

print('\nAfter calling .toarray(), is it a scipy.sparse matrix? {}'.format(sp.sparse.issparse(doc_bag)))

[example documents]
Study hard, then you will be happy and I will be happy
"I'm not happy :(" ", because you don't study hard

[vocabulary]
{'study': 9, 'hard': 6, 'happy': 3, 'study hard': 10, 'hard happy': 8, 'happy happy': 4, ':': 1, '(': 0, 'happy study': 5, 'hard :': 7, ': (': 2}
(did, vid)	tf
  (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 5)	1
  (1, 6)	1
  (1, 7)	1
  (1, 9)	1
  (1, 10)	1

Is document-term matrix a scipy.sparse matrix? True
[[0 0 0 2 1 0 1 0 1 1 1]
 [1 1 1 1 0 1 1 1 0 1 1]]

After calling .toarray(), is it a scipy.sparse matrix? False


# Google Trends

In [15]:
# HTTP proxies handed to TrendReq by get_trend.
proxy_list = [
    'http://50.116.3.101:3128',
    'http://157.245.249.43:8080',
    'http://172.106.18.152:8080',
    'http://209.90.63.108:80',
]

def get_trend(word: list, t_start: str, t_end: str) -> list:
    """Fetch Google Trends interest-over-time for one or more keywords.

    Args:
        word: Keywords to query. A single string is also accepted and is
            treated as a one-element list.
        t_start: First day of the time frame, ``YYYY-MM-DD``.
        t_end: Last day of the time frame, ``YYYY-MM-DD``.

    Returns:
        The data frame returned by ``interest_over_time()`` (date index, one
        column per keyword plus an ``isPartial`` column) when it holds data,
        otherwise an empty list.
    """
    # build_payload iterates over its keyword argument, so a bare string
    # would be split into single characters; always hand it a real list.
    keywords = [word] if isinstance(word, str) else list(word)

    g_trends = TrendReq(tz=360, proxies=proxy_list)
    g_trends.build_payload(keywords, timeframe=t_start + ' ' + t_end, geo='', gprop='')
    trend = g_trends.interest_over_time()
    if trend.size > 0:
        print(trend.head())
        return trend.iloc[:, :]

    # Join the keywords first: concatenating a list with a str raises TypeError.
    print(', '.join(keywords) + " request fail.")
    return []

# test: five keywords over one month
sample_keywords = ['NFV', 'SDN', "google", "ocean", "car"]
trends = get_trend(sample_keywords, '2019-06-20', '2019-07-20')
print(trends)

            NFV  SDN  google  ocean  car  isPartial
date                                               
2019-06-20    0    0      98      3   23      False
2019-06-21    0    0      96      3   24      False
2019-06-22    0    0      83      3   25      False
2019-06-23    0    0      84      3   25      False
2019-06-24    0    0      98      3   23      False
            NFV  SDN  google  ocean  car  isPartial
date                                               
2019-06-20    0    0      98      3   23      False
2019-06-21    0    0      96      3   24      False
2019-06-22    0    0      83      3   25      False
2019-06-23    0    0      84      3   25      False
2019-06-24    0    0      98      3   23      False
2019-06-25    0    1     100      3   23      False
2019-06-26    0    0      98      3   23      False
2019-06-27    0    0      95      3   23      False
2019-06-28    0    0      93      3   24      False
2019-06-29    0    0      84      3   25      False
2019-06-30  

In [28]:
# Take the first 28 rows, cut them in two halves and show column 2 of each half.
t = trends.iloc[:28, :]
mid = t.shape[0] // 2
print(mid)
first_half, second_half = t.iloc[:mid, 2], t.iloc[mid:, 2]
print(first_half)
print(second_half)
print(t)

14
date
2019-06-20     98
2019-06-21     96
2019-06-22     83
2019-06-23     84
2019-06-24     98
2019-06-25    100
2019-06-26     98
2019-06-27     95
2019-06-28     93
2019-06-29     84
2019-06-30     82
2019-07-01     96
2019-07-02     97
2019-07-03     97
Name: google, dtype: int32
date
2019-07-04    92
2019-07-05    92
2019-07-06    83
2019-07-07    81
2019-07-08    98
2019-07-09    97
2019-07-10    96
2019-07-11    94
2019-07-12    95
2019-07-13    84
2019-07-14    81
2019-07-15    96
2019-07-16    98
2019-07-17    99
Name: google, dtype: int32
            NFV  SDN  google  ocean  car  isPartial
date                                               
2019-06-20    0    0      98      3   23      False
2019-06-21    0    0      96      3   24      False
2019-06-22    0    0      83      3   25      False
2019-06-23    0    0      84      3   25      False
2019-06-24    0    0      98      3   23      False
2019-06-25    0    1     100      3   23      False
2019-06-26    0    0      9

In [None]:
def is_trend(word: str, pub_time: datetime) -> bool:
    """Tell whether ``word`` peaked on Google Trends around ``pub_time``.

    Interest for ``word`` is fetched for the 180 days before and after
    ``pub_time``. The word counts as trending when one of the two samples at
    the centre of that window (i.e. around the publication date) reaches the
    base line ``min(mean + 2 * std, 100)`` of the whole window.

    Args:
        word: Single keyword to look up.
        pub_time: Publication time of the article.

    Returns:
        True when the keyword peaks around ``pub_time``. False otherwise,
        including when the keyword is blank, no trend data is returned, or
        the series is flat (a flat series has no peak).
    """
    if not word or not word.strip():
        return False

    half_window = timedelta(days=180)
    y_start = (pub_time - half_window).date().isoformat()
    y_end = (pub_time + half_window).date().isoformat()

    # get_trend works on a keyword list; pass exactly one keyword.
    y_trend = get_trend([word], y_start, y_end)
    if len(y_trend) == 0:
        return False

    # Keep only the keyword's own column: the frame also carries a boolean
    # 'isPartial' column that must not enter the statistics.
    series = y_trend.drop(columns=['isPartial'], errors='ignore').iloc[:, 0]
    if len(series) < 2:
        # The sample standard deviation is undefined for fewer than two points.
        return False

    y_mean = float(series.mean())
    y_std = float(series.std(ddof=1))
    if y_std == 0:
        return False

    # cal base line (Google Trends values are capped at 100)
    base_line = min(y_mean + y_std * 2, 100)

    # The window is symmetric around pub_time, so the publication date sits
    # at the centre of the series; inspect the two samples around it.
    mid = len(series) // 2
    around_publication = series.iloc[max(mid - 1, 0):mid + 1]
    return bool((around_publication >= base_line).any())

# Test: report "POP" as soon as one category of article 1 is trending
cats = get_category(df['Page content'][1])
print(cats)
if any(is_trend(cat, get_date(df['Page content'][1])) for cat in cats):
    print("POP")

In [None]:
# Labels of the first two articles.
first_labels = df['Popularity'].values[:2]
print(np.array(first_labels))

In [None]:
from sklearn.metrics import accuracy_score

# Predict 1 when any category of an article is trending, otherwise -1,
# for the first 100 articles, then score the predictions.
res = []

for i in range(100):
    print('Step: '+ str(i))
    cats = get_category(df['Page content'][i])
    for cat in cats:
        if is_trend(cat, get_date(df['Page content'][i])):
            res.append(1)
            print('1')
            break
    else:
        # The loop finished without a break: no category was trending.
        res.append(-1)
        print('-1')
print('Done')
pred = np.array(res)
print(pred)
labels_100 = np.array(df['Popularity'].values[:100])
print('Accuracy: %.2f' % accuracy_score(pred, labels_100))


In [None]:
from sklearn.metrics import auc
# Predict 1 when any word of an article's headline is trending, otherwise -1,
# for the first 20 articles, then score the predictions.
res = []

for i in range(20):
    print('Step: '+ str(i))
    cats = get_title(df['Page content'][i]).split(' ')
    print(cats)
    # any() stops at the first trending word, like the break it replaces.
    trending = any(is_trend(cat, get_date(df['Page content'][i])) for cat in cats)
    res.append(1 if trending else -1)
print('Done')
pred = np.array(res)
print(pred)
labels_20 = np.array(df['Popularity'].values[:20])
print('Accuracy: %.2f' % accuracy_score(pred, labels_20))