In [1]:
import numpy as np
import pandas as pd
import os, itertools, csv
from bs4 import BeautifulSoup
import re
from pytrends.request import TrendReq
from datetime import *
import time

# load the dataset
dataset = pd.read_csv('./data/train.csv')
dataset.head()

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


In [2]:
dataset['content'] = dataset['Page content'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
# dataset[['Page content','content']].head()
dataset['word_count'] = dataset['content'].apply(lambda x: len(str(x).split(" ")))
dataset[['content','word_count']].head()

Unnamed: 0,content,word_count
0,Clara Moskowitz for Space.com 2013-06-19 15:0...,607
1,By Christina Warren2013-03-28 17:40:55 UTCGoog...,341
2,By Sam Laird2014-05-07 19:15:20 UTCBallin': 20...,1412
3,By Sam Laird2013-10-11 02:26:50 UTCCameraperso...,490
4,By Connor Finnegan2014-04-17 03:31:43 UTCNFL S...,1999


In [3]:
dataset.word_count.describe()

count    27643.000000
mean       609.997793
std        495.917809
min         37.000000
25%        286.000000
50%        470.000000
75%        784.000000
max       9551.000000
Name: word_count, dtype: float64

In [4]:
# Identify common words
freq = pd.Series(' '.join(dataset['content']).split()).value_counts()[:20]
freq

the       656835
to        374557
a         336374
of        334089
and       313973
in        231056
on        144661
for       141448
is        131023
that      124579
The       109596
with       95561
you        76430
as         74429
Image:     73433
at         68962
it         67840
—          60907
from       60347
be         59647
dtype: int64

In [5]:
# Libraries for text preprocessing
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"] 
stop_words = stop_words.union(new_words)

corpus = []
for i in range(0, dataset.shape[0]):
    text = re.sub('[^a-zA-Z]', ' ', dataset['content'][i])
    text = text.lower()
    
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    text=re.sub("(\\d|\\W)+"," ",text)
    text = text.split()
    
    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in  
            stop_words] 
    text = " ".join(text)
    corpus.append(text)
    
corpus[3010]

'neha prakash utctaylor swift lorde biggest fan againlorde performs american music award nov los angeles nothing say bffs eva n eva fangirling besties american music award performance amirite lorde head banging ama performance yellow flicker beat hunger game soundtrack taylor swift perfectly played part best friend see lorde compare meeting pitbull meeting president obama lorde usual exorcism meet dancing thing stage sunday bonus point smearing lipstick face taylor busy losing marble year old performance get u wrong lorde royal something work lorde taylor bow joke creativity lorde taylor swift image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image imgur courtesy abc image courtesy abc sure lorde dementor amas professor snape snape november lorde grim reaper energy anyone performed pettyonc edition bleushock november lorde smeared lipstic

In [6]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# % matplotlib inline
# wordcloud = WordCloud(
#                           background_color='white',
#                           stopwords=stop_words,
#                           max_words=100,
#                           max_font_size=50, 
#                           random_state=42
#                          ).generate(str(corpus))
# print(wordcloud)
# fig = plt.figure(1)
# plt.imshow(wordcloud)
# plt.axis('off')
# plt.show()
# fig.savefig("./data/word.png", dpi=900)

<wordcloud.wordcloud.WordCloud object at 0x0000013E018410F0>


<Figure size 640x480 with 1 Axes>

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import re

cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=2**20, ngram_range=(1,3))
X = cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]

['clara',
 'moskowitz',
 'space',
 'com',
 'utc',
 'nasa',
 'grand',
 'challenge',
 'stop',
 'asteroid']

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)
# get feature names
feature_names=cv.get_feature_names()

In [9]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

keyword_list = []

for i in range(dataset.shape[0]):
    doc = corpus[i]

    #generate tf-idf for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names,sorted_items,10)
    keyword_list.append((list(keywords.values())))

In [10]:
dataset['keywords'] = keyword_list

In [11]:
dataset['keywords'][0]

[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.108, 0.103, 0.084, 0.084]

In [12]:
def get_category(text: str):
    res = list()
    soup = BeautifulSoup(text, "html.parser")
    cats = soup.find_all('a', {'href': re.compile('/category/*')})
    for cat in cats:
        res.append(cat.get_text().lower())
    return " ".join(res)

res = []
for i in range(dataset.shape[0]):
    res.append(get_category(dataset['Page content'][i]))

In [13]:
dataset['category'] = res

In [14]:
dataset['category'][0]

'asteroid asteroids challenge earth space u.s. world'

In [15]:
count = CountVectorizer(ngram_range=(1, 1))

count.fit(res)
doc_bag = count.transform(res)

doc_bag = doc_bag.toarray()
print(doc_bag)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
dataset['category'] = list(doc_bag)

In [17]:
dataset.head()

Unnamed: 0,Id,Popularity,Page content,content,word_count,keywords,category
0,0,-1,"<html><head><div class=""article-info""> <span c...",Clara Moskowitz for Space.com 2013-06-19 15:0...,607,"[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,1,"<html><head><div class=""article-info""><span cl...",By Christina Warren2013-03-28 17:40:55 UTCGoog...,341,"[0.326, 0.326, 0.32, 0.306, 0.218, 0.205, 0.19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2014-05-07 19:15:20 UTCBallin': 20...,1412,"[0.414, 0.185, 0.137, 0.136, 0.077, 0.076, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,-1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2013-10-11 02:26:50 UTCCameraperso...,490,"[0.146, 0.138, 0.093, 0.092, 0.092, 0.091, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,-1,"<html><head><div class=""article-info""><span cl...",By Connor Finnegan2014-04-17 03:31:43 UTCNFL S...,1999,"[0.338, 0.32, 0.104, 0.097, 0.088, 0.085, 0.07...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
res = []
for i in range(dataset.shape[0]):
    unique_rate = len(set(dataset['content'][i].split(' '))) / len(dataset['content'][i].split(' '))
    res.append(unique_rate)
    
dataset['unique'] = res

In [19]:
dataset.head()

Unnamed: 0,Id,Popularity,Page content,content,word_count,keywords,category,unique
0,0,-1,"<html><head><div class=""article-info""> <span c...",Clara Moskowitz for Space.com 2013-06-19 15:0...,607,"[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.61285
1,1,1,"<html><head><div class=""article-info""><span cl...",By Christina Warren2013-03-28 17:40:55 UTCGoog...,341,"[0.326, 0.326, 0.32, 0.306, 0.218, 0.205, 0.19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.595308
2,2,1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2014-05-07 19:15:20 UTCBallin': 20...,1412,"[0.414, 0.185, 0.137, 0.136, 0.077, 0.076, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.504249
3,3,-1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2013-10-11 02:26:50 UTCCameraperso...,490,"[0.146, 0.138, 0.093, 0.092, 0.092, 0.091, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.493878
4,4,-1,"<html><head><div class=""article-info""><span cl...",By Connor Finnegan2014-04-17 03:31:43 UTCNFL S...,1999,"[0.338, 0.32, 0.104, 0.097, 0.088, 0.085, 0.07...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.394697


In [20]:
def get_weekday(text: str) -> int: #input html format
    soup = BeautifulSoup(text)
    selector = "time"
    date = [i.text for i in soup.select(selector)][0]
    #print(soup.select(selector))
    #print(date)

    try:
        date = date.split()
        date = str(date[0] + ' ' + date[1])
    #print(date)

        d = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        d = d.weekday()+1
    #print(d)
    except:
        d = 1
        
    return d

res = []
for i in range(dataset.shape[0]):
    res.append(get_weekday(dataset['Page content'][i]))
    
dataset['weekday'] = res

In [21]:
dataset.head()

Unnamed: 0,Id,Popularity,Page content,content,word_count,keywords,category,unique,weekday
0,0,-1,"<html><head><div class=""article-info""> <span c...",Clara Moskowitz for Space.com 2013-06-19 15:0...,607,"[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.61285,3
1,1,1,"<html><head><div class=""article-info""><span cl...",By Christina Warren2013-03-28 17:40:55 UTCGoog...,341,"[0.326, 0.326, 0.32, 0.306, 0.218, 0.205, 0.19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.595308,4
2,2,1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2014-05-07 19:15:20 UTCBallin': 20...,1412,"[0.414, 0.185, 0.137, 0.136, 0.077, 0.076, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.504249,3
3,3,-1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2013-10-11 02:26:50 UTCCameraperso...,490,"[0.146, 0.138, 0.093, 0.092, 0.092, 0.091, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.493878,5
4,4,-1,"<html><head><div class=""article-info""><span cl...",By Connor Finnegan2014-04-17 03:31:43 UTCNFL S...,1999,"[0.338, 0.32, 0.104, 0.097, 0.088, 0.085, 0.07...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.394697,4


In [22]:
def get_title(text: str) -> str:    
    soup = BeautifulSoup(text, "html.parser")
    title = soup.find('h1',{'class':'title'})
    return title.get_text()

res = []
for i in range(dataset.shape[0]):
    time = 0
    for j in get_title(dataset['Page content'][i]).lower().split(" "):
        content = dataset['Page content'][i].lower().split(" ")
        for k in content:
            if j == k:
                time = time+1
        
    res.append(time/dataset['word_count'][i])
    
dataset['title_freq'] = res

In [23]:
dataset.head(10)

Unnamed: 0,Id,Popularity,Page content,content,word_count,keywords,category,unique,weekday,title_freq
0,0,-1,"<html><head><div class=""article-info""> <span c...",Clara Moskowitz for Space.com 2013-06-19 15:0...,607,"[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.61285,3,0.037891
1,1,1,"<html><head><div class=""article-info""><span cl...",By Christina Warren2013-03-28 17:40:55 UTCGoog...,341,"[0.326, 0.326, 0.32, 0.306, 0.218, 0.205, 0.19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.595308,4,0.090909
2,2,1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2014-05-07 19:15:20 UTCBallin': 20...,1412,"[0.414, 0.185, 0.137, 0.136, 0.077, 0.076, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.504249,3,0.042493
3,3,-1,"<html><head><div class=""article-info""><span cl...",By Sam Laird2013-10-11 02:26:50 UTCCameraperso...,490,"[0.146, 0.138, 0.093, 0.092, 0.092, 0.091, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.493878,5,0.010204
4,4,-1,"<html><head><div class=""article-info""><span cl...",By Connor Finnegan2014-04-17 03:31:43 UTCNFL S...,1999,"[0.338, 0.32, 0.104, 0.097, 0.088, 0.085, 0.07...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.394697,4,0.010505
5,5,-1,"<html><head><div class=""article-info""> <span c...",Brendan Greeley for Bloomberg 2013-11-21 18:0...,835,"[0.357, 0.251, 0.208, 0.196, 0.15, 0.134, 0.12...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.586826,4,0.079042
6,6,1,"<html><head><div class=""article-info""><span cl...",By Brian Anthony Hernandez2014-08-11 05:00:18 ...,141,"[0.308, 0.308, 0.303, 0.255, 0.182, 0.182, 0.1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.787234,1,0.191489
7,7,-1,"<html><head><div class=""article-info""><span cl...",By Sandra Gonzalez2014-11-20 00:30:41 UTCBill ...,317,"[0.511, 0.246, 0.158, 0.132, 0.129, 0.112, 0.1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.646688,4,0.135647
8,8,1,"<html><head><div class=""article-info""><span cl...",By Sara Afzal2013-09-30 04:30:01 UTCVending Ma...,319,"[0.414, 0.323, 0.315, 0.267, 0.241, 0.216, 0.1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.673981,1,0.100313
9,9,-1,"<html><head><div class=""article-info""><span cl...",By Jason Abbruzzese2014-02-06 15:49:35 UTCOnli...,418,"[0.295, 0.217, 0.141, 0.136, 0.133, 0.133, 0.1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.624402,4,0.057416


In [29]:
tmp = dataset.drop(columns=['Page content', 'content'])
tmp.head()

Unnamed: 0,Id,Popularity,word_count,keywords,category,unique,weekday,title_freq
0,0,-1,607,"[0.463, 0.284, 0.177, 0.16, 0.124, 0.114, 0.10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.61285,3,0.037891
1,1,1,341,"[0.326, 0.326, 0.32, 0.306, 0.218, 0.205, 0.19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.595308,4,0.090909
2,2,1,1412,"[0.414, 0.185, 0.137, 0.136, 0.077, 0.076, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.504249,3,0.042493
3,3,-1,490,"[0.146, 0.138, 0.093, 0.092, 0.092, 0.091, 0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.493878,5,0.010204
4,4,-1,1999,"[0.338, 0.32, 0.104, 0.097, 0.088, 0.085, 0.07...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.394697,4,0.010505


In [31]:
tmp.to_csv('./data/train_1.csv')

In [24]:
# X = []
# y = []
# for i in range(dataset.shape[0]):
#     tmp = []
#     tmp.append(dataset['title_freq'][i])
#     tmp.append(dataset['weekday'][i])
#     tmp.append(dataset['unique'][i])
#     tmp.append(dataset['word_count'][i])
#     tmp.extend(dataset['keywords'][i])
#     tmp.extend(dataset['category'][i])
#     X.append(tmp)
#     y.append(dataset['Popularity'][i])

In [25]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=0)