In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short identifier for this dataset; reused below to build the output directory and file name.
dataset_name = "online_news_popularity"

In [3]:
# --- Input: raw CSV location ---
inp_fname = 'OnlineNewsPopularity.csv'
input_dir = './data'

# --- Output: processed CSV, written to a directory named after the dataset ---
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(
    output_dir,
    f'{dataset_name}.csv',
)

# Read Data

In [14]:
# Load the raw CSV into a DataFrame and preview the first rows.
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path)
data.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [15]:
# (rows, columns) of the frame as loaded from the CSV
data.shape

(39644, 61)

In [16]:
# Names of the identifier column (created below) and of the prediction target column.
id_col, target_col = "id", "shares"

# Insert Id Column

In [17]:
# The url column could serve as an id, but its values are too lengthy, so we drop it
# and generate our own unique ids instead.
# errors='ignore' keeps this cell idempotent: re-running it after url has already been
# removed is a no-op instead of raising KeyError.
data = data.drop(columns=['url'], errors='ignore')

In [18]:
# Add a sequential integer id as the first column, unless an id column is already present
# (the guard makes the cell safe to re-run).
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Store ids as strings rather than integers.
data[id_col] = data[id_col].astype(str)

   id   timedelta   n_tokens_title   n_tokens_content   n_unique_tokens  \
0   0       731.0             12.0              219.0          0.663594   
1   1       731.0              9.0              255.0          0.604743   
2   2       731.0              9.0              211.0          0.575130   
3   3       731.0              9.0              531.0          0.503788   
4   4       731.0             13.0             1072.0          0.415646   

    n_non_stop_words   n_non_stop_unique_tokens   num_hrefs   num_self_hrefs  \
0                1.0                   0.815385         4.0              2.0   
1                1.0                   0.791946         3.0              1.0   
2                1.0                   0.663866         3.0              1.0   
3                1.0                   0.665635         9.0              0.0   
4                1.0                   0.540890        19.0             19.0   

    num_imgs  ...   min_positive_polarity   max_positive_polarity  \

# Preprocess

In [35]:
# Trim leading/trailing whitespace from every column name.
data.columns = data.columns.str.strip()

In [19]:
# Older articles have had more time to be shared, so timedelta is going to be highly correlated with shares.
# We therefore limit the data to rows where timedelta is at least 500, so that our model has to learn to use
# the other features to predict the number of shares.
# The threshold of 500 was chosen arbitrarily - there is no science behind it.

In [36]:
# Keep only rows whose timedelta is at least 500; copy so later edits do not touch a view.
is_old_enough = data['timedelta'] >= 500
data = data.loc[is_old_enough].copy()

In [38]:
# (rows, columns) remaining after the timedelta filter
data.shape

(12108, 61)

# Save Main Data File

In [41]:
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (exist_ok=True makes re-running the cell harmless).
os.makedirs(output_dir, exist_ok=True)

# Write the processed frame without the pandas index; the id column identifies rows.
data.to_csv(outp_fname, index=False)