Competition on Kaggle: https://www.kaggle.com/c/how-good-is-your-medium-article/leaderboard

In [46]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.


In [6]:
import os
import codecs
import json
import seaborn as sns
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from datetime import date

In [7]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [8]:
def read_json_line(line=None):
    result = None
    try:        
        result = json.loads(line)
    except Exception as e:      
        # Find the offending character index:
        idx_to_replace = int(str(e).split(' ')[-1].replace(')',''))      
        # Remove the offending character:
        new_line = list(line)
        new_line[idx_to_replace] = ' '
        new_line = ''.join(new_line)     
        return read_json_line(line=new_line)
    return result

In [9]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True):
    ident = 0
    features = ['_id', 'content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'
    feature_files = [open(os.path.join(path_to_data,
                                       '{}_{}.txt'.format(prefix, feat)),
                          'w', encoding="utf-8")
                     for feat in features]
 
    with open(os.path.join(path_to_data, inp_filename), 
              encoding='utf-8') as inp_json_file:
    
        for line in tqdm_notebook(inp_json_file):
            dict_data = {}
            json_data = read_json_line(line)
            for i in range(len(feature_files)):
                dict_data[ident] = json_data[features[i]]
                feature_files[i].write(str(dict_data) + '\n')
            ident += 1
    for i in feature_files:
        i.close()

In [10]:
PATH_TO_DATA = 'C:\mlcourse\Databases\kaggle_medium'

In [6]:
extract_features_and_write(PATH_TO_DATA, 'train.json', is_train=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
extract_features_and_write(PATH_TO_DATA, 'test.json', is_train=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour, whether it's morning, day, night, whether it's a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

#### Extacting data

In [11]:
prefix = 'train'
features = ['_id', 'content', 'published', 'title', 'author']

l_files = [os.path.join(PATH_TO_DATA,
                                       '{}_{}.txt'.format(prefix, feat))   
                     for feat in features]

In [12]:
%%time
df_all_data = pd.DataFrame({})
for k in range(0, len(features)):
    f = codecs.open(l_files[k], "r", "utf-8")
    arr = []
    count = 0
    for line in f:
        count += 1
        arr.append(list(eval(line[:-2]).values())[0])
    f.close()
    ser_buf = pd.Series(arr)
    df_all_data[features[k]] = ser_buf

Wall time: 27min 11s


In [13]:
df_all_data['content'] = df_all_data['content'].map(strip_tags)

Adding a popularity column

In [14]:
data = pd.read_csv('C:\mlcourse\Databases\kaggle_medium/train_log1p_recommends.csv', sep=',')

In [15]:
df_all_data = df_all_data.join(data)

## Strip content: \xa0, \u200b

In [16]:
dddata = df_all_data.copy(deep=True)

In [25]:
df_all_data = dddata.copy(deep=True)

## Division publications by hour, whether it's morning, day, night, whether it's a weekend


In [26]:
df_all_data.head()

Unnamed: 0,_id,content,published,title,author,id,log_recommends
0,https://medium.com/policy/medium-terms-of-serv...,"Medium Everyone’s stories and ideas Aug 13, 20...",{'$date': '2012-08-13T22:54:53.510Z'},Medium Terms of Service – Medium Policy – Medium,"{'name': None, 'url': 'https://medium.com/@Med...",8,9.01201
1,https://medium.com/policy/amendment-to-medium-...,"Medium Everyone’s stories and ideas Aug 2, 201...",{'$date': '2015-08-03T07:44:50.331Z'},Amendment to Medium Terms of Service Applicabl...,"{'name': None, 'url': 'https://medium.com/@Med...",14,3.49651
2,https://medium.com/@aelcenganda/%E9%96%A9%E6%9...,"Yun-Chen Chien（簡韻真） Nobody in @g0v.tw, PM in s...",{'$date': '2017-02-05T13:08:17.410Z'},走入山與海之間：閩東大刀會和兩岸走私 – Yun-Chen Chien（簡韻真） – Medium,"{'name': None, 'url': 'https://medium.com/@ael...",19,0.69315
3,https://medium.com/what-comes-to-mind/how-fast...,Vaibhav Khulbe Android App Developer | I write...,{'$date': '2017-05-06T08:16:30.776Z'},How fast can a camera get? – What comes to min...,"{'name': None, 'url': 'https://medium.com/@vai...",22,1.38629
4,https://medium.com/what-comes-to-mind/a-game-f...,Vaibhav Khulbe Android App Developer | I write...,{'$date': '2017-06-04T14:46:25.772Z'},A game for the lonely fox – What comes to mind...,"{'name': None, 'url': 'https://medium.com/@vai...",29,1.94591


#### Creating the columns date and time and preprocessing author

In [27]:
def pub_to_time(publicate):
    return publicate[11:-5]

def pub_to_date(publicate):
    return publicate[:10] 

def time_to_sec(time_s):
    return ((int(time_s[:2])* 60 * 60) + (int(time_s[3:5])) * 60 + (int(time_s[6:])))

# may be needed other division
def sec_to_tofd(time_sec):
    x = 3600
    if(time_sec > 18 * x):
        return 'E'
    elif(time_sec > 12 * x):
        return 'D'
    elif(time_sec > 6 * x):
        return 'M'
    else:
        return 'N'
    
def day_of_week(d):
    days = date(int(d[:4]), int(d[5:7]), int(d[8:]))
    return(days.weekday())

def is_weekend(d):
    if (d > 4):
        return True
    return False

def twitter_data(author_data):
    return author_data['twitter']

def url_data(author_data):
    return author_data['url']

def publish_to_str(publ):
    return list(publ.values())[0]

In [28]:
df_all_data['published'] = df_all_data['published'].map(publish_to_str)
df_all_data['date'] = df_all_data['published'].map(pub_to_date)
df_all_data['time'] = df_all_data['published'].map(pub_to_time)
df_all_data['time_sec'] = df_all_data['time'].map(time_to_sec)
df_all_data['time_of_day'] = df_all_data['time_sec'].map(sec_to_tofd)
df_all_data['week_day'] = df_all_data['date'].map(day_of_week)
df_all_data['weekend'] = df_all_data['week_day'].map(is_weekend)
df_all_data['twitter'] = df_all_data['author'].map(twitter_data)
df_all_data['url'] = df_all_data['author'].map(url_data)

In [92]:
# Not here
feats = ['content', 'title', 'time_of_day', 'week_day', 'weekend', 'twitter', 'log_recommends']
df = df_all_data[feats]
#X_train, y_train = train_test_split(df, random_state=17)

In [36]:
df_all_data.head()

Unnamed: 0,_id,content,published,title,author,id,log_recommends,date,time,time_sec,time_of_day,week_day,weekend,twitter,url
0,https://medium.com/policy/medium-terms-of-serv...,"Medium Everyone’s stories and ideas Aug 13, 20...",2012-08-13T22:54:53.510Z,Medium Terms of Service – Medium Policy – Medium,"{'name': None, 'url': 'https://medium.com/@Med...",8,9.01201,2012-08-13,22:54:53,82493,E,0,False,@Medium,5212
1,https://medium.com/policy/amendment-to-medium-...,"Medium Everyone’s stories and ideas Aug 2, 201...",2015-08-03T07:44:50.331Z,Amendment to Medium Terms of Service Applicabl...,"{'name': None, 'url': 'https://medium.com/@Med...",14,3.49651,2015-08-03,07:44:50,27890,M,0,False,@Medium,5212
2,https://medium.com/@aelcenganda/%E9%96%A9%E6%9...,"Yun-Chen Chien（簡韻真） Nobody in @g0v.tw, PM in s...",2017-02-05T13:08:17.410Z,走入山與海之間：閩東大刀會和兩岸走私 – Yun-Chen Chien（簡韻真） – Medium,"{'name': None, 'url': 'https://medium.com/@ael...",19,0.69315,2017-02-05,13:08:17,47297,D,6,True,@aelcenganda,8240
3,https://medium.com/what-comes-to-mind/how-fast...,Vaibhav Khulbe Android App Developer | I write...,2017-05-06T08:16:30.776Z,How fast can a camera get? – What comes to min...,"{'name': None, 'url': 'https://medium.com/@vai...",22,1.38629,2017-05-06,08:16:30,29790,M,5,True,@vaibhav_khulbe,30748
4,https://medium.com/what-comes-to-mind/a-game-f...,Vaibhav Khulbe Android App Developer | I write...,2017-06-04T14:46:25.772Z,A game for the lonely fox – What comes to mind...,"{'name': None, 'url': 'https://medium.com/@vai...",29,1.94591,2017-06-04,14:46:25,53185,D,6,True,@vaibhav_khulbe,30748


In [31]:
df_all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62313 entries, 0 to 62312
Data columns (total 15 columns):
_id               62313 non-null object
content           62313 non-null object
published         62313 non-null object
title             62313 non-null object
author            62313 non-null object
id                62313 non-null int64
log_recommends    62313 non-null float64
date              62313 non-null object
time              62313 non-null object
time_sec          62313 non-null int64
time_of_day       62313 non-null object
week_day          62313 non-null int64
weekend           62313 non-null bool
twitter           49472 non-null object
url               62313 non-null object
dtypes: bool(1), float64(1), int64(3), object(10)
memory usage: 6.7+ MB


#### Creating bag of 'Bag of authors'

In [33]:
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)
df_all_data['url'] = label_encoder.fit_transform(df_all_data['url'])


In [49]:
df_all_data.url[df_all_data['url'] == 1]

14328    1
14329    1
32889    1
32890    1
Name: url, dtype: int64

In [51]:
author_encoded_categorical_columns = pd.DataFrame(onehot_encoder.fit_transform((df_all_data['url'].values).reshape(-1, 1)))
author_encoded_categorical_columns.head()

SyntaxError: invalid syntax (<ipython-input-51-84ed47bd712a>, line 2)

## Creating Tf-Idf 

- with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)


In [39]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_title = count_vect.fit_transform(df_all_data.title)

tf_transformer_title = TfidfTransformer(use_idf=False).fit(df_counts_title)
df_tf_title = tf_transformer_title.transform(df_counts_title)

tfidf_transformer = TfidfTransformer()
df_tfidf_title = tfidf_transformer.fit_transform(df_counts_title)
df_tfidf_title.shape

Wall time: 9.95 s


In [40]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_content = count_vect.fit_transform(df_all_data.content)

tf_transformer_content = TfidfTransformer(use_idf=False).fit(df_counts_content)
df_tf_content = tf_transformer_content.transform(df_counts_content)

tfidf_transformer = TfidfTransformer()
df_tfidf_content = tfidf_transformer.fit_transform(df_counts_content)
df_tfidf_content.shape

Wall time: 38min 13s


In [67]:
df_tfidf_content.shape, df_tfidf_title.shape, author_encoded_categorical_columns.shape

((62313, 100000), (62313, 100000), (62313, 32182))

Sparsing train and test data

In [None]:
x1 = np.vstack(author_encoded_categorical_columns)


In [None]:
x2 = np.vstack(df_tfidf_content)


In [None]:
x3 = np.vstack(df_tfidf_title)


In [61]:
%%time
X_train_sparse1 = hstack([author_encoded_categorical_columns,
                         df_tfidf_content]).tocsr()

Wall time: 1min 33s


In [66]:
X_train_sparse2 = hstack([df_tfidf_content, 
                          df_tfidf_title]).tocsr()

MemoryError: 

In [None]:
%%time
X_train_sparse = hstack([X_train_sparse1,
                         df_tfidf_content, df_all_data]).tocsr()