Competition on Kaggle: https://www.kaggle.com/c/how-good-is-your-medium-article/leaderboard

In [46]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.


In [84]:
import os
import codecs
import json
import seaborn as sns
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from datetime import date

In [48]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [49]:
def read_json_line(line=None):
    result = None
    try:        
        result = json.loads(line)
    except Exception as e:      
        # Find the offending character index:
        idx_to_replace = int(str(e).split(' ')[-1].replace(')',''))      
        # Remove the offending character:
        new_line = list(line)
        new_line[idx_to_replace] = ' '
        new_line = ''.join(new_line)     
        return read_json_line(line=new_line)
    return result

In [50]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True):
    ident = 0
    features = ['_id', 'content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'
    feature_files = [open(os.path.join(path_to_data,
                                       '{}_{}.txt'.format(prefix, feat)),
                          'w', encoding="utf-8")
                     for feat in features]
 
    with open(os.path.join(path_to_data, inp_filename), 
              encoding='utf-8') as inp_json_file:
    
        for line in tqdm_notebook(inp_json_file):
            dict_data = {}
            json_data = read_json_line(line)
            for i in range(len(feature_files)):
                dict_data[ident] = json_data[features[i]]
                feature_files[i].write(str(dict_data) + '\n')
            ident += 1
    for i in feature_files:
        i.close()

In [55]:
PATH_TO_DATA = 'C:\mlcourse\Databases\kaggle_medium'

In [56]:
extract_features_and_write(PATH_TO_DATA, 'train.json', is_train=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [57]:
extract_features_and_write(PATH_TO_DATA, 'test.json', is_train=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour, whether it's morning, day, night, whether it's a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

#### Extacting data

In [58]:
prefix = 'train'
features = ['_id', 'content', 'published', 'title', 'author']

l_files = [os.path.join(PATH_TO_DATA,
                                       '{}_{}.txt'.format(prefix, feat))   
                     for feat in features]

In [None]:
%%time
df_all_data = pd.DataFrame({})
for k in range(0, len(features)):
    f = codecs.open(l_files[k], "r", "utf-8")
    arr = []
    count = 0
    for line in f:
        count += 1
        arr.append(list(eval(line[:-2]).values())[0])
    f.close()
    ser_buf = pd.Series(arr)
    df_all_data[features[k]] = ser_buf
df_all_data['content'] = df_all_data['content'].map(strip_tags)

Adding a popularity column

In [None]:
data = pd.read_csv('C:\mlcourse\Databases\kaggle_medium/train_log1p_recommends.csv', sep=',')

In [None]:
df_all_data = df_all_data.join(data)

## Strip content: \xa0, \u200b

In [None]:
dddata = df_all_data.copy(deep=True)

In [122]:
df_all_data = dddata.copy(deep=True)

## Division publications by hour, whether it's morning, day, night, whether it's a weekend


In [125]:
df_all_data.head()

Unnamed: 0,_id,content,published,title,author,id,log_recommends,date,time,time_sec,time_of_day,week_day,weekend,twitter
0,https://medium.com/policy/medium-terms-of-serv...,"Medium Everyone’s stories and ideas Aug 13, 20...",2012-08-13T22:54:53.510Z,Medium Terms of Service – Medium Policy – Medium,"{'name': None, 'url': 'https://medium.com/@Med...",8,9.01201,2012-08-13,22:54:53,82493,E,0,False,@Medium
1,https://medium.com/policy/amendment-to-medium-...,"Medium Everyone’s stories and ideas Aug 2, 201...",2015-08-03T07:44:50.331Z,Amendment to Medium Terms of Service Applicabl...,"{'name': None, 'url': 'https://medium.com/@Med...",14,3.49651,2015-08-03,07:44:50,27890,M,0,False,@Medium
2,https://medium.com/@aelcenganda/%E9%96%A9%E6%9...,"Yun-Chen Chien（簡韻真） Nobody in @g0v.tw, PM in s...",2017-02-05T13:08:17.410Z,走入山與海之間：閩東大刀會和兩岸走私 – Yun-Chen Chien（簡韻真） – Medium,"{'name': None, 'url': 'https://medium.com/@ael...",19,0.69315,2017-02-05,13:08:17,47297,D,6,True,@aelcenganda
3,https://medium.com/what-comes-to-mind/how-fast...,Vaibhav Khulbe Android App Developer | I write...,2017-05-06T08:16:30.776Z,How fast can a camera get? – What comes to min...,"{'name': None, 'url': 'https://medium.com/@vai...",22,1.38629,2017-05-06,08:16:30,29790,M,5,True,@vaibhav_khulbe
4,https://medium.com/what-comes-to-mind/a-game-f...,Vaibhav Khulbe Android App Developer | I write...,2017-06-04T14:46:25.772Z,A game for the lonely fox – What comes to mind...,"{'name': None, 'url': 'https://medium.com/@vai...",29,1.94591,2017-06-04,14:46:25,53185,D,6,True,@vaibhav_khulbe


#### Creating the columns date and time and preprocessing author

In [123]:
def pub_to_time(publicate):
    return publicate[11:-5]

def pub_to_date(publicate):
    return publicate[:10] 

def time_to_sec(time_s):
    return ((int(time_s[:2])* 60 * 60) + (int(time_s[3:5])) * 60 + (int(time_s[6:])))

# may be needed other division
def sec_to_tofd(time_sec):
    x = 3600
    if(time_sec > 18 * x):
        return 'E'
    elif(time_sec > 12 * x):
        return 'D'
    elif(time_sec > 6 * x):
        return 'M'
    else:
        return 'N'
    
def day_of_week(d):
    days = date(int(d[:4]), int(d[5:7]), int(d[8:]))
    return(days.weekday())

def is_weekend(d):
    if (d > 4):
        return True
    return False

def twitter_data(author_data):
    return author['twitter']

def twitter_data(url_data):
    return author['url']

def publish_to_str(publ):
    return list(publ.values())[0]

In [124]:
df_all_data['published'] = df_all_data['published'].map(publish_to_str)
df_all_data['date'] = df_all_data['published'].map(pub_to_date)
df_all_data['time'] = df_all_data['published'].map(pub_to_time)
df_all_data['time_sec'] = df_all_data['time'].map(time_to_sec)
df_all_data['time_of_day'] = df_all_data['time_sec'].map(sec_to_tofd)
df_all_data['week_day'] = df_all_data['date'].map(day_of_week)
df_all_data['weekend'] = df_all_data['week_day'].map(is_weekend)
df_all_data['twitter'] = df_all_data['author'].map(twitter_data)
df_all_data['url'] = df_all_data['author'].map(url_data)

AttributeError: 'str' object has no attribute 'values'

In [92]:
# Not here
feats = ['content', 'title', 'time_of_day', 'week_day', 'weekend', 'twitter', 'log_recommends']
df = df_all_data[feats]
#X_train, y_train = train_test_split(df, random_state=17)

In [118]:
len(df_all_data[df_all_data['twitter'] == np.nan])

0

In [119]:
df_all_data

Unnamed: 0,_id,content,published,title,author,id,log_recommends,date,time,time_sec,time_of_day,week_day,weekend,twitter
0,https://medium.com/policy/medium-terms-of-serv...,"Medium Everyone’s stories and ideas Aug 13, 20...",2012-08-13T22:54:53.510Z,Medium Terms of Service – Medium Policy – Medium,"{'name': None, 'url': 'https://medium.com/@Med...",8,9.01201,2012-08-13,22:54:53,82493,E,0,False,@Medium
1,https://medium.com/policy/amendment-to-medium-...,"Medium Everyone’s stories and ideas Aug 2, 201...",2015-08-03T07:44:50.331Z,Amendment to Medium Terms of Service Applicabl...,"{'name': None, 'url': 'https://medium.com/@Med...",14,3.49651,2015-08-03,07:44:50,27890,M,0,False,@Medium
2,https://medium.com/@aelcenganda/%E9%96%A9%E6%9...,"Yun-Chen Chien（簡韻真） Nobody in @g0v.tw, PM in s...",2017-02-05T13:08:17.410Z,走入山與海之間：閩東大刀會和兩岸走私 – Yun-Chen Chien（簡韻真） – Medium,"{'name': None, 'url': 'https://medium.com/@ael...",19,0.69315,2017-02-05,13:08:17,47297,D,6,True,@aelcenganda
3,https://medium.com/what-comes-to-mind/how-fast...,Vaibhav Khulbe Android App Developer | I write...,2017-05-06T08:16:30.776Z,How fast can a camera get? – What comes to min...,"{'name': None, 'url': 'https://medium.com/@vai...",22,1.38629,2017-05-06,08:16:30,29790,M,5,True,@vaibhav_khulbe
4,https://medium.com/what-comes-to-mind/a-game-f...,Vaibhav Khulbe Android App Developer | I write...,2017-06-04T14:46:25.772Z,A game for the lonely fox – What comes to mind...,"{'name': None, 'url': 'https://medium.com/@vai...",29,1.94591,2017-06-04,14:46:25,53185,D,6,True,@vaibhav_khulbe
5,https://medium.com/@pettykate/now-thats-comedy...,Kate Reed Petty i like you. pettykate.com Apr ...,2017-04-02T16:21:15.171Z,Now That’s Comedy – Kate Reed Petty – Medium,"{'name': None, 'url': 'https://medium.com/@pet...",47,2.07944,2017-04-02,16:21:15,58875,D,6,True,@PettyKate
6,https://medium.com/@exedre/che-fine-ha-fatto-g...,exedre author ● about.me/exedre ● esomma@ieee....,2016-08-15T04:16:02.103Z,Che fine ha fatto «Gola Profonda»? – exedre – ...,"{'name': None, 'url': 'https://medium.com/@exe...",52,1.79176,2016-08-15,04:16:02,15362,N,0,False,@exedre
7,https://medium.com/interactive-mind/airbnb-gue...,Raghav Haran Helping ambitious people build ca...,2015-01-14T21:31:07.568Z,Airbnb Guerilla Usability Testing – Interactiv...,"{'name': None, 'url': 'https://medium.com/@Rag...",90,7.00397,2015-01-14,21:31:07,77467,E,2,False,@RaghavHaran
8,https://medium.com/bridge-collection/a-guerill...,Francine Lee UX + Visual: http://francine.io F...,2014-02-11T04:11:54.771Z,A Guerilla Usability Test on Dropbox Photos – ...,"{'name': None, 'url': 'https://medium.com/@___...",91,7.17089,2014-02-11,04:11:54,15114,N,1,False,@___fl
9,https://medium.com/the-mission/how-to-get-any-...,Raghav Haran Helping ambitious people build ca...,2015-10-25T02:58:05.551Z,How to Get Any Job You Want (even if you’re un...,"{'name': None, 'url': 'https://medium.com/@Rag...",98,8.64840,2015-10-25,02:58:05,10685,N,6,True,@RaghavHaran


#### Creating bag of 'Bag of authors'

In [99]:
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)
df['twitter'] = label_encoder.fit_transform(df['twitter'])


TypeError: '<' not supported between instances of 'NoneType' and 'str'

In [None]:
author_encoded_categorical_columns = pd.DataFrame(onehot_encoder.fit_transform(df['twitter']))
author_encoded_categorical_columns.head()

## Creating Tf-Idf 

- with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)


In [134]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_title = count_vect.fit_transform(df.title)

tf_transformer_title = TfidfTransformer(use_idf=False).fit(df_counts_title)
df_tf_title = tf_transformer_title.transform(df_counts_title)

tfidf_transformer = TfidfTransformer()
df_tfidf_title = tfidf_transformer.fit_transform(df_counts_title)
df_tfidf_title.shape

Wall time: 6.38 s


In [135]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_content = count_vect.fit_transform(df.content)

tf_transformer_content = TfidfTransformer(use_idf=False).fit(df_counts_content)
df_tf_content = tf_transformer_content.transform(df_counts_content)

tfidf_transformer = TfidfTransformer()
df_tfidf_content = tfidf_transformer.fit_transform(df_counts_content)
df_tfidf_content.shape

Wall time: 1h 27min 30s
Parser   : 191 ms

Sparsing train and test data

In [None]:
X_train_sparse = hstack([author_encoded_categorical_columns, df_tfidf_title,
                         df_tfidf_content, 
                         df]).tocsr()