Competition on Kaggle: https://www.kaggle.com/c/how-good-is-your-medium-article/leaderboard

In [1]:
import ast
import codecs
import json
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook

In [92]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Feed HTML into an instance with ``feed()`` and read the accumulated text
    back with ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initialiser so every piece of parser state is set
        # up by HTMLParser itself (it also calls reset()), instead of relying
        # on reset() alone to create whatever attributes the parser needs.
        super().__init__(convert_charrefs=True)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks joined by single spaces."""
        return ' '.join(self.fed)

def strip_tags(html):
    """Return the text of an HTML string with all tags removed.

    Args:
        html: The HTML markup to strip.

    Returns:
        The text chunks found by MLStripper, joined by single spaces.
    """
    s = MLStripper()
    s.feed(html)
    # The parser may hold back trailing text (for example after an
    # unterminated '&') while waiting for more input; close() forces that
    # buffered data to be processed so it is not silently dropped.
    s.close()
    return s.get_data()

In [3]:
def read_json_line(line=None):
    """Parse one line of JSON, blanking out characters that break the parser.

    When ``json.loads`` rejects the line, the character at the reported error
    position is replaced with a space and parsing is retried, until the line
    parses or no further repair is possible.

    Args:
        line: The JSON text to parse (a ``str``).

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the line cannot be repaired, i.e. the error
            position is past the end of the line or already holds a space.
    """
    chars = None
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # Use the structured error position rather than parsing the
            # human-readable message text.
            pos = err.pos
            if chars is None:
                chars = list(line)
            # Bail out when blanking cannot make progress; every retry removes
            # one non-space character, so the loop is guaranteed to terminate.
            if pos >= len(chars) or chars[pos] == ' ':
                raise
            chars[pos] = ' '
            line = ''.join(chars)

In [4]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True, light_limit=5000):
    """Split a JSON-lines dump into one text file per feature.

    For each of the features 'content', 'published', 'title' and 'author' two
    files are written into ``path_to_data``: ``<prefix>_<feature>.txt`` with
    every record and ``<prefix>_<feature>_light.txt`` with only the first
    ``light_limit`` records. Each output line is ``str({row_id: value})``,
    where ``row_id`` is the zero-based line number in the input file.

    Args:
        path_to_data: Directory holding the input file and receiving outputs.
        inp_filename: Name of the JSON-lines input file inside that directory.
        is_train: Selects the output prefix, 'train' if true, else 'test'.
        light_limit: Number of leading records copied into the light files.
    """
    features = ['content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'

    def _open_output(feat, suffix):
        # Open one output file for writing; suffix is '' or '_light'.
        out_path = os.path.join(path_to_data,
                                '{}_{}{}.txt'.format(prefix, feat, suffix))
        return open(out_path, 'w', encoding="utf-8")

    # Track every handle as soon as it is opened so that the finally clause
    # closes all of them even if a later open() or a bad record raises.
    opened = []
    try:
        feature_files = []
        for feat in features:
            handle = _open_output(feat, '')
            opened.append(handle)
            feature_files.append(handle)

        feature_light_files = []
        for feat in features:
            handle = _open_output(feat, '_light')
            opened.append(handle)
            feature_light_files.append(handle)

        with open(os.path.join(path_to_data, inp_filename),
                  encoding='utf-8') as inp_json_file:
            for ident, line in enumerate(tqdm_notebook(inp_json_file)):
                json_data = read_json_line(line)
                for feat, full_file, light_file in zip(features,
                                                       feature_files,
                                                       feature_light_files):
                    record = str({ident: json_data[feat]}) + '\n'
                    full_file.write(record)
                    if ident < light_limit:
                        light_file.write(record)
    finally:
        for handle in opened:
            handle.close()

In [5]:
# Raw string: the backslashes are Windows path separators, not escape sequences.
PATH_TO_DATA = r'C:\mlcourse\Databases\kaggle_medium'

In [6]:
# Write the per-feature text files for the training set.
extract_features_and_write(path_to_data=PATH_TO_DATA,
                           inp_filename='train.json',
                           is_train=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
# Write the per-feature text files for the test set.
extract_features_and_write(path_to_data=PATH_TO_DATA,
                           inp_filename='test.json',
                           is_train=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour, whether it's morning, day, night, whether it's a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

In [20]:
# Paths of the "light" per-feature training files, in the order of `features`.
prefix = 'train'
features = ['content', 'published', 'title', 'author']

l_files = [
    os.path.join(PATH_TO_DATA, light_name)
    for light_name in ('{}_{}_light.txt'.format(prefix, feat)
                       for feat in features)
]

In [148]:
%%time
# Build one DataFrame column per feature from the light text files.
df = pd.DataFrame({})
for feat, path in zip(features, l_files):
    # Every line holds the repr of a one-entry dict {row_id: value}.
    # literal_eval parses it without executing arbitrary code, and strip()
    # removes the line terminator whether it is '\n' or '\r\n' (slicing off a
    # fixed two characters cut into the data on '\n'-terminated files).
    with codecs.open(path, "r", "utf-8") as f:
        arr = [next(iter(ast.literal_eval(line.strip()).values()))
               for line in f]
    df[feat] = pd.Series(arr)
# Article bodies are HTML; keep only their text.
df['content'] = df['content'].map(strip_tags)

Wall time: 31.5 s


## Strip special whitespace characters from content: `\xa0` (non-breaking space), `\u200b` (zero-width space)

## Split publications by hour, by time of day (morning, day, night), and by whether it's a weekend


In [150]:
# Each 'published' entry is a one-item dict wrapping the timestamp; unwrap it.
# A single column assignment avoids chained indexing (df[col][i] = ...), and
# the isinstance guard makes the cell safe to re-run on already-unwrapped data.
df['published'] = df['published'].map(
    lambda value: next(iter(value.values())) if isinstance(value, dict) else value
)

In [152]:
# Inspect the unwrapped publication timestamps.
df.loc[:, 'published']

0       2012-08-13T22:54:53.510Z
1       2015-08-03T07:44:50.331Z
2       2017-02-05T13:08:17.410Z
3       2017-05-06T08:16:30.776Z
4       2017-06-04T14:46:25.772Z
5       2017-04-02T16:21:15.171Z
6       2016-08-15T04:16:02.103Z
7       2015-01-14T21:31:07.568Z
8       2014-02-11T04:11:54.771Z
9       2015-10-25T02:58:05.551Z
10      2016-08-15T15:31:13.601Z
11      2016-08-09T21:01:06.303Z
12      2016-09-08T15:47:57.336Z
13      2016-09-30T18:05:35.950Z
14      2017-06-27T15:49:22.909Z
15      2015-07-13T06:52:44.618Z
16      2017-05-01T13:22:43.785Z
17      2016-08-31T17:11:24.263Z
18      2017-06-30T07:55:55.103Z
19      2016-12-13T23:29:35.556Z
20      2016-01-27T22:19:05.027Z
21      2016-12-14T01:15:02.122Z
22      2016-09-05T22:02:40.326Z
23      2016-12-13T17:59:40.527Z
24      2017-05-02T17:28:39.120Z
25      2016-08-30T23:43:24.940Z
26      2017-04-26T02:50:29.511Z
27      2016-06-18T06:54:10.331Z
28      2016-05-17T17:52:00.960Z
29      2017-04-17T16:29:28.306Z
          

## Creating Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)

In [53]:
# Count unigrams and bigrams in the titles, with the vocabulary capped at
# 100000 entries.
count_vect = CountVectorizer(max_features=100000, ngram_range=(1, 2))
X_train_counts = count_vect.fit_transform(df['title'])
X_train_counts.shape

(62313, 100000)

In [31]:
# Term-frequency weighting only (IDF switched off) of the title counts.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(62313, 66033)

In [32]:
# Full TF-IDF weighting of the title counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(62313, 66033)

In [None]:
# NOTE(review): this cell has no execution count and uses names that are not
# defined or imported in the visible part of this notebook (`twenty_train`,
# `fetch_20newsgroups`, `categories`, `SGDClassifier`). It looks like scratch
# code for a text-classification example rather than part of the Medium
# pipeline; confirm whether it should be completed or removed.

# Fit a naive Bayes classifier on the TF-IDF matrix, then classify two
# hand-written documents and print the predicted category name for each.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
    
# Same three steps (counts -> tf-idf -> naive Bayes) chained in one Pipeline,
# fitted on `twenty_train` and scored by accuracy on a test subset.
text_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
twenty_test = fetch_20newsgroups(subset='test',
     categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)    

# Same pipeline with a hinge-loss SGD classifier in place of naive Bayes.
# NOTE(review): `n_iter` is not accepted by newer scikit-learn releases
# (replaced by `max_iter`/`tol`) -- confirm against the installed version.
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
    ])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)