Competition on Kaggle: https://www.kaggle.com/c/how-good-is-your-medium-article/leaderboard

In [13]:
import os
import codecs
import json
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [3]:
def read_json_line(line=None):
    result = None
    try:        
        result = json.loads(line)
    except Exception as e:      
        # Find the offending character index:
        idx_to_replace = int(str(e).split(' ')[-1].replace(')',''))      
        # Remove the offending character:
        new_line = list(line)
        new_line[idx_to_replace] = ' '
        new_line = ''.join(new_line)     
        return read_json_line(line=new_line)
    return result

In [63]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True):
    ident = 0
    features = ['content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'
    feature_files = [open(os.path.join(path_to_data,
                                       '{}_{}.txt'.format(prefix, feat)),
                          'w', encoding="utf-8")
                     for feat in features]
    feature_light_files = [open(os.path.join(path_to_data,
                                       '{}_{}_light.txt'.format(prefix, feat)),
                          'w', encoding="utf-8")                     
                     for feat in features]
    
    with open(os.path.join(path_to_data, inp_filename), 
              encoding='utf-8') as inp_json_file:
    
        for line in tqdm_notebook(inp_json_file):
            dict_data = {}
            json_data = read_json_line(line)
            for i in range(len(feature_files)):
                dict_data[ident] = json_data[features[i]]
                feature_files[i].write(str(dict_data))
                if (ident < 5000):
                    feature_light_files[i].write(str(dict_data) + "/n")
            ident += 1
    for i in feature_light_files:
        i.close()
    for i in feature_files:
        i.close()

In [64]:
PATH_TO_DATA = 'C:\mlcourse\Databases\kaggle_medium'

In [None]:
extract_features_and_write(PATH_TO_DATA, 'train.json', is_train=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
extract_features_and_write(PATH_TO_DATA, 'test.json', is_train=False)

## Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour, whether it's morning, day, night, whether it's a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

In [7]:
prefix = 'train'
features = ['content', 'published', 'title', 'author']

files = [os.path.join(PATH_TO_DATA,
                                       '{}_{}.txt'.format(prefix, feat))                                         
                     for feat in features]

In [34]:
%%time
df = pd.DataFrame({})
for k in range(1, len(features)):
    array = ''
    f = codecs.open(files[k], "r", "utf-8")
    for file in f:
        array += file
        arr = array.split("}{")
    arr[0] = arr[0][1:]
    arr[-1] = arr[-1][:-1]
    for i in range(len(arr)):
        arr[i] = "{" + arr[i] + "}"
    data = []
    for i in range(len(arr)):
        data.append(list(eval(arr[i]).values())[0])
    ser_buf = pd.Series(data)
    df[features[k]] = ser_buf

Wall time: 12.5 s


In [61]:
%%time

f = codecs.open(files[0], "r", "utf-8")
array = ''
arr = []
for file in f:
    array += file
    arr += array.split("}{")
arr[0] = arr[0][1:]
arr[-1] = arr[-1][:-1]

X_train_counts = count_vect.fit_transform(df.title)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

## Creating Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)

In [28]:
df.head()

Unnamed: 0,published,title,author
0,{'$date': '2012-08-13T22:54:53.510Z'},Medium Terms of Service – Medium Policy – Medium,"{'name': None, 'url': 'https://medium.com/@Med..."
1,{'$date': '2015-08-03T07:44:50.331Z'},Amendment to Medium Terms of Service Applicabl...,"{'name': None, 'url': 'https://medium.com/@Med..."
2,{'$date': '2017-02-05T13:08:17.410Z'},走入山與海之間：閩東大刀會和兩岸走私 – Yun-Chen Chien（簡韻真） – Medium,"{'name': None, 'url': 'https://medium.com/@ael..."
3,{'$date': '2017-05-06T08:16:30.776Z'},How fast can a camera get? – What comes to min...,"{'name': None, 'url': 'https://medium.com/@vai..."
4,{'$date': '2017-06-04T14:46:25.772Z'},A game for the lonely fox – What comes to mind...,"{'name': None, 'url': 'https://medium.com/@vai..."


In [53]:
count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
X_train_counts = count_vect.fit_transform(df.title)
X_train_counts.shape

(62313, 100000)

In [31]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(62313, 66033)

In [32]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(62313, 66033)

In [None]:
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
    
text_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
twenty_test = fetch_20newsgroups(subset='test',
     categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)    

text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
    ])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)