In [71]:
import os
import json
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge

In [72]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text."""

    def __init__(self):
        # Run the base-class initialiser instead of poking at its internals.
        # convert_charrefs=True makes the parser decode character references
        # (e.g. &amp;) before the text reaches handle_data.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text chunks, in document order

    def handle_data(self, d):
        """Remember one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected chunk joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    Character references are decoded (``&amp;`` becomes ``&``).

    Parameters
    ----------
    html : str
        Markup to clean; plain text without tags is returned unchanged.

    Returns
    -------
    str
        The concatenated text found outside of tags.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser may hold back trailing text
    # that contains an unterminated '&' (it waits for more input). close()
    # forces that buffered text through handle_data so it is not lost.
    s.close()
    return s.get_data()

In [73]:
def read_json_line(line=None):
    """Parse one line of JSON, blanking out characters the decoder rejects.

    Whenever ``json.loads`` fails, the character at the position reported by
    the decoder is replaced with a space and parsing is retried, until the
    line decodes.

    Parameters
    ----------
    line : str
        Text holding a single JSON document.

    Returns
    -------
    object
        The decoded JSON value.

    Raises
    ------
    json.JSONDecodeError
        If the line cannot be repaired: the reported position lies past the
        end of the text (e.g. truncated input) or already holds a space, so
        another replacement would change nothing.
    """
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # The decoder exposes the failing offset directly; no need to
            # parse it out of the error message.
            bad_pos = err.pos
            # Give up when blanking the character cannot make progress,
            # instead of failing with an IndexError or looping forever.
            if bad_pos >= len(line) or line[bad_pos] == ' ':
                raise
            line = line[:bad_pos] + ' ' + line[bad_pos + 1:]

In [87]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True,
                               light_limit=5000):
    """Split a JSON-lines file into one text file per feature.

    For each line of the input, the values stored under 'content',
    'published', 'title' and 'author' are written to
    ``<prefix>_<feature>.txt``. The first ``light_limit`` records are
    additionally written to ``<prefix>_<feature>_light.txt``. Every record is
    the ``str()`` of a one-entry dict ``{row_number: value}`` followed by a
    newline, so the outputs can be read back line by line.

    Parameters
    ----------
    path_to_data : str
        Directory containing the input file; outputs are created there too.
    inp_filename : str
        Name of the input file, one JSON document per line.
    is_train : bool, default True
        Selects the output prefix: 'train' when true, 'test' otherwise.
    light_limit : int, default 5000
        Number of leading records copied into the ``*_light.txt`` files.
    """
    features = ['content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'

    opened = []  # every output handle, so that all of them get closed below
    try:
        feature_files = []
        feature_light_files = []
        for feat in features:
            for suffix, bucket in (('', feature_files),
                                   ('_light', feature_light_files)):
                handle = open(os.path.join(path_to_data,
                                           '{}_{}{}.txt'.format(prefix, feat, suffix)),
                              'w', encoding="utf-8")
                opened.append(handle)
                bucket.append(handle)

        with open(os.path.join(path_to_data, inp_filename),
                  encoding='utf-8') as inp_json_file:
            for ident, line in enumerate(tqdm_notebook(inp_json_file)):
                json_data = read_json_line(line)
                for feat, full_file, light_file in zip(features,
                                                       feature_files,
                                                       feature_light_files):
                    # One record per line; without the newline all records
                    # run together into a single unreadable line.
                    record = str({ident: json_data[feat]}) + '\n'
                    full_file.write(record)
                    if ident < light_limit:
                        light_file.write(record)
    finally:
        # Close (and thereby flush) the outputs even if a line fails to parse.
        for handle in opened:
            handle.close()

In [88]:
# Raw string: the backslashes of the Windows path must not be read as escape sequences.
PATH_TO_DATA = r'C:\mlcourse\Databases\kaggle_medium'

In [89]:
# Write the per-feature text files for the training set.
extract_features_and_write(
    path_to_data=PATH_TO_DATA,
    inp_filename='train.json',
    is_train=True,
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [90]:
# Write the per-feature text files for the test set.
extract_features_and_write(
    path_to_data=PATH_TO_DATA,
    inp_filename='test.json',
    is_train=False,
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour; whether the article was published in the morning, during the day, or at night; and whether it was published on a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

In [98]:
# Open every "light" feature file for reading: the train files first, then the test files.
features = ['content', 'published', 'title', 'author']
prefix = ['train', 'test']
files = []
for i in prefix:
    files.extend(
        open(os.path.join(PATH_TO_DATA, '{}_{}_light.txt'.format(i, feat)),
             'r', encoding="utf-8")
        for feat in features
    )

In [None]:
# Read the "light" training files into one list of raw lines per feature.
# NOTE(review): only the train files are loaded and each line is kept as raw
# text (no parsing of the stored records) -- confirm this is the intended input
# for the DataFrame before building on it.
df = {}  # will be dataframe
for feat in features:
    light_path = os.path.join(PATH_TO_DATA, 'train_{}_light.txt'.format(feat))
    with open(light_path, encoding='utf-8') as inp_file:
        df[feat] = [line.rstrip('\n') for line in tqdm_notebook(inp_file)]

In [99]:
# Display the list of file handles collected in `files`.
files

[<_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\train_content_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\train_published_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\train_title_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\train_author_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\test_content_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\test_published_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\test_title_light.txt' mode='r' encoding='utf-8'>,
 <_io.TextIOWrapper name='C:\\mlcourse\\Databases\\kaggle_medium\\test_author_light.txt' mode='r' encoding='utf-8'>]