Competition on Kaggle: https://www.kaggle.com/c/how-good-is-your-medium-article/leaderboard

In [1]:
!pip install tqdm



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import os
import codecs
import json
import seaborn as sns
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LassoLarsCV

from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from datetime import date



In [2]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ' '.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [3]:
def read_json_line(line=None):
    result = None
    try:        
        result = json.loads(line)
    except Exception as e:      
        # Find the offending character index:
        idx_to_replace = int(str(e).split(' ')[-1].replace(')',''))      
        # Remove the offending character:
        new_line = list(line)
        new_line[idx_to_replace] = ' '
        new_line = ''.join(new_line)     
        return read_json_line(line=new_line)
    return result

In [4]:
def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True):
    ident = 0
    features = ['_id', 'content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'
    feature_files = [open(os.path.join(path_to_data,
                                       '{}_{}.txt'.format(prefix, feat)),
                          'w', encoding="utf-8")
                     for feat in features]
 
    with open(os.path.join(path_to_data, inp_filename), 
              encoding='utf-8') as inp_json_file:
    
        for line in tqdm_notebook(inp_json_file):
            dict_data = {}
            json_data = read_json_line(line)
            for i in range(len(feature_files)):
                dict_data[ident] = json_data[features[i]]
                feature_files[i].write(str(dict_data) + '\n')
            ident += 1
    for i in feature_files:
        i.close()

In [5]:
PATH_TO_DATA = 'C:\mlcourse\Databases\kaggle_medium'

In [6]:
extract_features_and_write(PATH_TO_DATA, 'train.json', is_train=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
extract_features_and_write(PATH_TO_DATA, 'test.json', is_train=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Add the following groups of features:

- Tf-Idf with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Tf-Idf with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- Time features: publication hour, whether it's morning, day, night, whether it's a weekend
- Bag of authors (i.e. One-Hot-Encoded author names)

#### Extacting data

In [8]:
features = ['_id', 'content', 'published', 'title', 'author']

train_files = [os.path.join(PATH_TO_DATA,
                                       '{}_{}.txt'.format('train', feat))   
                     for feat in features]

test_files = [os.path.join(PATH_TO_DATA,
                                       '{}_{}.txt'.format('test', feat))   
                     for feat in features]

In [None]:
%%time
df_all_data = pd.DataFrame({})
for k in range(0, len(features)):
    f = codecs.open(train_files[k], "r", "utf-8")
    arr = []
    count = 0
    for line in f:
        count += 1
        arr.append(list(eval(line[:-2]).values())[0])
    f.close()
    ser_buf = pd.Series(arr)
    df_all_data[features[k]] = ser_buf

In [None]:
%%time
df_all_test = pd.DataFrame({})
for k in range(0, len(features)):
    f = codecs.open(test_files[k], "r", "utf-8")
    arr = []
    count = 0
    for line in f:
        count += 1
        arr.append(list(eval(line[:-2]).values())[0])
    f.close()
    ser_buf = pd.Series(arr)
    df_all_test[features[k]] = ser_buf

In [None]:
df_all_data['content'] = df_all_data['content'].map(strip_tags)
df_all_test['content'] = df_all_test['content'].map(strip_tags)


Adding a popularity column

In [None]:
data = pd.read_csv('C:\mlcourse\Databases\kaggle_medium/train_log1p_recommends.csv', sep=',')

In [None]:
df_all_data = df_all_data.join(data)

## Strip content: \xa0, \u200b

In [None]:
dddata = df_all_data.copy(deep=True)
dddata_test = df_all_test.copy(deep=True)

In [None]:
df_all_data = dddata.copy(deep=True)

## Division publications by hour, whether it's morning, day, night, whether it's a weekend


In [None]:
df_all_data.head()

#### Creating the columns date and time and preprocessing author

In [None]:
def pub_to_time(publicate):
    return publicate[11:-5]

def pub_to_date(publicate):
    return publicate[:10] 

def time_to_sec(time_s):
    return ((int(time_s[:2])* 60 * 60) + (int(time_s[3:5])) * 60 + (int(time_s[6:])))

# may be needed other division
def sec_to_tofd(time_sec):
    x = 3600
    if(time_sec > 18 * x):
        return 'E'
    elif(time_sec > 12 * x):
        return 'D'
    elif(time_sec > 6 * x):
        return 'M'
    else:
        return 'N'
    
def day_of_week(d):
    days = date(int(d[:4]), int(d[5:7]), int(d[8:]))
    return(days.weekday())

def is_weekend(d):
    if (d > 4):
        return True
    return False

def twitter_data(author_data):
    return author_data['twitter']

def url_data(author_data):
    return author_data['url']

def publish_to_str(publ):
    return list(publ.values())[0]

In [None]:
df_all_data['published'] = df_all_data['published'].map(publish_to_str)
df_all_data['date'] = df_all_data['published'].map(pub_to_date)
df_all_data['time'] = df_all_data['published'].map(pub_to_time)
df_all_data['time_sec'] = df_all_data['time'].map(time_to_sec)
df_all_data['time_of_day'] = df_all_data['time_sec'].map(sec_to_tofd)
df_all_data['week_day'] = df_all_data['date'].map(day_of_week)
df_all_data['weekend'] = df_all_data['week_day'].map(is_weekend)
df_all_data['twitter'] = df_all_data['author'].map(twitter_data)
df_all_data['url'] = df_all_data['author'].map(url_data)
df_all_data['cont_len']= df_all_data['content'].map(len)
df_all_data['title_len']= df_all_data['title'].map(len)

df_all_test['published'] = df_all_test['published'].map(publish_to_str)
df_all_test['date'] = df_all_test['published'].map(pub_to_date)
df_all_test['time'] = df_all_test['published'].map(pub_to_time)
df_all_test['time_sec'] = df_all_test['time'].map(time_to_sec)
df_all_test['time_of_day'] = df_all_test['time_sec'].map(sec_to_tofd)
df_all_test['week_day'] = df_all_test['date'].map(day_of_week)
df_all_test['weekend'] = df_all_test['week_day'].map(is_weekend)
df_all_test['twitter'] = df_all_test['author'].map(twitter_data)
df_all_test['url'] = df_all_test['author'].map(url_data)
df_all_test['cont_len']= df_all_test['content'].map(len)
df_all_test['title_len']= df_all_test['title'].map(len)

In [None]:
# Not here
feats = ['time_of_day', 'week_day','cont_len', 'title_len', 'log_recommends']
df = df_all_data[feats]
df_test = df_all_test[feats[:-1]]
#X_train, y_train = train_test_split(df, random_state=17)

In [None]:
df.head()

#### Creating 'Bag of authors'

In [None]:
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)
labels_authors = label_encoder.fit(df_all_data['url'])
df_all_data['url'] = label_encoder.transform(df_all_data['url'])
df_all_test['url'] = label_encoder.transform(df_all_test['url'])


In [102]:
author_encoded_cols_test = pd.DataFrame(onehot_encoder.fit_transform((df_all_test['url'].values).reshape(-1, 1)))

MemoryError: 

In [99]:
author_encoded_categorical_columns = pd.DataFrame()

In [98]:
author_encoded_categorical_columns = pd.DataFrame(onehot_encoder.fit_transform((df_all_data['url'].values).reshape(-1, 1)))

MemoryError: 

In [100]:
author_encoded_cols_test.shape, author_encoded_categorical_columns.shape

((34645, 18842), (0, 0))

#### Creating 'Bag of time and week days'

In [47]:
for i in feats[:2]:
    df[i] = label_encoder.fit_transform(df[i])
data_time_train = pd.DataFrame(onehot_encoder.fit_transform((df[feats[:2]])))
data_train = data_time_train.join(df[feats[2:-1]])

for i in feats[:2]:
    df_test[i] = label_encoder.fit_transform(df_test[i])
data_time_test = pd.DataFrame(onehot_encoder.fit_transform((df_test[feats[:2]])))
data_test = data_time_train.join(df_test[feats[2:-1]])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [50]:
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,cont_len,title_len
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7007,48
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10817,72
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4588,49
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4436,56
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5474,55


In [51]:
data_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,cont_len,title_len
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12580.0,55.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6759.0,49.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7337.0,42.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13027.0,58.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5723.0,47.0


## Creating Tf-Idf 

- with article content (ngram_range=(1, 2), max_features=100000 but you can try adding more)
- with article titles (ngram_range=(1, 2), max_features=100000 but you can try adding more)


### train data

In [52]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_title = count_vect.fit_transform(df_all_data.title)

tf_transformer_title = TfidfTransformer(use_idf=False).fit(df_counts_title)
df_tf_title = tf_transformer_title.transform(df_counts_title)

tfidf_transformer = TfidfTransformer()
df_tfidf_title = tfidf_transformer.fit_transform(df_counts_title)

Wall time: 9.6 s


In [53]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_content = count_vect.fit_transform(df_all_data.content)

#tf_transformer_content = TfidfTransformer(use_idf=False).fit(df_counts_content)
#df_tf_content = tf_transformer_content.transform(df_counts_content)

tfidf_transformer = TfidfTransformer()
df_tfidf_content = tfidf_transformer.fit_transform(df_counts_content)

Wall time: 1h 4min 58s


### test data

In [54]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_title_t = count_vect.fit_transform(df_all_test.title)

tf_transformer_title = TfidfTransformer(use_idf=False).fit(df_counts_title_t)
df_tf_title_te = tf_transformer_title.transform(df_counts_title_t)

tfidf_transformer = TfidfTransformer()
df_tfidf_title_te = tfidf_transformer.fit_transform(df_counts_title_t)

Wall time: 45.8 s


In [55]:
%%time

count_vect = CountVectorizer(ngram_range=(1, 2), max_features= 100000)
df_counts_content_t = count_vect.fit_transform(df_all_test.content)

tfidf_transformer = TfidfTransformer()
df_tfidf_content_te = tfidf_transformer.fit_transform(df_counts_content_t)

Wall time: 17min 52s


In [None]:
df_tfidf_content.shape, df_tfidf_title.shape, author_encoded_categorical_columns.shape

## Sparsing train and test data

In [None]:
%%time
X_train_sparse = hstack([author_encoded_categorical_columns,
                         df_tfidf_content, df_tfidf_title, data_train]).tocsr()

In [None]:
%%time
X_test_sparse = hstack([author_encoded_cols_test,
                         df_tfidf_content_te, df_tfidf_title_te, data_test]).tocsr()

In [None]:
y_train = df[feats[-1]]

In [None]:
X_train_sparse.shape, len(df[feats[-1]]), X_test_sparse.shape

 Split data on train and validation sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train_sparse, df[feats[-1]], random_state=17)

In [104]:
train_part_size = int(0.7 * y_train.shape[0])
X_train_part_sparse = X_train_sparse[:train_part_size, :]
y_train_part = y_train[:train_part_size]
X_valid_sparse =  X_train_sparse[train_part_size:, :]
y_valid = y_train[train_part_size:]

Using different models and check MAE on validation set

In [None]:
def mean_err(arr1, arr2, p = 1):
    if (len(arr1) == len(arr2)):
        sum = 0
        for i in range(len(arr1)):
            sum += (abs(arr1[i] - arr2[i])) ** p
        return ((sum ** (1 / p)) / len(arr1))
    return(0)

Ridge model

In [None]:
%%time
for i in range(3, 6):
    ridge = Ridge(alpha = i / 4)
    ridge.fit(X_train, y_train)
    print("alpha: ", i / 4, " MAE: ", ridge.score(X_test, y_test), mean_err(ridge.predict(X_test), y_test.values))

In [None]:
def write_submission_file(prediction, filename,
                          path_to_sample=os.path.join(PATH_TO_DATA, 
                                                      'sample_submission.csv')):
    submission = pd.read_csv(path_to_sample, index_col='id')
    
    submission['log_recommends'] = prediction
    submission.to_csv(filename)

In [None]:
write_submission_file(ridge_test_pred, os.path.join(PATH_TO_DATA,
                                                    'assignment6_medium_submission.csv'))

dirty Kaggle hacks. Form a submission file with all zeros. Make a submission. What do you get if you think about it? How is it going to help you with modifying your predictions?

In [None]:
write_submission_file(np.zeros_like(ridge_test_pred), 
                      os.path.join(PATH_TO_DATA,
                                   'medium_all_zeros_submission.csv'))

Modify predictions in an appropriate way (based on your all-zero submission) and make a new submission.

In [None]:
ridge_test_pred_modif = ridge_test_pred # You code here


In [None]:
write_submission_file(ridge_test_pred_modif, 
                      os.path.join(PATH_TO_DATA,
                                   'assignment6_medium_submission_with_hack.csv'))
