In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [26]:
# Load the dataset from "news.json"; lines=True tells pandas the file holds one
# JSON object per line (JSON Lines) rather than a single JSON document.
df = pd.read_json("news.json", lines=True)

In [27]:
import re
def tokenize_url(url:str) -> str:
  """Turn an article URL into a space-separated string of its word tokens.

  The fixed prefix "https://www.huffingtonpost.com/entry/" is removed, then
  every run of non-word characters and/or underscores is collapsed into a
  single space.

  Args:
    url: The article link.

  Returns:
    The URL remainder with separators replaced by single spaces. Leading or
    trailing separators in the input become a leading or trailing space.
  """
  slug = url.replace("https://www.huffingtonpost.com/entry/", "")
  # Raw string on purpose: "\W" inside a plain literal is an invalid escape
  # sequence, which modern Python warns about. [\W_]+ matches the same runs
  # as the former (\W|_)+ without a capturing group.
  return re.sub(r"[\W_]+", " ", slug)

# Word tokens extracted from each article link.
df["tokenized_url"] = df['link'].apply(tokenize_url)

# Candidate text fields for the classifier, from least to most context.
df["text_desc"] = df["short_description"]

df["text_desc_headline"] = df["short_description"] + " " + df["headline"]
# The column was originally created under the misspelled name below; keep it
# as an alias so any code that still refers to the old name keeps working.
df["tex_desc_headline"] = df["text_desc_headline"]

df["text_desc_headline_url"] = df["text_desc_headline"] + " " + df["tokenized_url"]


In [28]:
def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank at cutoff k"""
    
    # add index to list only if machine predicted label exists in true labels
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_preds) if r in true_labels]

    rr = 0
    if len(tp_pos_list) > 0:
        # for RR we need position of first correct item
        first_pos_list = tp_pos_list[0]
        
        # rr = 1/rank
        rr = 1 / float(first_pos_list)

    return rr

def compute_mrr_at_k(items:list):
    """Compute the mean reciprocal rank (MRR) over a list of evaluation items.

    Args:
        items: Sequence of ``[true_labels, predictions]`` pairs, as produced
            by ``collect_preds``. The cutoff k is implied by the number of
            predictions each item carries.

    Returns:
        The average of the per-item reciprocal ranks, or 0.0 when ``items``
        is empty (previously this case raised UnboundLocalError).
    """
    if len(items) == 0:
        return 0.0

    rr_total = 0
    for item in items:
        rr_total = rr_total + _reciprocal_rank(item[0], item[1])

    # Average once, after the loop, instead of on every iteration.
    return rr_total / float(len(items))

def collect_preds(Y_test,Y_preds):
    """Pair each prediction list with its ground-truth label.

    The i-th entry of the result is ``[[Y_test[i]], Y_preds[i]]``: the true
    label wrapped in a one-element list, followed by the predictions for the
    same position.
    """
    paired = []
    for position, predicted in enumerate(Y_preds):
        gold = [Y_test[position]]
        paired.append([gold, predicted])
    return paired
             
def compute_accuracy(eval_items:list):
    """Compute the fraction of items with at least one correct prediction.

    An item counts as correct when any of its true labels appears among its
    predictions, so with k predictions per item this is top-k accuracy.

    Args:
        eval_items: Sequence of ``[true_labels, predictions]`` pairs, as
            produced by ``collect_preds``. Predictions must be hashable.

    Returns:
        Number of correct items divided by the number of items, or 0.0 when
        ``eval_items`` is empty (previously this raised ZeroDivisionError).
    """
    if len(eval_items) == 0:
        return 0.0

    correct = 0
    for item in eval_items:
        # Set membership keeps the per-label lookup constant time.
        predicted = set(item[1])
        if any(label in predicted for label in item[0]):
            correct += 1

    return correct / float(len(eval_items))

In [29]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [30]:
import numpy as np

In [31]:
def extract_features(df, field, training_data, testing_data, type="binary"):
  """Vectorize one text column of the train and test splits.

  The vectorizer is fitted on the training split only and then applied to the
  test split, so no vocabulary or document-frequency statistics come from the
  test data.

  Args:
    df: Unused; kept so existing callers do not break.
    field: Name of the text column to vectorize.
    training_data: Training split; ``training_data[field].values`` is the
      fitting corpus.
    testing_data: Test split, transformed with the fitted vectorizer.
    type: Feature representation, matched by substring: a value containing
      "binary" gives binary term presence, one containing "counts" gives raw
      term counts, and anything else gives tf-idf weights.

  Returns:
    A tuple ``(train_feature_set, test_feature_set, vectorizer)`` where
    ``vectorizer`` is the fitted CountVectorizer or TfidfVectorizer.
  """
  if "binary" in type:
    vectorizer = CountVectorizer(binary=True, max_df=0.95)
  elif "counts" in type:
    vectorizer = CountVectorizer(binary=False, max_df=0.95)
  else:
    vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

  # fit_transform learns the vocabulary and vectorizes the training text in
  # one pass, instead of fitting and then transforming the same data again.
  train_feature_set = vectorizer.fit_transform(training_data[field].values)
  test_feature_set = vectorizer.transform(testing_data[field].values)

  # Return the vectorizer that was actually fitted. The tf-idf branch used to
  # return an undefined name and raised NameError.
  return train_feature_set, test_feature_set, vectorizer

In [32]:
def get_top_k_predictions(model, X_test, k):
  """Return the k most probable class labels for every row of X_test.

  ``model`` must expose ``predict_proba`` and ``classes_``. The result is a
  list with one inner list per row, ordered from most to least probable.
  """
  class_probs = model.predict_proba(X_test)

  # argsort is ascending, so the last k columns hold the indices of the
  # k highest-probability classes for each row.
  top_idx = np.argsort(class_probs, axis=1)[:, -k:]

  labels = model.classes_
  # Walk each row back to front so the best class comes first.
  return [[labels[col] for col in row[::-1]] for row in top_idx]

In [33]:
def train_model(df,field="text_desc",feature_rep="binary",top_k=3):
    """Train a logistic-regression category classifier and evaluate it.

    Splits ``df`` into train and test sets with a fixed seed, vectorizes the
    ``field`` column using the ``feature_rep`` representation, fits the
    classifier on the 'category' labels, and scores the ``top_k`` predictions
    on the test split.

    Returns:
        ``(model, feature_transformer, accuracy, mrr_at_k)``: the fitted
        classifier, the fitted vectorizer, and the two evaluation scores.
    """
    # Fixed random_state so the split is the same on every run.
    train_split, test_split = train_test_split(df, random_state=2000)

    labels_train = train_split['category'].values
    labels_test = test_split['category'].values

    features_train, features_test, vectorizer = extract_features(
        df, field, train_split, test_split, type=feature_rep
    )

    classifier = LogisticRegression(
        verbose=1,
        solver='liblinear',
        random_state=0,
        C=5,
        penalty='l2',
        max_iter=1000,
    )
    fitted = classifier.fit(features_train, labels_train)

    # Ranked label guesses per test row, paired with the ground truth so both
    # metrics can consume the same structure.
    ranked = get_top_k_predictions(fitted, features_test, top_k)
    scored = collect_preds(labels_test, ranked)

    return fitted, vectorizer, compute_accuracy(scored), compute_mrr_at_k(scored)

In [34]:
# Experiment settings: which text column to vectorize, how to represent it,
# and how many ranked predictions per article are evaluated.
field='text_desc'
feature_rep='binary'
top_k=3

# Train and evaluate. `accuracy` counts an article as correct when its true
# category is anywhere in the top_k predictions; `mrr_at_k` also rewards
# ranking the true category higher.
model,transformer,accuracy,mrr_at_k=train_model(df,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}; MRR={1}".format(accuracy,mrr_at_k))

[LibLinear]
Accuracy=0.601990701329317; MRR=0.48291969528301365
