In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
import logging
import matplotlib.pyplot as plt
import re
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pickle
import sklearn
import numpy as np
import logging
from sklearn import svm


# Load the news-category dataset; the file is JSON Lines (one record per line).
df = pd.read_json(
    r"/content/drive/My Drive/ML codes/NLP text scan/News_Category_Dataset_v2.json",
    lines=True,
)

# Work on the first 5000 records only (presumably to keep training time
# manageable -- confirm before changing the cutoff).
df = df[:5000]
def tokenize_url(url:str):
    """Turn an article URL into a space-separated string of slug tokens.

    The HuffPost entry prefix is removed wherever it occurs, then every run
    of non-word characters and/or underscores is collapsed into one space.

    Args:
        url: The article link.

    Returns:
        The tokenized slug, e.g. ``"some-story_5b0a"`` -> ``"some story 5b0a"``.
    """
    url = url.replace("https://www.huffingtonpost.com/entry/", "")
    # Raw string: a bare "\W" in a normal string literal is an invalid escape
    # sequence. The class [\W_] matches the same characters as (\W|_).
    return re.sub(r"[\W_]+", " ", url)

# Words recovered from each article's URL slug.
df['tokenized_url'] = df['link'].apply(tokenize_url)

# Three candidate text fields of increasing richness; each one builds on the
# previous column.

# 1) description only
df['text_desc'] = df['short_description']

# 2) description + headline
df['text_desc_headline'] = df['text_desc'] + ' ' + df['headline']

# 3) description + headline + tokenized url
df['text_desc_headline_url'] = df['text_desc_headline'] + " " + df['tokenized_url']

def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank at cutoff k"""
    
    # add index to list only if machine predicted label exists in true labels
    tp_pos_list = [(idx + 1) for idx, r in enumerate(machine_preds) if r in true_labels]

    rr = 0
    if len(tp_pos_list) > 0:
        # for RR we need position of first correct item
        first_pos_list = tp_pos_list[0]
        
        # rr = 1/rank
        rr = 1 / float(first_pos_list)

    return rr

def compute_mrr_at_k(items:list):
    """Compute the mean reciprocal rank (MRR) over a list of evaluation items.

    Args:
        items: Sequence of ``[true_labels, machine_preds]`` pairs. The cutoff
            is implicit: it is however many predictions each item carries.

    Returns:
        The average of the per-item reciprocal ranks as a float, or ``0.0``
        when ``items`` is empty.
    """
    # Without this guard an empty list would reach the final division with
    # nothing to average.
    if not items:
        return 0.0

    rr_total = 0
    for item in items:
        rr_total = rr_total + _reciprocal_rank(item[0], item[1])

    # Divide once, after the loop, instead of recomputing the mean per item.
    return rr_total / float(len(items))

def collect_preds(Y_test,Y_preds):
    """Pair each prediction list with its ground-truth label.

    Returns a list with one ``[[true_label], predictions]`` entry per element
    of ``Y_preds``; the true label is looked up in ``Y_test`` by position and
    wrapped in a single-element list.
    """
    paired = []
    for position, predicted in enumerate(Y_preds):
        gold = [Y_test[position]]
        paired.append([gold, predicted])
    return paired
             
def compute_accuracy(eval_items:list):
    """Compute top-k accuracy over a list of evaluation items.

    An item counts as correct when at least one of its true labels appears
    among its predictions.

    Args:
        eval_items: Sequence of ``[true_labels, machine_preds]`` pairs.

    Returns:
        Fraction of correct items as a float, or ``0.0`` when ``eval_items``
        is empty.

    Side effects:
        Prints the raw number of correct items to stdout.
    """
    correct = 0

    for item in eval_items:
        true_labels = item[0]
        # Set membership keeps the per-label check O(1).
        predicted = set(item[1])
        if any(cat in predicted for cat in true_labels):
            correct += 1

    print("\n\n\ncorrect = ",correct)

    # Avoid dividing by zero when there is nothing to evaluate.
    if not eval_items:
        return 0.0
    return correct / float(len(eval_items))

# Emit timestamped INFO-level (and above) messages through the root logger.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

def extract_features(df,field,training_data,testing_data,type="binary"):
    """Vectorize one text column of the train and test frames.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so the vocabulary never sees test data.

    Args:
        df: Unused; kept so existing callers keep working.
        field: Name of the text column to vectorize.
        training_data: Frame whose ``field`` column fits the vectorizer.
        testing_data: Frame whose ``field`` column is transformed only.
        type: Feature representation. A value containing ``"binary"`` gives
            binary term presence, one containing ``"counts"`` gives raw term
            counts, anything else gives TF-IDF weights. (The name shadows the
            builtin but is part of the public signature.)

    Returns:
        Tuple ``(train_feature_set, test_feature_set, vectorizer)`` where
        ``vectorizer`` is the fitted transformer.
    """
    logging.info("Extracting features and creating vocabulary...")

    # Pick the vectorizer; everything after this is identical for all three
    # representations.
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform already yields the training matrix, so use its result
    # instead of discarding it and transforming the training data again.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer

def get_top_k_predictions(model,X_test,k):
    """Return the ``k`` most probable class labels for every row of ``X_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Feature matrix passed straight to ``model.predict_proba``.
        k: Number of labels to keep per row. Values larger than the number of
            classes return every class; ``k <= 0`` returns empty lists.

    Returns:
        One list per input row, holding class labels ordered from most to
        least probable.
    """
    # Probabilities rather than hard labels, since more than one candidate
    # per row is wanted.
    probs = model.predict_proba(X_test)

    # Slicing the ascending order with [-k:] would select every class when
    # k == 0, so non-positive k is handled explicitly.
    if k <= 0:
        return [[] for _ in probs]

    # Class indices sorted by descending probability, cut to the first k.
    best_n = np.argsort(probs, axis=1)[:, ::-1][:, :k]

    # Map class indices back to class labels.
    return [[model.classes_[class_idx] for class_idx in row] for row in best_n]
   





def train_model(df,field="text_desc",feature_rep="binary",top_k=3):
    """Train a linear SVM text classifier and evaluate it on a held-out split.

    Args:
        df: Frame with a ``category`` label column and the text column named
            by ``field``.
        field: Text column used to build features.
        feature_rep: Feature representation forwarded to ``extract_features``
            as its ``type`` argument.
        top_k: Number of ranked predictions kept per test row for evaluation.

    Returns:
        Tuple ``(model, feature_transformer, accuracy, mrr_at_k)``: the fitted
        classifier, the fitted vectorizer, top-k accuracy and mean reciprocal
        rank on the test split.
    """
    logging.info("Starting model training...")

    # One seed drives both the split and the SVM's probability calibration.
    # Seeding the split alone leaves the probability estimates (and hence
    # the top-k ranking) free to vary between runs.
    seed = 200

    # GET A TRAIN TEST SPLIT (train_test_split's default split sizes are used)
    training_data, testing_data = train_test_split(df, random_state=seed)

    # GET LABELS
    Y_train = training_data['category'].values
    Y_test = testing_data['category'].values

    # GET FEATURES
    X_train, X_test, feature_transformer = extract_features(
        df, field, training_data, testing_data, type=feature_rep
    )

    # INIT LINEAR SVM CLASSIFIER (probability=True enables predict_proba,
    # which get_top_k_predictions relies on)
    logging.info("Training a SVM  Model...")
    clf = svm.SVC(probability=True, kernel='linear', random_state=seed)
    model = clf.fit(X_train, Y_train)

    # GET TOP K PREDICTIONS
    preds = get_top_k_predictions(model, X_test, top_k)

    # GET PREDICTED VALUES AND GROUND TRUTH INTO A LIST OF LISTS - for ease of evaluation
    eval_items = collect_preds(Y_test, preds)

    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    accuracy = compute_accuracy(eval_items)
    mrr_at_k = compute_mrr_at_k(eval_items)

    logging.info("Done training and evaluation.")

    return model, feature_transformer, accuracy, mrr_at_k


# Experiment settings: which text column to learn from, how to vectorize it,
# and how many ranked predictions to score per test row.
field = 'text_desc'
feature_rep = 'binary'
top_k = 3

# Train on the prepared frame and keep the fitted artefacts and metrics.
model, transformer, accuracy, mrr_at_k = train_model(
    df,
    field=field,
    feature_rep=feature_rep,
    top_k=top_k,
)


#model_path="/content/drive/My Drive/ML codes/NLP text scan/model.pkl"
#transformer_path="/content/drive/My Drive/ML codes/NLP text scan/transformer.pkl"

# pickle.dump(model,open(model_path, 'wb'))
# pickle.dump(transformer,open(transformer_path,'wb'))


# loaded_model = pickle.load(open(model_path, 'rb'))
# loaded_transformer = pickle.load(open(transformer_path, 'rb'))

# test_features=transformer.transform(["True Thompson makes an adorable cameo in Khloe Kardashian's new makeup tutorial video"])

# Report the test-set metrics returned by train_model.
print("\nAccuracy={0}; MRR={1}".format(accuracy,mrr_at_k))
# print(get_top_k_predictions(loaded_model,test_features,2))


#print(sklearn.metrics.confusion_matrix(test_features, pre, labels=None, sample_weight=None, normalize=None))
#print("acc=",sklearn.metrics.accuracy_score(test_features, pre, normalize=True, sample_weight=None))







2020-04-27 12:31:44,773 : INFO : Starting model training...
2020-04-27 12:31:44,836 : INFO : Extracting features and creating vocabulary...
2020-04-27 12:31:44,969 : INFO : Training a SVM  Model...
2020-04-27 12:32:13,447 : INFO : Starting evaluation...
2020-04-27 12:32:13,451 : INFO : Done training and evaluation.





correct =  939

Accuracy=0.7512; MRR=0.6217333333333339


In [9]:
# Vectorize one headline with the fitted transformer, then display its two
# most probable categories.
test_features = transformer.transform(
    ["Democrats Win Special Election In Missouri District That Went Big For Trump"]
)
pre = get_top_k_predictions(model, test_features, 2)
pre

[['POLITICS', 'MEDIA']]

In [0]:
# Reload the complete dataset (not truncated to 5000 rows) so individual
# headlines can be inspected.
d = pd.read_json(
    r"/content/drive/My Drive/ML codes/NLP text scan/News_Category_Dataset_v2.json",
    lines=True,
)

# Headline column on its own.
s = d['headline']


In [4]:
# Show the headline stored under index 6100 (the one classified in the cell above).
s[6100]

'Democrats Win Special Election In Missouri District That Went Big For Trump'

In [5]:
# Show the headline stored under index 9004.
s[9004]

'Philippine Ferry Carrying 251 Capsizes In Storm'

In [6]:
# Show the headline stored under index 6020.
s[6020]

'Volunteer Gymnastics Coach Charged With Child Molestation'