In [1]:
import pandas as pd
import sklearn 
import numpy as np
import nltk
#nltk.download('punkt')
import re
import time
import codecs

In [2]:
# Read the papers, authors, and paper-author CSV files into pandas DataFrames
# (one DataFrame per file, read in the order listed).
papers_data, authors_data, authorId_data = (
    pd.read_csv(csv_name)
    for csv_name in ('Papers.csv', 'Authors.csv', 'PaperAuthors.csv')
)

In [3]:
def given_paperID_give_index(paper_id, paper_data):
    """Return the positional row number of the paper whose 'Id' equals paper_id.

    The result is a 0-based position, not an index label, so it is valid for
    ``DataFrame.iloc`` and for selecting the matching row of a matrix built
    from ``paper_data`` in row order, whatever index the frame carries.
    For a frame with the default 0..n-1 index the position and the label
    coincide.

    Parameters
    ----------
    paper_id : value compared against the 'Id' column.
    paper_data : pandas.DataFrame with an 'Id' column.

    Returns
    -------
    int
        Position of the first row whose 'Id' matches.

    Raises
    ------
    IndexError
        If no row has the requested Id.
    """
    matches = np.flatnonzero((paper_data['Id'] == paper_id).values)
    if matches.size == 0:
        raise IndexError('no paper with Id %r in paper_data' % (paper_id,))
    return int(matches[0])
#
def given_index_give_PaperID(index, paper_data):
    """Return the 'Id' value of the row found at position `index` of paper_data."""
    selected_row = paper_data.iloc[index]
    return selected_row['Id']

In [4]:
# Pick an example paper by Id, find its row, and preview the first
# 1000 characters of its raw text.
Ex_paper_id = 5941
Ex_paper_index = given_paperID_give_index(Ex_paper_id, papers_data)
papers_data['PaperText'].iloc[Ex_paper_index][:1000]

'Learning with Symmetric Label Noise: The\nImportance of Being Unhinged\n\nBrendan van Rooyen\xe2\x88\x97,\xe2\x80\xa0\n\xe2\x88\x97\n\nAditya Krishna Menon\xe2\x80\xa0,\xe2\x88\x97\n\nThe Australian National University\n\n\xe2\x80\xa0\n\nRobert C. Williamson\xe2\x88\x97,\xe2\x80\xa0\n\nNational ICT Australia\n\n{ brendan.vanrooyen, aditya.menon, bob.williamson }@nicta.com.au\n\nAbstract\nConvex potential minimisation is the de facto approach to binary classification.\nHowever, Long and Servedio [2010] proved that under symmetric label noise\n(SLN), minimisation of any convex potential over a linear function class can result in classification performance equivalent to random guessing. This ostensibly\nshows that convex losses are not SLN-robust. In this paper, we propose a convex,\nclassification-calibrated loss and prove that it is SLN-robust. The loss avoids the\nLong and Servedio [2010] result by virtue of being negatively unbounded. The\nloss is a modification of the hinge loss, wh

In [5]:
def clean_text(text):
    """Reduce raw text to lower-case ASCII letters separated by single spaces.

    Every run of characters other than a-z / A-Z (digits, punctuation,
    whitespace, newlines, form feeds, non-ASCII bytes) becomes one space and
    the result is lower-cased. A leading or trailing run also becomes a
    single space; the output is not stripped.

    Parameters
    ----------
    text : str
        Raw text. A missing value (None or NaN, as produced for an empty
        cell when a CSV column is loaded with pandas) is treated as empty
        text instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned text; '' for a missing value.
    """
    if pd.isnull(text):
        return ''
    # '\x0c' and '\n' are themselves non-letters, so this one substitution
    # covers them too; no separate replace pass is needed.
    letters_only = re.sub('[^a-zA-Z]+', ' ', text)
    return letters_only.lower()

In [6]:
# Store a cleaned copy of each free-text column next to the original.
papers_data['PaperText_clean'] = papers_data['PaperText'].apply(clean_text)
papers_data['Abstract_clean'] = papers_data['Abstract'].apply(clean_text)

In [7]:
# Preview the first 1000 characters of the example paper's cleaned text.
papers_data['PaperText_clean'].iloc[Ex_paper_index][:1000]

'learning with symmetric label noise the importance of being unhinged brendan van rooyen aditya krishna menon the australian national university robert c williamson national ict australia brendan vanrooyen aditya menon bob williamson nicta com au abstract convex potential minimisation is the de facto approach to binary classification however long and servedio proved that under symmetric label noise sln minimisation of any convex potential over a linear function class can result in classification performance equivalent to random guessing this ostensibly shows that convex losses are not sln robust in this paper we propose a convex classification calibrated loss and prove that it is sln robust the loss avoids the long and servedio result by virtue of being negatively unbounded the loss is a modification of the hinge loss where one does not clamp at zero hence we call it the unhinged loss we show that the optimal unhinged solution is equivalent to that of a strongly regularised svm and is 

In [8]:
# Here Brandon defines a tokenizer and stemmer which returns the list
# of stems of the tokens in the text that it is passed
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    """Split text into word tokens and return their stems as a list.

    The text is split into sentences first and each sentence into words, so
    that punctuation ends up as its own token. Tokens containing no ASCII
    letter (numbers, bare punctuation) are dropped; the remaining tokens are
    stemmed with the module-level `stemmer`, in their original order.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Producing tf_idf matrix separately based on Abstract
tfidf_vectorizer_Abstract = TfidfVectorizer(max_df=0.95, max_features=200000,
                                 min_df=0.05, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix_Abstract = tfidf_vectorizer_Abstract.fit_transform(papers_data['Abstract_clean'])

# Producing tf_idf matrix separately based on PaperText
tfidf_vectorizer_PaperText = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix_PaperText = tfidf_vectorizer_PaperText.fit_transform(papers_data['PaperText_clean'])

CPU times: user 1.45 s, sys: 8.06 ms, total: 1.45 s
Wall time: 1.51 s
CPU times: user 42.9 s, sys: 414 ms, total: 43.3 s
Wall time: 43.2 s


In [10]:
# Vocabulary of each vectorizer; position i is the term behind column i of
# the matrix produced by that same vectorizer.
terms_Abstract = tfidf_vectorizer_Abstract.get_feature_names()
# Taken from the PaperText vectorizer (this previously read the Abstract
# vectorizer, which would mislabel the columns of tfidf_matrix_PaperText).
terms_PaperText = tfidf_vectorizer_PaperText.get_feature_names()

In [11]:
def top_tfidf_feats(row, terms, top_n=25):
    """Return the terms with the highest scores in one row of tf-idf weights.

    Parameters
    ----------
    row : 1-D numpy array
        One score per feature.
    terms : sequence
        Maps a feature position to its term; indexed with positions of `row`.
    top_n : int
        Maximum number of terms to return.

    Returns
    -------
    pandas.Series
        Series named 'feature' holding the selected terms, highest score
        first. Empty when nothing is selected (top_n == 0 or an empty row).
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(terms[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty
    # selection; assigning df.columns afterwards raised a length-mismatch
    # ValueError in that case.
    df = pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
    return df['feature']
def given_paperID_give_keywords(paper_data, tfidfMatrix, terms, paper_id, top_n=20):
    """Return the top_n highest-weighted terms for the paper with the given Id.

    The paper's row is looked up in `paper_data`, the same row of
    `tfidfMatrix` is converted to a dense array, and the best-scoring
    entries are mapped to their names through `terms`.
    Assumes `tfidfMatrix` rows follow the row order of `paper_data` and that
    a row supports `.toarray()` (e.g. a scipy sparse matrix) -- confirm
    against the caller.
    """
    paper_row = given_paperID_give_index(paper_id, paper_data)
    dense_weights = tfidfMatrix[paper_row].toarray()
    return top_tfidf_feats(np.squeeze(dense_weights), terms, top_n)

In [12]:
# Show the ten strongest Abstract-based keywords of the example paper.
paper_id_example = 5941
print("Keywords based on Abstract:")
print(given_paperID_give_keywords(
    papers_data,
    tfidf_matrix_Abstract,
    terms_Abstract,
    paper_id_example,
    top_n=10,
))

Keywords based on Abstract:
0            loss
1          convex
2          robust
3         classif
4          strong
5           solut
6           prove
7             ani
8    paper propos
9          result
Name: feature, dtype: object


In [13]:
from sklearn.neighbors import NearestNeighbors

# Number of neighbors returned per paper by each model below.
num_neighbors = 4

# Abstract-based model: fit on the Abstract tf-idf matrix, then query it
# with that same matrix.
nbrs_Abstract = NearestNeighbors(n_neighbors=num_neighbors, algorithm='auto')
nbrs_Abstract.fit(tfidf_matrix_Abstract)
distances_Abstract, indices_Abstract = nbrs_Abstract.kneighbors(tfidf_matrix_Abstract)

# PaperText-based model: same procedure on the full-text tf-idf matrix.
nbrs_PaperText = NearestNeighbors(n_neighbors=num_neighbors, algorithm='auto')
nbrs_PaperText.fit(tfidf_matrix_PaperText)
distances_PaperText, indices_PaperText = nbrs_PaperText.kneighbors(tfidf_matrix_PaperText)

In [14]:
# Index the neighbor tables with the example paper's own row
# (Ex_paper_index, looked up from its Id) rather than a hard-coded row
# number, so the printed neighbors always belong to the example paper.
print ("Nbrs of the example paper based on Abstract similarity: %r" % indices_Abstract[Ex_paper_index])
print ("Nbrs of the example paper based on PaperText similarity: %r" % indices_PaperText[Ex_paper_index])

Nbrs of the example paper based on Abstract similarity: array([  1,  87, 301, 112])
Nbrs of the example paper based on PaperText similarity: array([  1, 125, 112, 148])


In [15]:
# Print the example paper's abstract followed by the abstracts of its
# Abstract-based neighbors. The first entry of the neighbor row is printed
# as the example paper itself (presumably because each paper is its own
# nearest neighbor -- verify if duplicate abstracts exist).
Ex_paper_id = 5941
Ex_index = given_paperID_give_index(Ex_paper_id, papers_data)
neighbor_rows = indices_Abstract[Ex_index]
print("The Abstract of the example paper is:\n")
print(papers_data.iloc[neighbor_rows[0]]['Abstract'])
print("The Abstract of the similar papers are:\n")
for rank, neighbor_row in enumerate(neighbor_rows[1:], start=1):
    print("Neighbor No. %r has following abstract: \n" % rank)
    print(papers_data.iloc[neighbor_row]['Abstract'])
    print("\n")

The Abstract of the example paper is:

Convex potential minimisation is the de facto approach to binary classification. However, Long and Servedio [2008] proved that under symmetric label noise (SLN), minimisation of any convex potential over a linear function class can result in classification performance equivalent to random guessing. This ostensibly shows that convex losses are not SLN-robust. In this paper, we propose a convex, classification-calibrated loss and prove that it is SLN-robust. The loss avoids the Long and Servedio [2008] result by virtue of being negatively unbounded. The loss is a modification of the hinge loss, where one does not clamp at zero; hence, we call it the unhinged loss. We show that the optimal unhinged solution is equivalent to that of a strongly regularised SVM, and is the limiting solution for any convex potential; this implies that strong l2 regularisation makes most standard learners SLN-robust. Experiments confirm the unhinged loss’ SLN-robustness.
