In [2]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
import csv
import nltk

# Path of the CSV file that receives the final predictions.
pred_file = "improved_predictions.csv"

# Classifier selector passed to train_model(): 'logreg' (default) or 'svm'.
model = 'logreg'

In [None]:
# Text-processing helpers used for the title-overlap feature.
stemmer = nltk.stem.PorterStemmer()
stpwds = set(nltk.corpus.stopwords.words("english"))

# Every line of testing_set.txt holds one space-separated pair of paper IDs;
# csv.reader yields it as a single field, which is then split on spaces.
with open("testing_set.txt", "r") as f:
    testing_set = [row[0].split(" ") for row in csv.reader(f)]
        
# note: Kaggle requires that you add "ID" and "category" column headers

###############################
# beating the random baseline #
###############################

# the following script gets an F1 score of approximately 0.66

# data loading and preprocessing 

# the columns of node_information.csv (loaded below into node_info) are:
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes

# Each line of training_set.txt is "source target label", space-separated.
with open("training_set.txt", "r") as f:
    training_set = [row[0].split(" ") for row in csv.reader(f)]

# One record per paper; the first field is the paper ID.
with open("node_information.csv", "r") as f:
    node_info = list(csv.reader(f))

IDs = [record[0] for record in node_info]

# TF-IDF representation of the abstracts (sixth field of every record).
# Row k of features_TFIDF corresponds to node_info[k].
corpus = [record[5] for record in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
features_TFIDF = vectorizer.fit_transform(corpus)

## build the citation graph with igraph
## (it is used further down to compute the graph-based features)
## look at http://igraph.org/python/doc/igraph.Graph-class.html for feature ideas

## Vertices come from node_info.csv rather than from the edge list, because
## some papers may not be connected to any other paper.
nodes = IDs

## Only the pairs labelled "1" in the training set become (source, target) edges.
edges = [tuple(pair[:2]) for pair in training_set if pair[2] == "1"]

## Directed graph: vertices first (named by paper ID), then the edges.
g = igraph.Graph(directed=True)
g.add_vertices(nodes)
g.add_edges(edges)

## Names of the vertices that belong to the giant connected component.
## NOTE(review): components() is called without a mode on a directed graph --
## confirm whether strong or weak connectivity is the intended notion here.
gcc = [vertex['name'] for vertex in g.components().giant().vs]

# for each training example we need to compute features
# in this baseline we will train the model on only 5% of the training set

# Randomly select 5% of the training set.
# A dedicated, seeded generator is used so that the chosen subset (and hence
# every feature, model and score derived from it) is identical on each
# "Restart & Run All", without touching the global `random` state.
SAMPLE_SEED = 5
sample_size = int(round(len(training_set) * 0.05))
to_keep = random.Random(SAMPLE_SEED).sample(range(len(training_set)), k=sample_size)
training_set_reduced = [training_set[i] for i in to_keep]

# Features computed for every (source, target) pair, in the column order of
# the feature matrices built below:
#   overlap_title  - number of overlapping (stemmed, stop-word-free) title words
#   temp_diff      - publication-year difference (source minus target)
#   comm_auth      - number of common authors
#   jaccard*       - Jaccard similarity of the graph neighbourhoods (all / in / out)
#   degrees        - product of the two vertex strengths
#   shortest_path  - path length ignoring edge direction, -1 when unreachable
#   cos_similarity - TF-IDF cosine similarity of the abstracts
FEATURE_NAMES = ["overlap_title", "temp_diff", "comm_auth",
                 "jaccard", "jaccard_in", "jaccard_out",
                 "degrees", "shortest_path", "cos_similarity"]

# Distances above this value (in particular the infinity reported for
# disconnected pairs) are encoded as -1.
UNREACHABLE_DISTANCE = 1500000

# Paper ID -> row in node_info (also the vertex index in g, since the vertices
# were added in the order of IDs). setdefault keeps the first occurrence, which
# is what IDs.index() returned, but the lookup is O(1) instead of O(n).
id_to_index = {}
for position, paper_id in enumerate(IDs):
    id_to_index.setdefault(paper_id, position)


def _title_tokens(title):
    """Return the set of stemmed title tokens, lowercased and without stop words.

    Empty tokens (from an empty title or repeated spaces) are dropped so that
    they are not counted as a shared word.
    """
    return {stemmer.stem(token) for token in title.lower().split(" ")
            if token and token not in stpwds}


def _author_set(authors):
    """Return the set of author names from a comma-separated string.

    Names are stripped of surrounding whitespace and empty names are dropped,
    so two papers with a missing author field do not share an "author".
    """
    return {name.strip() for name in authors.split(",") if name.strip()}


def compute_pair_features(pairs, progress_message):
    """Compute the pairwise features for a list of candidate links.

    Parameters
    ----------
    pairs : list of lists
        Each item starts with the source paper ID and the target paper ID
        (any further fields, such as the label, are ignored).
    progress_message : str
        Text printed after the running count every 1000 processed pairs.

    Returns
    -------
    dict
        Maps every name in FEATURE_NAMES to a list with one value per pair.

    Raises
    ------
    KeyError
        If a paper ID of a pair is not present in node_info.

    NOTE(review): g contains the positive training edges, so the graph
    features of a positive *training* pair are computed with that very edge
    present (e.g. its distance is at most 1). Consider removing the edge before
    computing the features of its own pair.
    """
    features = {name: [] for name in FEATURE_NAMES}

    for counter, pair in enumerate(pairs, start=1):
        index_source = id_to_index[pair[0]]
        index_target = id_to_index[pair[1]]
        source_info = node_info[index_source]
        target_info = node_info[index_target]

        # text / metadata features
        features["overlap_title"].append(
            len(_title_tokens(source_info[2]) & _title_tokens(target_info[2])))
        features["temp_diff"].append(int(source_info[1]) - int(target_info[1]))
        features["comm_auth"].append(
            len(_author_set(source_info[3]) & _author_set(target_info[3])))
        features["cos_similarity"].append(
            cosine_similarity(features_TFIDF.getrow(index_source),
                              features_TFIDF.getrow(index_target))[0][0])

        # graph features
        pair_indices = [(index_source, index_target)]
        features["jaccard"].append(
            g.similarity_jaccard(pairs=pair_indices, loops=False)[0])
        features["jaccard_in"].append(
            g.similarity_jaccard(pairs=pair_indices, mode='IN', loops=False)[0])
        features["jaccard_out"].append(
            g.similarity_jaccard(pairs=pair_indices, mode='OUT', loops=False)[0])
        features["degrees"].append(
            g.strength(index_source) * g.strength(index_target))

        # mode=3 is igraph's ALL constant: edge direction is ignored
        distance = g.shortest_paths_dijkstra(source=index_source,
                                             target=index_target, mode=3)[0][0]
        features["shortest_path"].append(
            -1 if distance > UNREACHABLE_DISTANCE else distance)

        # report progress every 1000 pairs
        if counter % 1000 == 0:
            print(counter, progress_message)

    return features


# ---------------------------------------------------------------- training set
training_feature_lists = compute_pair_features(training_set_reduced,
                                               "training examples processsed")
(overlap_title, temp_diff, comm_auth,
 jaccard, jaccard_in, jaccard_out,
 degrees, shortest_path, cos_similarity) = (
    training_feature_lists[name] for name in FEATURE_NAMES)

# placeholder for a "pair touches the giant component" flag (currently unused)
in_gcc = np.zeros(len(training_set_reduced))

# examples as rows, features as columns
raw_training_features = np.array(
    [training_feature_lists[name] for name in FEATURE_NAMES]).T

# Standardise to zero mean / unit variance. The scaler is fitted on the
# training data only and reused for the testing data below, so both sets go
# through the exact same transformation.
scaler = preprocessing.StandardScaler()
training_features = scaler.fit_transform(raw_training_features)

# labels as an integer array
labels = [int(element[2]) for element in training_set_reduced]
labels_array = np.array(labels)

# cross-validation splits used by the grid search
cv_sets = StratifiedShuffleSplit(n_splits = 2, test_size = 0.20, random_state = 5)

# ----------------------------------------------------------------- testing set
testing_feature_lists = compute_pair_features(testing_set,
                                              "testing examples processsed")
(overlap_title_test, temp_diff_test, comm_auth_test,
 jaccard_test, jaccard_in_test, jaccard_out_test,
 degrees_test, shortest_path_test, cos_similarity_test) = (
    testing_feature_lists[name] for name in FEATURE_NAMES)

# placeholder, see in_gcc above
in_gcc_test = np.zeros(len(testing_set))

# examples as rows, features as columns
testing_features = np.array(
    [testing_feature_lists[name] for name in FEATURE_NAMES]).T

# scale with the statistics learned on the training set
testing_features = scaler.transform(testing_features)


        
#number of common neighbours
#something with degrees
#shortest_paths_dijkstra
#Cuts : st_cuts or st_mincuts or all_st_(min)cuts
#Node ID ?
#In core of graph? 
#Clusters or k_core
# vertex/edge_disjoint_paths or cohesion

In [4]:
# initialize classifier
def train_model(model):
    """Return an unfitted classifier and its hyper-parameter grid.

    Parameters
    ----------
    model : str
        'svm' for an RBF-kernel SVC, 'logreg' for a logistic regression.

    Returns
    -------
    (classifier, parameters)
        The estimator instance and the parameter grid (a dict of lists) to
        search over, e.g. with GridSearchCV.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names. Failing here gives a
        clear message instead of returning None, which would make the caller's
        tuple unpacking fail with an unrelated TypeError.
    """
    if model == 'svm':
        classifier = svm.SVC()
        parameters = {"kernel": ["rbf"], 'C': [1, 10, 20, 50, 100], 'gamma' : [1e-3]}
    elif model == 'logreg':
        classifier = LogisticRegression()
        parameters = {"multi_class": ["multinomial"], "penalty": ["l2"],
            'C': [1, 10, 100], "solver": ["newton-cg", "sag"], "max_iter": [300]}
    else:
        raise ValueError(
            "Please choose a correct classifier ('svm' or 'logreg'), got %r" % (model,))
    return classifier, parameters

# train: grid-search the hyper-parameters over the stratified CV splits
classifier, parameters = train_model(model)
best_classifier = GridSearchCV(classifier, parameters, cv = cv_sets)
best_classifier.fit(training_features, labels_array)

# issue predictions with the fitted grid search (it delegates to the best
# estimator refitted on the whole training data). `classifier` itself is never
# fitted -- GridSearchCV works on clones -- so it cannot be used to predict.
predictions_SVM = list(best_classifier.predict(testing_features))

# pair every prediction with its row number in the testing set
predictions_SVM = zip(range(len(testing_set)), predictions_SVM)

# write the header row and the predictions to a .csv file for Kaggle;
# newline="" lets the csv module control line endings (no blank rows on Windows)
headers = ['id', 'category']
with open(pred_file, "w", newline="") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(headers)
    csv_out.writerows(predictions_SVM)

NameError: name 'cv_sets' is not defined