### Choosing the right directory

In [1]:
from os import chdir
import numpy as np
import pickle
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Later cells import `utilis` / `features` by bare name and open the results file
# through a '../' path, so the working directory has to be the 'Library' folder
# (presumably where those user scripts live - confirm).
# The guard keeps this cell safe to re-run in a live kernel: an unguarded second
# chdir('Library/') would be resolved relative to Library itself and fail.
from os import getcwd
from os.path import basename

if basename(getcwd()) != 'Library':
    chdir('Library/')

### SVM Implementation

In [3]:
"""
@author: svakili

Purpose : Research Seminar - Link prediction
Baseline Test

Notes
------
Column names of node_information.csv
# (1) SRC source name (string)
# (2) TGT target name (string)
# (3) VOT vote of the SRC for the TGT (integer)
# (4) RES vote of the majority for the TGT (integer)
# (5) YEA year of the vote (integer)
# (6) DAT date of the vote (string)
# (7) TXT text explaining the vote (string)
"""
# Python standard
import csv
# Machine learning libs
from sklearn import svm
from sklearn import metrics

# User scripts
from utilis import load_data

from features import Year


# Cap on the amount of data passed to load_data (None means no cap)
MAX_DATA_SIZE = None


#%% Load data

# Column layout recorded for the node table:
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes
# NOTE(review): this list describes a paper table, while the docstring at the top
# of this cell lists SRC/TGT/VOT/RES/YEA/DAT/TXT columns - confirm which schema
# load_data actually returns.

# Going by the names: train/test inputs, train/test labels and the node table
# - TODO confirm against utilis.load_data
(ITRAIN, ITEST,
 YTRAIN, YTEST,
 NODE_INFO) = load_data(max_data_size=MAX_DATA_SIZE)

# Feature extractor built from the node table; only the Year feature is used here
FT = Year(NODE_INFO)

# The extracted feature is 1D, so each result is reshaped into a single column
XTRAIN = FT.get_features(ITRAIN, 'train').reshape(-1, 1)
XTEST = FT.get_features(ITEST, 'test').reshape(-1, 1)

# print 'Fitting Gaussian SVM'
# CLASSIFIER = svm.SVC(kernel='rbf', C=100, gamma=10, cache_size=4000, verbose=True)

# CLASSIFIER.fit(XTRAIN, YTRAIN)
# # Predict on test set
# PTEST = CLASSIFIER.predict(XTEST)


[train] (0%) processed (#1)
[train] (1%) processed (#1001)
[train] (1%) processed (#2001)
[train] (2%) processed (#3001)
[train] (3%) processed (#4001)
[train] (3%) processed (#5001)
[train] (4%) processed (#6001)
[train] (5%) processed (#7001)
[train] (5%) processed (#8001)
[train] (6%) processed (#9001)
[train] (7%) processed (#10001)
[train] (7%) processed (#11001)
[train] (8%) processed (#12001)
[train] (9%) processed (#13001)
[train] (9%) processed (#14001)
[train] (10%) processed (#15001)
[train] (11%) processed (#16001)
[train] (11%) processed (#17001)
[train] (12%) processed (#18001)
[train] (13%) processed (#19001)
[train] (13%) processed (#20001)
[train] (14%) processed (#21001)
[train] (15%) processed (#22001)
[train] (15%) processed (#23001)
[train] (16%) processed (#24001)
[train] (17%) processed (#25001)
[train] (17%) processed (#26001)
[train] (18%) processed (#27001)
[train] (19%) processed (#28001)
[train] (20%) processed (#29001)
[train] (20%) processed (#30001)
[trai

In [22]:
# Read the stored baseline predictions back from disk.
# The first CSV row is skipped (treated as a header); the prediction is taken from
# the second column of every remaining row and kept as a string, which is the
# form the labels take in the class counts printed further down.
with open('../wikipedia_admin/results/baseline_predictions.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)
    PTEST = np.asarray([element[1] for element in reader])

# Evaluation metrics
ACCURACY = metrics.accuracy_score(YTEST, PTEST)
# The labels take three values (-1, 0, 1), so the averaging mode is stated
# explicitly. 'weighted' is what the recorded F1 was computed with; relying on the
# implicit default triggers a deprecation warning on old scikit-learn and raises
# an error on newer releases, whose default ('binary') rejects multiclass targets.
F1 = metrics.f1_score(YTEST, PTEST, average='weighted')
# Single-argument parenthesised print: same output under Python 2, valid in Python 3
print('Accuracy %s' % ACCURACY)
print('F1 %s' % F1)


Accuracy 0.729790796667
F1 0.615790774152


  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)


### Test set distribution

In [30]:
# Count how many test labels fall into each distinct class
unique, counts = np.unique(YTEST, return_counts=True)

# One row per class: [label, number of occurrences]
ytest_distribution = np.asarray((unique, counts)).T
print(ytest_distribution)

[['-1' 10196L]
 ['0' 3198L]
 ['1' 36175L]]


### Prediction distribution

In [34]:
# Count how many predictions fall into each distinct class
unique, counts = np.unique(PTEST, return_counts=True)

# One row per predicted class: [label, number of occurrences]
ptest_distribution = np.asarray((unique, counts)).T
print(ptest_distribution)

[['1' '49569']]
