In [7]:
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf
from sklearn import metrics
from utils import *
from models import GCN, MLP
import random
import os
import sys

# if len(sys.argv) != 2:
# 	sys.exit("Use: python train.py <dataset>")

datasets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr', 'twitter', "twitter_hate_off"]
dataset = "twitter_hate_off"

if dataset not in datasets:
	sys.exit("wrong dataset name")

In [8]:
# Set random seed
seed = random.randint(1, 200)
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
os.environ["CUDA_VISIBLE_DEVICES"] = ""

flags = tf.app.flags
FLAGS = flags.FLAGS
# 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('dataset', dataset, 'Dataset string.')
# 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_string('model', 'gcn', 'Model string.')
flags.DEFINE_float('learning_rate', 0.02, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 400, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 200, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.8, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 0,
                   'Weight for L2 loss on embedding matrix.')  # 5e-4
flags.DEFINE_integer('early_stopping', 10,
                     'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 2, 'Maximum Chebyshev polynomial degree.')

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(
    FLAGS.dataset)
# print(adj)

print("Features shape :: " , features.shape)
print("Features shape_0 :: " , features.shape[0])
# print("Features shape :: " , features.shape)
features = sp.identity(features.shape[0])  # featureless

# glove features

print(test_mask)


# exit(0)
print('embeddings:')
def loadGloveModel(gloveFile,words):
    print("Loading Glove Model")
    # path = os.path.join("data", "gcn_glove_" + dataset + ".json")
    # if os.path.exists(path):
    #     with open(path, "r") as f:
    #         return json.load(f)

    f = open(gloveFile,'r')

    all_words = set()
    for line in words:
        for word in line.split():
            all_words.add(word)
    
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0].strip().strip('<').strip('>')
        if word in all_words:
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded! Actual : ", len(all_words))
    
    # with open(path, "w") as f:
    #     json.dump(model, f)
    return model

dim = 100
f = open('data/corpus/' + dataset + '_vocab.txt', 'r')
words = f.readlines()
f.close()
glove_file_name = "./data/glove.twitter.27B."+ str(dim) +"d.txt"
glove_vectors = loadGloveModel(glove_file_name, words)
glove_embeddings = np.random.normal(size = (adj.shape[0], dim))
glove_embeddings[:train_size] = np.zeros(shape = (train_size, dim))
glove_embeddings[adj.shape[0] - test_size:] = np.zeros(shape = (test_size, dim))

for i in range(len(words)):
    word = words[i].strip()
    if word in glove_vectors:
        glove_embeddings[i + train_size, :] = glove_vectors[word]
    else:
        glove_embeddings[i + train_size] = np.random.normal(size = [dim])


word_embs = np.array(glove_embeddings)
print(adj.shape)
print(features.shape)
print(word_embs.shape)

# import cPickle as cp 
# with open('feature.data') as f:
#     cp.dump(features, f)

# Some preprocessing
features = preprocess_features(features)
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Define placeholders
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    'labels_mask': tf.placeholder(tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    # helper variable for sparse dropout
    'num_features_nonzero': tf.placeholder(tf.int32)
}

# Create model
print("\n\n\n\n\nCreate model\n\n\n")
print(features[2][1])
model = model_func(
    placeholders, input_dim=features[2][1], logging=True, word_emb=word_embs.astype(np.float32), train_size = train_size, test_size = test_size)

# Initialize session
session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=session_conf)
# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
    t_test = time.time()
    feed_dict_val = construct_feed_dict(
        features, support, labels, mask, placeholders)
    outs_val = sess.run([model.loss, model.accuracy, model.pred, model.labels, model.hidden1], feed_dict=feed_dict_val)
    return outs_val[0], outs_val[1], outs_val[2], outs_val[3], outs_val[4],(time.time() - t_test)


# Init variables
sess.run(tf.global_variables_initializer())
cost_val = []

# Train model
for epoch in range(FLAGS.epochs):

    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(
        features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy,
                     model.hidden1], feed_dict=feed_dict)
    # Validation
    cost, acc, pred, labels, repre,duration = evaluate(
        features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(
              outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))

    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")


# Training
train_cost, train_acc, pred, labels,t_representation, train_duration = evaluate(
    features, support, y_train, train_mask, placeholders)
print("\n\n\n\nTrain set results:", "cost=", "{:.5f}".format(train_cost),
      "accuracy=", "{:.5f}".format(train_acc), "time=", "{:.5f}".format(train_duration))

train_pred = []
train_labels = []
print(len(train_mask))
for i in range(len(train_mask)):
    if train_mask[i]:
        train_pred.append(pred[i])
        train_labels.append(labels[i])

print("Train Precision, Recall and F1-Score...")
print(metrics.classification_report(train_labels, train_pred, digits=4))
print("Macro average Train Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(train_labels, train_pred, average='macro'))
print("Micro average Train Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(train_labels, train_pred, average='micro'))
print("Weight average Train Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(train_labels, train_pred, average='weighted'))



# Testing
test_cost, test_acc, pred, labels,representation, test_duration = evaluate(
    features, support, y_test, test_mask, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))

test_pred = []
test_labels = []
print(len(test_mask))
for i in range(len(test_mask)):
    if test_mask[i]:
        test_pred.append(pred[i])
        test_labels.append(labels[i])

print(representation)
fp = open("out_pred.txt", "w")
for i,j in zip(test_pred, test_labels):
    fp.write(str(i) + "," + str(j) + "\n")
fp.close()

print(metrics.confusion_matrix(test_labels,test_pred))



print("Test Precision, Recall and F1-Score...")
print(metrics.classification_report(test_labels, test_pred, digits=4))
print("Macro average Test Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(test_labels, test_pred, average='macro'))
print("Micro average Test Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(test_labels, test_pred, average='micro'))
print("Weight average Test Precision, Recall and F1-Score...")
print(metrics.precision_recall_fscore_support(test_labels, test_pred, average='weighted'))

UnrecognizedFlagError: Unknown command line flag 'f'

In [3]:
!pip install tensorflow

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/d4/29/6b4f1e02417c3a1ccc85380f093556ffd0b35dc354078074c5195c8447f2/tensorflow-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (92.6MB)
[K    100% |████████████████████████████████| 92.6MB 496kB/s ta 0:00:011    45% |██████████████▋                 | 42.3MB 1.2MB/s eta 0:00:43
[?25hCollecting tensorflow-estimator<1.14.0rc0,>=1.13.0 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/bb/48/13f49fc3fa0fdf916aa1419013bb8f2ad09674c275b4046d5ee669a46873/tensorflow_estimator-1.13.0-py2.py3-none-any.whl (367kB)
[K    100% |████████████████████████████████| 368kB 1.4MB/s ta 0:00:01
[?25hCollecting keras-preprocessing>=1.0.5 (from tensorflow)
[?25l  Downloading https://files.pythonhosted.org/packages/c0/bf/0315ef6a9fd3fc2346e85b0ff1f5f83ca17073f2c31ac719ab2e4da0d4a3/Keras_Preprocessing-1.0.9-py2.py3-none-any.whl (59kB)
[K    100% |████████████████████████████████| 61kB 1.4MB/s ta 0:00:011
[