```
ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object;
ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

print(x.shape)                  (140, 1433)
print(tx.shape)                 (1000, 1433)
print(allx.shape)               (1708, 1433)
print(y.shape)                  (140, 7)
print(ty.shape)                 (1000, 7)
print(ally.shape)               (1708, 7)
print(len(graph.keys()))        2708
```

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
data = pd.read_csv(r'goemotions/goemotions_1.csv') # load in the data (path is relative to the working directory)
# Going to use a sample of the dataset first since it is so large.
# NOTE(review): nothing is sampled in this cell - presumably this CSV is itself the sample; confirm.
# Plan for the variables built further down:
#   - the adjacency matrix will be saved as `graph`
#   - `x` corresponds to document nodes; `allx` will also include the word nodes
#   - feature vectors will just be (a slice of) the identity matrix for now

In [None]:
# Build one text row and one multi-hot label row per comment id.
# NOTE: this cell rebinds `data` (raw annotations -> joined text + labels), so it has to be
# re-run together with the cell that loads the CSV; running it twice in a row fails.
emotions = data.columns[9:] # names of the emotion label columns (everything from column 9 on)

# The label columns are selected through `emotions` rather than a hand-written index list,
# so the aggregation always covers exactly the columns that `emotions` names.
# Votes are summed per id, then every non-zero count is set to 1: a label is present when
# at least one annotator chose it (union of all annotators' labelling).
yval = data[['id', *emotions]].groupby('id').sum()
yval = yval.mask(yval.ne(0), 1)

xval = data[['text', 'id']].drop_duplicates().set_index('id') # one text row per id
data = pd.concat([xval, yval], join="inner", axis=1) # join the text and labels on the same id
data = data.loc[data[emotions].sum(axis=1) != 0] # keep only rows with at least one label

In [None]:
# Per-label counts: column 0 = test split, column 1 = training split.
# Computed directly from `data`, using the same "first 30% of the rows is the test split"
# rule as the feature/label cell, so this cell does not depend on variables that are only
# defined in later cells.
_n_test = int(data.shape[0] * 0.3)
pd.DataFrame({
    0: data[emotions].iloc[:_n_test].sum(axis=0),
    1: data[emotions].iloc[_n_test:].sum(axis=0),
})

Unnamed: 0,0,1
admiration,1511,3169
amusement,754,1779
anger,786,1484
annoyance,1408,2634
approval,1868,3504
caring,602,1203
confusion,809,1390
curiosity,960,1868
desire,391,736
disappointment,923,1630


In [None]:
import regex as re
import emoji
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the preprocessing function, in a fixed order.
for resource in ('wordnet', 'punkt', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

# Shared helpers read by preprocess_custom.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_custom(text):
    """Normalise one raw comment into a lower-cased, space-separated token string.

    Steps, in order: strip URLs, @mentions and '#' signs; cap runs of a repeated
    character at three; replace emojis via ``emoji.demojize``; drop digits; expand
    contractions; remove punctuation except '!' and '?'; collapse whitespace;
    tokenise; drop English stop words; lemmatise; lower-case.

    Relies on the notebook-level names ``re`` (bound to the third-party ``regex``
    module, which the Unicode punctuation property class below requires), ``emoji``,
    ``contractions``, ``nltk``, ``stop_words`` and ``lemmatizer``.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned tokens joined by single spaces, as one string.
    """
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', text)   # remove URLs
    cleaned = re.sub("@[A-Za-z0-9_]+", "", cleaned)        # remove user mentions
    cleaned = re.sub("#", "", cleaned)                     # remove '#' but keep the hashtag word

    # Cap runs of three or more identical characters at exactly three ("sooooo" -> "sooo").
    # The run has to be at least three long already: matching runs of two would
    # lengthen ordinary double letters ("reddit" -> "redddit").
    cleaned = re.sub(r'(.)\1{2,}', r'\1\1\1', cleaned)

    cleaned = emoji.demojize(cleaned)                      # replace emojis

    # TODO: replace slang

    cleaned = re.sub(r'[0-9]+', '', cleaned)               # remove digits 0-9
    cleaned = contractions.fix(cleaned)                    # expand contractions
    cleaned = re.sub(r"[^\P{P}!?]+", "", cleaned)          # remove punctuation except for ! and ?

    # TODO: remove abbreviations

    cleaned = " ".join(cleaned.split())                    # collapse extra whitespace

    tokens = nltk.word_tokenize(cleaned)
    # Stop words are matched case-insensitively; lemmatisation runs on the surviving tokens.
    kept = [w for w in tokens if w.lower() not in stop_words]
    return ' '.join(lemmatizer.lemmatize(w) for w in kept).lower()

# Clean every comment's text; the result is a Series of token strings with the same index as `data`.
xval = data["text"].map(preprocess_custom)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kimkiamco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kimkiamco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kimkiamco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kimkiamco/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Preview the first few preprocessed documents.
xval.head()

id
eew5j0j                                            game hurt
ed2mah1                                    right care fuck !
eeibobj                                     man love redddit
eda6yn6                             name nowhere near falcon
eespn2i    right ? considering important document know da...
Name: text, dtype: object

In [None]:
import scipy
from scipy import sparse
# Bag-of-words counts: rows = documents, columns = vocabulary terms.
# CountVectorizer's default token_pattern only keeps runs of two or more word characters
# and treats punctuation as a separator, which would discard the standalone '!' and '?'
# tokens that preprocess_custom deliberately keeps. The pattern is the default one
# extended with those two symbols so they become vocabulary terms (word nodes) as well.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b|[!?]")
# NOTE: rebinds `xval` (text Series -> sparse count matrix); re-run the preprocessing
# cell before re-running this one.
xval = vectorizer.fit_transform(xval) # this returns a document term frequency count

In [None]:
# Bipartite document-word adjacency W = [0 x; xt 0], where x is the document-term
# count matrix `xval` (documents along rows, vocabulary words along columns).
# The zero blocks are created as empty sparse matrices straight from their shape;
# going through np.zeros would first allocate dense docs x docs and words x words arrays.
z1 = sparse.csr_matrix((xval.shape[0], xval.shape[0]), dtype=np.float64) # document-document block (no edges)
z2 = sparse.csr_matrix((xval.shape[1], xval.shape[1]), dtype=np.float64) # word-word block (no edges)
graph = sparse.vstack([sparse.hstack([z1,xval]),sparse.hstack([xval.transpose(),z2])]) # W = [0 x; xt 0]

```
print(x.shape)                  (140, 1433)     (only train docs, n_features parameter)
print(tx.shape)                 (1000, 1433)    (only test docs, n_features parameter)
print(allx.shape)               (1708, 1433)    (num train docs + num words, n_features parameter)
print(y.shape)                  (140, 7)        (train docs, 28)
print(ty.shape)                 (1000, 7)       (test docs, 28)
print(ally.shape)               (1708, 7)       (num train docs + num words, 28)
print(len(graph.keys()))        2708            (num docs + num words)
```

In [None]:
# Node counts: total nodes in the graph, and how many remain once the
# documents (the rows of `data`) are subtracted.
ndocs_nwords = graph.shape[0]
nwords = ndocs_nwords - len(data)
print(ndocs_nwords, nwords, sep="\n")

68636
25619


In [None]:
n_features = 100 # number of identity-matrix columns kept as placeholder features

# The first 30% of the rows of `data` form the test split; the remaining rows are the training documents.
testidx = int(data.shape[0]*0.3)
ty = data[emotions].iloc[:testidx]
y = data[emotions].iloc[testidx:]

# Word nodes get all-zero label rows, stacked underneath the training documents' labels.
zy = np.zeros([nwords,len(emotions)])
ally = np.vstack([np.array(y),zy])

# Placeholder features: the first n_features columns of an identity matrix with one row
# per node. np.eye builds that rectangular slice directly, instead of allocating the full
# square identity matrix and slicing it afterwards. The column count is capped at the row
# count so the result matches the sliced-identity shape even for tiny inputs.
tx = np.eye(testidx, min(testidx, n_features))
x = np.eye(y.shape[0], min(y.shape[0], n_features))
allx = np.eye(ally.shape[0], min(ally.shape[0], n_features)) # one row per row of `ally` (train docs + words)

print(x.shape)
print(tx.shape)
print(allx.shape)
print(y.shape)
print(ty.shape)
print(ally.shape)
print(graph.shape)

# Test documents are the leading rows, so their indices are simply 0..testidx-1.
test_index = list(range(testidx))

(30112, 100)
(12905, 100)
(55731, 100)
(30112, 28)
(12905, 28)
(55731, 28)
(68636, 68636)


In [None]:
# Check the container type produced by the csr_array conversion that is applied to the graph when it is pickled.
type(sparse.csr_array(graph))


scipy.sparse._arrays.csr_array

In [None]:
import pickle
# Pickle every artifact to its own file, in a fixed order.
# The graph is converted to a CSR sparse array before it is written.
artifacts = [
    ('gcn_v2/data/ind.goemo.test.index', test_index),
    ('gcn_v2/data/ind.goemo.x', x),
    ('gcn_v2/data/ind.goemo.tx', tx),
    ('gcn_v2/data/ind.goemo.allx', allx),
    ('gcn_v2/data/ind.goemo.y', y),
    ('gcn_v2/data/ind.goemo.ty', ty),
    ('gcn_v2/data/ind.goemo.ally', ally),
    ('gcn_v2/data/ind.goemo.graph', sparse.csr_array(graph)),
]
for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)
