In [186]:
import os, glob, string, re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import metrics
# import matplotlib.pyplot as plt
# import seaborn as sns

In [187]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from numpy.random import seed
seed(9)
from tensorflow import set_random_seed
set_random_seed(9)

In [188]:
classes = ["student", "faculty", "staff", "department", "course", "project", "other"]
class_index = dict((c, i) for i, c in enumerate(classes))

In [189]:
print(class_index)

{'student': 0, 'faculty': 1, 'staff': 2, 'department': 3, 'course': 4, 'project': 5, 'other': 6}


In [190]:
"""
Create list of all files, along with their classes
all_files varia
all_files : {"student": [["file_path"],["file_path"]], "course": [["file_path2", "file_path3"]]}
"""
all_files = {}
path = "data/raw/webkb/"
all_folders = os.listdir(path)
for clz in all_folders:
    if clz.startswith('.'):
        continue
    if clz not in all_files:
        all_files[clz] = []
    path_with_clz = path + clz + '/'
    all_univs = os.listdir(path_with_clz)
    for univ in all_univs:
        if univ.startswith('.'):
            continue
        path_with_univs = path_with_clz + univ + '/'
        all_files[clz].append(glob.glob(os.path.join(path_with_univs, '*')))

In [191]:
# print(all_files["department"])

In [192]:
"""
to take only sample of all files
"""
short_all_files = {}
max_count = 2
print(all_files.keys())
for k, v in all_files.items():
    if k not in short_all_files:
            short_all_files[k] = []
    short_all_files[k] = v[:max_count]

dict_keys(['faculty', 'course', 'other', 'student', 'department', 'project', 'staff'])


In [193]:
# print(short_all_files['student'][:3])

In [194]:
read_local = False

In [195]:
if not read_local:
    raw = []
    for k, v in all_files.items():
        for fnames in v:
            for fs in fnames:
                with open(fs, 'rb') as f:
                    raw_data = f.read()
                    raw.append([raw_data, class_index[k]])

    raw_df = pd.DataFrame(raw, columns=["text", "Class"])
else:
    raw_df = pd.read_csv('raw.csv')

In [196]:
print(raw_df.shape)

(8282, 2)


In [197]:
# raw_df.to_csv('raw.csv', header=True, index=False)

In [198]:
no_features = 100
lemmatizer = WordNetLemmatizer()
tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
tfidf_vectorizer2 = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tfidf_vectorizer3 = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2, max_features=no_features)
tf_vectorizer = CountVectorizer(max_features=no_features)
stopset = set(stopwords.words('english'))

In [199]:
def process(txt):
    cleantext = BeautifulSoup(txt, "lxml").text
    tokens = []
    for token in wordpunct_tokenize(cleantext):
        if token.isdigit():
            continue
        if all(char in string.punctuation for char in token):
            continue
        
        token = token.lower()
        token = token.strip()  # Strip whitespace and other punctuations
        token = token.strip('_')  # remove _ if any
        token = token.strip('*')
        if token in stopset:
            continue
        tokens.append(token)
        lemmatizer.lemmatize(token)
#     x = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [200]:
def vectorize(txt, vectorizer):
    X = vectorizer.fit_transform(txt)
    return X

In [201]:
def tf_vectorization(txt):
    X_tf = tf_vectorizer.fit_transform(txt)
    return X_tf

In [202]:
# df = raw_df.sample(frac=0.1, replace=True)
df = raw_df
df['processed_text'] = df['text'].apply(process)
df['processed_text_cnct'] = df['processed_text'].apply(lambda tokens: ' '.join(str(v) for v in tokens))

In [203]:
# df = df[(df['Class'] ==  0) | (df['Class'] ==  1) | (df['Class'] ==  4) | (df['Class'] ==  5) | (df['Class'] ==  6)]

In [204]:
class_counts = df.groupby(['Class']).size()
print(class_counts)
print(df.shape)
# class_counts_raw = raw_df.groupby(['Class']).size()
# print(class_counts_raw)

Class
0    1641
1    1124
2     137
3     182
4     930
5     504
6    3764
dtype: int64
(8282, 4)


In [205]:
vocab_size = 30000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['processed_text_cnct'])
sequences = tokenizer.texts_to_sequences(df['processed_text_cnct'])

In [206]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 77428 unique tokens.


In [207]:
avg = sum( map(len, sequences) ) / len(sequences)
std = np.sqrt(sum( map(lambda x: (len(x) - avg)**2, sequences)) / len(sequences))

print(avg,std)

337.04503743057234 934.8175976412838


In [208]:
# Average text is 300 in length, let's restrict sequence length to 150 words.
max_length = 100

In [209]:
data = pad_sequences(sequences, maxlen=max_length)

In [210]:
labels = to_categorical(np.asarray(df['Class']))
print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)

Shape of data: (8282, 100)
Shape of labels: (8282, 7)


In [211]:
data[1, :]

array([14851,  4130,   610,   578,     7,  1142,   245,   683,   829,
        4917,  4918,    28,  4917,  4918,  1350,   497,     6,    35,
         195,   207,    28,     2,    72,   487,  1790,    72,   487,
         497,   709,  4299,     2,    68,   709,   284,     3,    68,
          84,   376,    28,     2,  2306,  3034,  2306,    80,   250,
         709,  1944,     3,    68,    42,  1637,  2684,   306,    13,
          33,     3,   245,     6,   733, 21662,     6,    13,    33,
         198,     3,   668,   383,   846,     5, 19290,     3,    89,
         151,   146,   262,     6,    13,    33,  1218,  1328,   480,
       19289,    57,     6,  3109,    35,   195,     6,    57,     3,
         575,   499,   846, 16019, 14852,  7609, 12373,   925,  1218,
        1328], dtype=int32)

In [212]:
glove_dir = 'data/glove'
embeddings_index = {} # We create a dictionary of word -> embedding
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt')) # Open file
# In the dataset, each line represents a new word embedding
# The line starts with the word and the embedding values follow
for line in f:
    values = line.split()
    word = values[0] # The first value is the word, the rest are the values of the embedding
    embedding = np.asarray(values[1:], dtype='float32') # Load embedding
    embeddings_index[word] = embedding # Add embedding to our embedding dictionary
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [213]:
# Create a matrix of all embeddings
all_embs = np.stack(embeddings_index.values())
emb_mean = all_embs.mean() # Calculate mean
emb_std = all_embs.std() # Calculate standard deviation
emb_mean,emb_std

(0.004451992, 0.4081574)

In [214]:
embedding_dim = 100 # We use 250 dimensional glove vectors

In [215]:
nb_words = min(vocab_size, len(word_index)) # How many words are there actually

# Create a random matrix with the same mean and std as the embeddings
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_dim))

# The vectors need to be in the same position as their index. 
# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on

# Loop over all words in the word index
for word, i in word_index.items():
    # If we are above the amount of words we want to use we do nothing
    if i >= vocab_size: 
        continue
    # Get the embedding vector for the word
    embedding_vector = embeddings_index.get(word)
    # If there is an embedding vector, put it in the embedding matrix
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [216]:
X = data
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [217]:
def counts(y):
    counts = {}
    for row in y:
        for idx, label_val in enumerate(row):
            if idx not in counts:
                counts[idx] = 0
            if label_val == 1:
                counts[idx] += 1
    return counts

In [218]:
print(counts(y_train))
print(counts(y_test))

{0: 1305, 1: 887, 2: 105, 3: 147, 4: 760, 5: 408, 6: 3013}
{0: 336, 1: 237, 2: 32, 3: 35, 4: 170, 5: 96, 6: 751}


In [219]:
model = Sequential()
# model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights = [embedding_matrix], trainable = False))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(7))
model.add(Activation('softmax'))

In [220]:
embedding_matrix.shape

(30000, 100)

In [221]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          3000000   
_________________________________________________________________
lstm_13 (LSTM)               (None, 100, 64)           42240     
_________________________________________________________________
dropout_13 (Dropout)         (None, 100, 64)           0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dropout_14 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 7)                 455       
_________________________________________________________________
activation_7 (Activation)    (None, 7)                 0         
Total para

In [222]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[metrics.mae, metrics.categorical_accuracy])

In [223]:
batch_size = 1000
model.fit(X_train, y_train, batch_size=batch_size, epochs=25, validation_split=0.1)

Train on 5962 samples, validate on 663 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x130032c50>

In [228]:
y_pred = model.predict(X_test)

In [229]:
y_pred_mod = []
for row in y_pred:
    max_val = max(row)
    mod_label = []
    for label_val in row:
        if label_val < max_val:
            mod_label.append(0.)
        else:
            mod_label.append(1.)
    y_pred_mod.append(mod_label)

In [230]:
print(y_pred[:1])
print(y_pred_mod[:1])
print(y_test[:1])
print(counts(y_test))

[[0.06001716 0.67120445 0.01769655 0.02491975 0.02603431 0.04097567
  0.1591521 ]]
[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
[[0. 1. 0. 0. 0. 0. 0.]]
{0: 336, 1: 237, 2: 32, 3: 35, 4: 170, 5: 96, 6: 751}


In [231]:
# confusion_matrix(y_pred_mod, y_test)
# cm = confusion_matrix(y_test.argmax(axis=1), np.array(y_pred_mod).argmax(axis=1))
c_matrix(y_test, np.array(y_pred_mod), num_classes=7)

[[213  40   0   0   6   2  75]
 [ 39 125   0   5  13   4  51]
 [ 16   5   0   0   0   1  10]
 [  2   2   0  15   4   0  12]
 [  4   2   0   2 111   1  50]
 [  3   6   0   1   2  13  71]
 [ 27  29   0   3  75   5 612]]
[ 91  84   0  11 100  13 269]
accuracy [0.87085094 0.88171394 0.98068799 0.98129149 0.90404345 0.94206397
 0.75377188]
precision [0.70065789 0.59808612        nan 0.57692308 0.52606635 0.5
 0.69466515]
recall [0.63392857 0.52742616 0.         0.42857143 0.65294118 0.13541667
 0.81491345]
f1 [0.665625   0.56053812        nan 0.49180328 0.58267717 0.21311475
 0.75      ]




(array([0.70065789, 0.59808612,        nan, 0.57692308, 0.52606635,
        0.5       , 0.69466515]),
 array([0.63392857, 0.52742616, 0.        , 0.42857143, 0.65294118,
        0.13541667, 0.81491345]),
 array([0.87085094, 0.88171394, 0.98068799, 0.98129149, 0.90404345,
        0.94206397, 0.75377188]),
 array([0.665625  , 0.56053812,        nan, 0.49180328, 0.58267717,
        0.21311475, 0.75      ]))

In [70]:
X_tf = vectorize(df['processed_text_cnct'], tfidf_vectorizer3)

no_topics = 5
num_iter = 5

lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tf)

lda_x = lda.transform(X_tf)
print(lda_x.shape)

tf_feature_names = tfidf_vectorizer3.get_feature_names()

(8282, 5)


In [121]:
no_top_words = 10
topic_indices = []
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        print(" ".join(top_features))

display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
computer science university computer science page home home page research department ni
Topic 1:
nthe program one class use file course code problem time
Topic 2:
ndate version length nlast html ncontent jan gmt ncontent nlast modified length ncontent length
Topic 3:
nserver ncsa ncsa tue date ncsa ncontent gmt nserver html nlast page nov date tue
Topic 4:
systems research parallel system computer software project data design distributed


In [138]:
X_tfidf = vectorize(df['processed_text_cnct'], tfidf_vectorizer3)
X = X_tfidf
# X = X.todense()
y = df['Class']
# X = lda_x

In [139]:
print(X.shape)

(7963, 100)


In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)

In [141]:
scaler = preprocessing.StandardScaler(with_mean = False).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [142]:
for i in range(7):
    print("#{} in train set: {}".format(i, len(y_train[y_train == i])))
    print("#{} in test set: {}".format(i, len(y_test[y_test == i])))

#0 in train set: 1231
#0 in test set: 410
#1 in train set: 843
#1 in test set: 281
#2 in train set: 0
#2 in test set: 0
#3 in train set: 0
#3 in test set: 0
#4 in train set: 697
#4 in test set: 233
#5 in train set: 378
#5 in test set: 126
#6 in train set: 2823
#6 in test set: 941


In [153]:
scoring = ['precision_macro', 'recall_macro']
estimators = []
clf1 = MultinomialNB()
estimators.append(clf1)
clf2 = svm.SVC(C=100, kernel='linear')
estimators.append(clf2)
clf3 = svm.SVC(C=100, kernel='rbf', gamma=0.01)
estimators.append(clf3)
eclf = VotingClassifier(estimators=[('nb', clf1), ('svml', clf2), ('svmr', clf3)], voting='hard')
for clf, label in zip([clf1, clf2, clf3, eclf], ['Naive Bayes', 'SVM Linear', 'SVM RBF', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1_micro')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.64 (+/- 0.02) [Naive Bayes]
Accuracy: 0.74 (+/- 0.04) [SVM Linear]
Accuracy: 0.75 (+/- 0.04) [SVM RBF]
Accuracy: 0.74 (+/- 0.04) [Ensemble]


NameError: name 'grid' is not defined

In [154]:
# clf = svm.SVC(C=100, kernel='linear')
# clf = MultinomialNB()
clf = eclf
clf.fit(X_train, y_train)

VotingClassifier(estimators=[('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)), ('svml', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [144]:
# print(clf.n_support_)

[573 478 366 271 837]


In [155]:
y_pred = clf.predict(X_test)

In [156]:
y_true = y_test

In [172]:
def c_matrix(y_true, y_pred, num_classes=7):
    cm = confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    print(cm)
    cm_np = np.asarray(cm)
    TP = np.diag(cm_np)
#     print(TP)
    FP = np.sum(cm, axis=0) - TP
    print(FP)
    FN = np.sum(cm, axis=1) - TP
#     print(FN)
    TN = []
    for i in range(num_classes):
        temp = np.delete(cm, i, 0)    # delete ith row
        temp = np.delete(temp, i, 1)  # delete ith column
        TN.append(sum(sum(temp)))
#     print(TN)
    prec = TP/(TP+FP)
    rec = TP/(TP+FN)
    acc = (TP+TN)/(TP+FP+TN+FN)
    f1 = 2*prec*rec/(prec+rec)

    print("accuracy", acc)
    print("precision", prec)
    print("recall", rec)
    print("f1", f1)
    return prec, rec, acc, f1

In [173]:
# cm = confusion_matrix(y_true, y_pred)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
cm_np = np.asarray(cm)

[[  2  15   0   0  73   0 246]
 [  1   5   0   1  62   0 168]
 [  0   1   0   0  10   0  21]
 [  0   1   0   0  11   0  23]
 [  1   8   0   0  45   0 116]
 [  1   4   0   0  29   0  62]
 [  7  23   1   0 209   0 511]]


In [240]:
TP = np.diag(cm_np)
print(TP)

[ 670  439    4   59  354  108 1681]


In [241]:
FP = np.sum(cm, axis=0) - TP
print(FP)

[-658 -382   -3  -58   85 -108 -534]


In [254]:
FN = np.sum(cm, axis=1) - TP
print(FN)

[ 60  40  14  10  49  33 120]


In [255]:
num_classes = 7
TN = []
for i in range(num_classes):
    temp = np.delete(cm, i, 0)    # delete ith row
    temp = np.delete(temp, i, 1)  # delete ith column
    TN.append(sum(sum(temp)))
print(TN)

[592, 644, 813, 804, 699, 738, 358]


In [256]:
prec = TP/(TP+FP)
rec = TP/(TP+FN)
acc = (TP+TN)/(TP+FP+TN+FN)
f1 = 2*prec*rec/(prec+rec)

print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)

accuracy [0.83956574 0.86489747 0.98069964 0.97949337 0.89626055 0.91073583
 0.74185766]
precision [0.58757062 0.50344828 0.         0.53333333 0.54320988 0.29310345
 0.73219373]
recall [0.63414634 0.6460177  0.         0.44444444 0.47311828 0.34
 0.68169761]
f1 [0.60997067 0.56589147        nan 0.48484848 0.50574713 0.31481481
 0.70604396]


  after removing the cwd from sys.path.


In [None]:
def plot_eval_metrics(x, y, y2, y3, y4, img_name):
    fig, ax = plt.subplots()
    y_label = "value"
    x_label = "classes"
    plt.plot(x, y, color='b', marker='o', label="Accuracy")
    plt.plot(x, y2, color='g', marker='+', label="Precision")
    plt.plot(x, y3, color='y', marker='x', label="Recall")
    plt.plot(x, y4, color='r', marker='s', label="F1-measure")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
#     plt.scatter(x, y, label="Accuracy")
#     plt.scatter(x, y2, label="Precision")
#     plt.scatter(x, y3, label="Recall")
#     plt.scatter(x, y4, label="F1-measure")
    plt.show()
    fig.savefig(img_name)
#     plt.close()

In [None]:
# plot_eval_metrics(classes, acc, prec, rec, f1, "tfidf-mNB.png")