In [1]:
import os, glob, string, re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import metrics
# import matplotlib.pyplot as plt
# import seaborn as sns

Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from numpy.random import seed
seed(9)
from tensorflow import set_random_seed
set_random_seed(9)

In [3]:
classes = ["student", "faculty", "staff", "department", "course", "project", "other"]
class_index = dict((c, i) for i, c in enumerate(classes))

In [4]:
print(class_index)

{'student': 0, 'faculty': 1, 'staff': 2, 'department': 3, 'course': 4, 'project': 5, 'other': 6}


In [5]:
"""
Create list of all files, along with their classes
all_files varia
all_files : {"student": [["file_path"],["file_path"]], "course": [["file_path2", "file_path3"]]}
"""
all_files = {}
path = "data/raw/webkb/"
all_folders = os.listdir(path)
for clz in all_folders:
    if clz.startswith('.'):
        continue
    if clz not in all_files:
        all_files[clz] = []
    path_with_clz = path + clz + '/'
    all_univs = os.listdir(path_with_clz)
    for univ in all_univs:
        if univ.startswith('.'):
            continue
        path_with_univs = path_with_clz + univ + '/'
        all_files[clz].append(glob.glob(os.path.join(path_with_univs, '*')))

In [6]:
# print(all_files["department"])

In [7]:
"""
to take only sample of all files
"""
short_all_files = {}
max_count = 2
print(all_files.keys())
for k, v in all_files.items():
    if k not in short_all_files:
            short_all_files[k] = []
    short_all_files[k] = v[:max_count]

dict_keys(['faculty', 'course', 'other', 'student', 'department', 'project', 'staff'])


In [8]:
# print(short_all_files['student'][:3])

In [9]:
read_local = True

In [10]:
if not read_local:
    raw = []
    for k, v in all_files.items():
        for fnames in v:
            for fs in fnames:
                with open(fs, 'rb') as f:
                    raw_data = f.read()
                    raw.append([raw_data, class_index[k]])

    raw_df = pd.DataFrame(raw, columns=["text", "Class"])
else:
    raw_df = pd.read_csv('raw.csv')

In [11]:
print(raw_df.shape)

(8282, 2)


In [12]:
# raw_df.to_csv('raw.csv', header=True, index=False)

In [13]:
no_features = 100
lemmatizer = WordNetLemmatizer()
tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
tfidf_vectorizer2 = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features)
tfidf_vectorizer3 = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2, max_features=no_features)
tf_vectorizer = CountVectorizer(max_features=no_features)
stopset = set(stopwords.words('english'))

In [22]:
def process(txt):
    cleantext = BeautifulSoup(txt, "lxml").text
    tokens = []
    for token in wordpunct_tokenize(cleantext):
        if token.isdigit():
            continue
        if all(char in string.punctuation for char in token):
            continue
        
        token = token.lower()
        token = token.strip()  # Strip whitespace and other punctuations
        token = token.strip('_')  # remove _ if any
        token = token.strip('*')
        if token in stopset:
            continue
        tokens.append(token)
        lemmatizer.lemmatize(token)
#     x = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [23]:
def vectorize(txt, vectorizer):
    X = vectorizer.fit_transform(txt)
    return X

In [24]:
def tf_vectorization(txt):
    X_tf = tf_vectorizer.fit_transform(txt)
    return X_tf

In [25]:
# df = raw_df.sample(frac=0.1, replace=True)
df = raw_df
df['processed_text'] = df['text'].apply(process)
df['processed_text_cnct'] = df['processed_text'].apply(lambda tokens: ' '.join(str(v) for v in tokens))

In [26]:
class_counts = df.groupby(['Class']).size()
print(class_counts)
print(df.shape)
# class_counts_raw = raw_df.groupby(['Class']).size()
# print(class_counts_raw)

Class
0    1641
1    1124
2     137
3     182
4     930
5     504
6    3764
dtype: int64
(8282, 4)


In [None]:
vocab_size = 20000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['processed_text_cnct'])
sequences = tokenizer.texts_to_sequences(df['processed_text_cnct'])

In [None]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
avg = sum( map(len, sequences) ) / len(sequences)
std = np.sqrt(sum( map(lambda x: (len(x) - avg)**2, sequences)) / len(sequences))

print(avg,std)

In [None]:
# Average text is 225 in length, let's restrict sequence length to 150 words.
max_length = 100

In [None]:
data = pad_sequences(sequences, maxlen=max_length)

In [None]:
labels = to_categorical(np.asarray(df['Class']))
print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)

In [None]:
data[1, :]

In [None]:
X = data
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(7))
model.add(Activation('softmax'))

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[metrics.mae, metrics.categorical_accuracy])

In [None]:
batch_size = 1000
model.fit(X_train, y_train, batch_size=batch_size, epochs=20, validation_split=0.1)

In [None]:
y_pred = model.predict(X_test)

In [None]:
X_tf = vectorize(df['processed_text_cnct'], tfidf_vectorizer2)

no_topics = 15
num_iter = 5

lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tf)

lda_x = lda.transform(X_tf)
print(lda_x.shape)

tf_feature_names = tfidf_vectorizer2.get_feature_names()

In [None]:
no_top_words = 10
topic_indices = []
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        print(" ".join(top_features))

display_topics(lda, tf_feature_names, no_top_words)

In [27]:
X_tfidf = vectorize(df['processed_text_cnct'], tfidf_vectorizer3)
X = X_tfidf
# X = X.todense()
y = df['Class']
# X = lda_x

In [28]:
print(X.shape)

(8282, 100)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)

In [30]:
scaler = preprocessing.StandardScaler(with_mean = False).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
for i in range(7):
    print("#{} in train set: {}".format(i, len(y_train[y_train == i])))
    print("#{} in test set: {}".format(i, len(y_test[y_test == i])))

#0 in train set: 1231
#0 in test set: 410
#1 in train set: 843
#1 in test set: 281
#2 in train set: 103
#2 in test set: 34
#3 in train set: 136
#3 in test set: 46
#4 in train set: 697
#4 in test set: 233
#5 in train set: 378
#5 in test set: 126
#6 in train set: 2823
#6 in test set: 941


In [None]:
scoring = ['precision_macro', 'recall_macro']
estimators = []
clf1 = MultinomialNB()
estimators.append(clf1)
clf2 = svm.SVC(C=100, kernel='linear')
estimators.append(clf2)
eclf = VotingClassifier(estimators=[('nb', clf1), ('svm', clf2)], voting='hard')
for clf, label in zip([clf1, clf2, eclf], ['Naive Bayes', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1_micro')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
# clf.fit(X_train, y_train)

In [43]:
# clf = svm.SVC(C=100, kernel='linear')
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
print(clf.n_support_)

AttributeError: 'MultinomialNB' object has no attribute 'n_support_'

In [45]:
y_pred = clf.predict(X_test)

In [46]:
y_true = y_test

In [47]:
cm = confusion_matrix(y_true, y_pred)
# cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)
cm_np = np.asarray(cm)

[[283  47  18  15   5  16  26]
 [ 52 163   5  17   5  33   6]
 [ 15   5   5   1   1   4   3]
 [  2   2   2  38   0   0   2]
 [ 19   8   1   4 163  11  27]
 [  8  11   0   5   5  86  11]
 [ 89  45   4  44  78 108 573]]


In [48]:
TP = np.diag(cm_np)
print(TP)

[283 163   5  38 163  86 573]


In [49]:
FP = np.sum(cm, axis=0) - TP
print(FP)

[185 118  30  86  94 172  75]


In [50]:
FN = np.sum(cm, axis=1) - TP
print(FN)

[127 118  29   8  70  40 368]


In [51]:
num_classes = 7
TN = []
for i in range(num_classes):
    temp = np.delete(cm, i, 0)    # delete ith row
    temp = np.delete(temp, i, 1)  # delete ith column
    TN.append(sum(sum(temp)))
print(TN)

[1476, 1672, 2007, 1939, 1744, 1773, 1055]


In [52]:
prec = TP/(TP+FP)
rec = TP/(TP+FN)
acc = (TP+TN)/(TP+FP+TN+FN)
f1 = 2*prec*rec/(prec+rec)

print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)

accuracy [0.84934814 0.88604539 0.97151135 0.9546113  0.9208112  0.89763399
 0.78609367]
precision [0.60470085 0.58007117 0.14285714 0.30645161 0.63424125 0.33333333
 0.88425926]
recall [0.6902439  0.58007117 0.14705882 0.82608696 0.69957082 0.68253968
 0.60892667]
f1 [0.64464692 0.58007117 0.14492754 0.44705882 0.66530612 0.44791667
 0.72120831]


In [None]:
def plot_eval_metrics(x, y, y2, y3, y4, img_name):
    fig, ax = plt.subplots()
    y_label = "value"
    x_label = "classes"
    plt.plot(x, y, color='b', marker='o', label="Accuracy")
    plt.plot(x, y2, color='g', marker='+', label="Precision")
    plt.plot(x, y3, color='y', marker='x', label="Recall")
    plt.plot(x, y4, color='r', marker='s', label="F1-measure")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
#     plt.scatter(x, y, label="Accuracy")
#     plt.scatter(x, y2, label="Precision")
#     plt.scatter(x, y3, label="Recall")
#     plt.scatter(x, y4, label="F1-measure")
    plt.show()
    fig.savefig(img_name)
#     plt.close()

In [None]:
# plot_eval_metrics(classes, acc, prec, rec, f1, "tfidf-mNB.png")