In [53]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors
import numpy as np
from experiment_baseplate import load_split_data
import pickle

In [57]:
# Prepare Embeddings
def load_glove_model(File):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is taken
    as the token and the remaining fields as the vector components.

    Parameters
    ----------
    File : str or path-like
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each token (``str``) to a 1-D ``numpy.float64`` array.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to a float.
    """
    print("Loading Glove Model")
    glove_model = {}
    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a stray newline at the end of the file):
                # there is no token to index, so skip it rather than crash.
                continue
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Build the notebook-wide token -> vector lookup from the Twitter GloVe file
glove_vectors = load_glove_model(File='pretrained/glove/glove.twitter.27B.200d.txt')

Loading Glove Model
1193514 words loaded!


In [36]:
def get_sentence_embedding(sentence, embeddings=None, dim=200):
    """Return the mean word vector of ``sentence``.

    The sentence is tokenised with ``simple_preprocess``; tokens that are
    not present in the embedding table are ignored.

    Parameters
    ----------
    sentence : str
        Text to embed.
    embeddings : mapping, optional
        Token -> vector lookup supporting ``in`` and ``[]``. When omitted,
        the notebook-level ``glove_vectors`` table is used.
    dim : int, optional
        Length of the zero vector returned when no token of the sentence is
        found. It should equal the dimensionality of the vectors in
        ``embeddings``; the default of 200 presumably matches the ``200d``
        GloVe file loaded by this notebook.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched token vectors, or ``np.zeros(dim)``
        when nothing matched.
    """
    if embeddings is None:
        # Resolved at call time so the default follows the current global table.
        embeddings = glove_vectors

    matched_vectors = [
        embeddings[token]
        for token in simple_preprocess(sentence)
        if token in embeddings
    ]
    if matched_vectors:
        return np.mean(matched_vectors, axis=0)
    # No known token: fall back to a neutral all-zero vector.
    return np.zeros(dim)

In [58]:
# Fetch the train / validation / test partitions
X_train, y_train, X_validate, y_validate, X_test, y_test = load_split_data()

# Swap every sentence for its averaged embedding, one array per partition.
# The tuple of raw partitions is captured before any name is rebound.
X_train, X_validate, X_test = (
    np.array([get_sentence_embedding(text) for text in partition])
    for partition in (X_train, X_validate, X_test)
)

In [59]:
# Fix the seed so a fresh "Restart & Run All" rebuilds the same tree and
# reports the same score.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
print("Training...")
clf.fit(X_train, y_train)
print("Training finished...")

# Evaluate the accuracy of the classifier on the validation split
accuracy = clf.score(X_validate, y_validate)

# Print the accuracy
print("Accuracy:", accuracy)

# Qualitative spot-check on a handful of hand-written sentences
print(clf.predict([get_sentence_embedding(s) for s in ["You are shit", "You are in the shit", "You are an eye sore", "I don't like you", "I enjoy your company", "You are biatch", "You biiitch"]]))

Training...
Training finished...
Accuracy: 0.9043906698266184
[[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]


In [61]:
# Fix the seed so the randomly grown forest, and therefore the reported
# score and the pickled model, are reproducible across runs.
clf2 = RandomForestClassifier(random_state=42)

# Train the classifier on the training data
print("Training...")
clf2.fit(X_train, y_train)
print("Training finished...")

# Evaluate the accuracy of the classifier on the validation split
accuracy = clf2.score(X_validate, y_validate)

# Print the accuracy
print("Accuracy:", accuracy)

# Qualitative spot-check on a handful of hand-written sentences
print(clf2.predict([get_sentence_embedding(s) for s in ["You are shit", "You are in the shit", "You are an eye sore", "I don't like you", "I enjoy your company", "You are biatch", "You biiitch"]]))

Training...
Training finished...
Accuracy: 0.9409380067356866
[[0 1]
 [1 0]
 [1 0]
 [0 1]
 [0 1]
 [0 1]
 [0 1]]


In [50]:
def predict(sentence, model):
    """Embed ``sentence`` and return ``model.predict`` for that single sample."""
    features = get_sentence_embedding(sentence)
    # The vector is wrapped in a one-item list so the model receives a batch of one.
    return model.predict([features])

In [108]:
# Spot-check the random forest on a single sentence
predict(sentence="I am telling nothing to worry about. It is alright.", model=clf2)

array([[1, 0]])

In [63]:
filename = 'rdf_glovetwitt.sav'
# Persist the trained random forest. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(clf2, model_file)

0.9339528501933392
