In [None]:
import pandas
from sklearn import svm, metrics, model_selection
import numpy
import csv
import re

In [None]:
# Load the pre-trained GloVe vectors into a dict mapping token -> list of
# floats. Each line is parsed as "<token> <v1> <v2> ..." split on single spaces.
embeddings = {}
# Explicit UTF-8: without it the platform default encoding is used, which can
# fail on (or mangle) non-ASCII vocabulary entries.
with open('data/glove.twitter.27B.25d.txt', encoding='utf-8') as fh:
    for line in fh:
        # rstrip() rather than strip(): only the trailing newline/whitespace is
        # removed, so a token that is itself a whitespace character is kept
        # instead of being stripped (which would turn the first number into the key).
        token, *values = line.rstrip().split(" ")
        embeddings[token] = [float(v) for v in values]

In [None]:
def normalise(text):
    """Lower-case ``text`` and replace special constructs with placeholder tags.

    The substitutions run in this order: newlines -> space, URLs -> ``<url>``,
    slashes padded with spaces, ``@name`` -> ``<user>``, emoticons ->
    ``<smile>`` / ``<lolface>`` / ``<sadface>`` / ``<neutralface>``,
    ``<3`` -> ``<heart>``, numbers -> ``<number>``, ``#`` -> ``<hashtag>``,
    and finally runs of ``!?.,()`` are collapsed to their last character and
    padded with spaces. Every tag is surrounded by spaces so that a later
    whitespace split yields it as a separate token.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string (may contain repeated spaces).
    """
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " <url> ", text)
    text = text.replace("/", " / ")
    text = re.sub(r"@\w+", " <user> ", text)
    # Emoticons are eyes + nose + mouth; the nose character is required here,
    # so nose-less forms such as ":)" are not tagged.
    # The mirrored smile has an opening bracket as its mouth: "(-:". Using ")"
    # there would tag the mirrored *sad* face ")-:" as a smile before the
    # sadface rule below could see it.
    text = re.sub(r"[8:=;]['`\-][)d]+|[(d]+['`\-][8:=;]", " <smile> ", text)
    text = re.sub(r"[8:=;]['`\-]p+", " <lolface> ", text)
    text = re.sub(r"[8:=;]['`\-]\(+|\)+['`\-][8:=;]", " <sadface> ", text)
    text = re.sub(r"[8:=;]['`\-][\/|l*]", " <neutralface> ", text)
    # Must run before the number rule, which would otherwise consume the "3".
    text = text.replace(r"<3", " <heart> ")
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> ", text)
    text = text.replace(r"#", " <hashtag> ")
    # A repeated group keeps only its last repetition, so "!!!" becomes " ! ".
    text = re.sub(r"([!?.,()])+", r" \1 ", text)

    return text

In [None]:
def tokenise(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    Leading and trailing whitespace produce no empty tokens.
    """
    return text.split()

In [None]:
def calculate_embeddings(text):
    """Return the mean embedding vector of the known tokens in ``text``.

    The text is normalised and tokenised, each token is looked up in the
    module-level ``embeddings`` dict, and tokens without an entry are skipped.

    Args:
        text: The raw string to embed.

    Returns:
        A numpy array holding the element-wise mean of the token vectors.
        If no token has an embedding, the input is printed for inspection and
        a zero vector of the embedding dimensionality is returned.
    """
    tokens = tokenise(normalise(text))
    looked_up = (embeddings.get(t) for t in tokens)
    token_embdgs = [vec for vec in looked_up if vec is not None]
    if len(token_embdgs) == 0:
        print(text, tokens, token_embdgs)
        # numpy.mean of an empty list yields a scalar NaN (plus a warning),
        # which cannot be stacked with the other feature vectors. Fall back to
        # a zero vector with the same length as the loaded embeddings.
        dim = len(next(iter(embeddings.values()), ()))
        return numpy.zeros(dim)
    return numpy.mean(token_embdgs, axis=0)

In [None]:
# Build the model inputs: one embedding vector per row of the annotated CSV
# (column "text") and the matching label (column "sentiment").
features = []
labels = []

# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative or configurable so the notebook runs elsewhere.
# newline="" is what the csv module requires so that line breaks inside quoted
# fields are parsed correctly; the encoding is given explicitly (assumed to be
# UTF-8 - confirm) instead of relying on the platform default.
with open(
    "/home/johannes/talk3/Talk3_extension/data/twitter_annotated_full.csv",
    newline="",
    encoding="utf-8",
) as fh:
    reader = csv.DictReader(fh)
    for blop in reader:
        labels.append(blop["sentiment"])
        features.append(calculate_embeddings(blop["text"]))

In [None]:
# Display the distinct values collected from the "sentiment" column.
set(labels)

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state makes the
# split identical on every run.
(features_train, features_test,
 labels_train, labels_test) = model_selection.train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the SVM search: one sub-grid per kernel.
# All kernels share the same candidate values for C; keeping them in one place
# avoids repeated values, which would make the search fit and report the same
# configuration more than once.
c_values = [0.1, 1, 10, 100, 1000, 10000, 100000]
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': list(c_values)},
                    {'kernel': ['linear'], 'C': list(c_values)},
                    {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'coef0': [0, 1], 'C': list(c_values)}]
# Metric name(s) to optimise; the search loop turns each into "<name>_macro".
scores = ['f1']

In [None]:
# For each metric in `scores`: run a cross-validated grid search over
# `tuned_parameters` on the training split, print the best parameters and the
# per-configuration CV scores, then print a classification report for the
# held-out test split.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # SVC with class_weight="balanced", evaluated with 10-fold stratified CV;
    # the scorer name is "<score>_macro" and n_jobs=-1 runs fits in parallel.
    clf = model_selection.GridSearchCV(svm.SVC(class_weight="balanced"), tuned_parameters, cv=model_selection.StratifiedKFold(n_splits=10),
                       scoring='%s_macro' % score, n_jobs=-1)
    clf.fit(features_train, labels_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per parameter combination: mean CV score and twice the
    # standard deviation across folds.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # predict() on the search object uses the best estimator found; with
    # GridSearchCV's default refit=True it has been refit on all of
    # features_train.
    y_true, y_pred = labels_test, clf.predict(features_test)
    print(metrics.classification_report(y_true, y_pred))
    print()