In [93]:
import csv
import numpy as np
import sklearn.feature_extraction as sk
from sklearn.ensemble import RandomForestClassifier as RFC
import re

In [94]:
# Parse data from csv file
# [name] is name of file
# [hasLabels] is whether or not this csv includes labels (the test data does not)
# returns
#    list of strings containing tweet texts,
#    corresponding list of labels [1,-1] for tweets
def parse_file(name, hasLabels = True):
    text_list = []
    yTr = []

    with open(name) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        linecount = 0
        for row in csv_reader:
            if linecount == 0:
                #print(row)
                linecount += 1
            else:
                text = row[1]
                text_list.append(text)
                #if "@" in text:
                #    result = re.match(pattern, string)
                #    print("Yes")
                if hasLabels:
                    yTr.append(int(row[17]))
                linecount += 1
        print(f'Processed {linecount} lines.')
    return text_list, yTr

# Split lists for train-test-validation splitting
# splits lists [X] and [Y] into two separate shuffled lists each
# first set contains [frac] proportion of the elements
# returns: firstX, firstY, secondX, secondY
def split_data(frac, X, Y):
    n = len(X)
    sort = np.random.permutation(n)
    split = int(frac * n)
    Xsort = np.array(X)[sort]
    Ysort = np.array(Y)[sort]
    return Xsort[0:split], Ysort[0:split], Xsort[split:], Ysort[split:]

# Create submission csv with file name [name] for predicted values [predicted]
def write_submission(name, predicted):
    with open(name, mode='w') as submission:
        sub_writer = csv.writer(submission, delimiter=',', lineterminator='\n')
        sub_writer.writerow(['ID', 'Label'])

        for i in range(len(predicted)):
            sub_writer.writerow([i,predicted[i]])

# For pre-processing on the data set, element-wise modify each text string
def modify_text(text_list, yTr, mod_func):
    new_list = []
    for i in range(len(text_list)):
        new_list.append(mod_func(text_list[i], yTr[i]))
    return new_list

http_pattern = r".*http[^ \t\n\r\f\v]+[ \t\n\r\f\v]*"

# prune out http entirely
def prune_http(text_list, yTr):
    return modify_text(text_list, yTr, lambda s, y: re.sub(http_pattern, '', s))

# replace http with a special word
def replace_all_http(text_list, yTr):
    return modify_text(text_list, yTr, lambda s, y: re.sub(http_pattern, 'IS_EMBED', s))

# final decision on how to pre-process the data, replaced only the final hash chars of the shortened link
# https://t.co/... -> https://t.co/
def pre_process_text(text_list):
    return modify_text(train_text, yTr, lambda s, y: re.sub(r"https://t.co/\S+", 'https://t.co/', s))

# Test random forest on pre-computed training and testing data
# [trees] is th enumber of estimators
# [M] is the number of trials to perform
# [progress_output] determines whether or not to print output per trial
def test_random_forest(xTr_counts, yTr, xTe_counts, yTe, trees=500, M=1, progress_output = True):
    if yTe is None:
        print("Cannot test accuracy – no labels for testing data")
        return
    avg_acc = 0
    accuracies = []
    num_predictions = len(yTe)
    for i in range(M):
        H = RFC(n_estimators=trees).fit(xTr_counts, yTr)
        predicted = H.predict(xTe_counts)
        acc = np.sum(predicted == yTe)/num_predictions
        if progress_output:
            print("Trial {}: {}".format(i+1, acc))
        accuracies.append(acc)
    print("Mean: {}, Std.dev: {}".format(np.mean(accuracies), np.std(accuracies)))
    
# Test random forest on random splits of the training data
# [trees] is th enumber of estimators
# [M] is the number of trials to perform
# [progress_output] determines whether or not to print output per trial
# [split] is the proportion of training vs testing
def test_random_forest_full(trees=500, M=1, progress_output = True, split = 0.7):
    train_text, yTr = parse_file("train.csv")
    train_text = modify_text(train_text, yTr, lambda s, y: re.sub(r"https://t.co/\S+", 'https://t.co/', s))
    avg_acc = 0
    accuracies = []
    num_predictions = int((1 - split) * len(yTr))
    for i in range(M):
        train_text0, yTr0, test_text, yTe = split_data(split, train_text, yTr)
        count_vec = sk.text.TfidfVectorizer(analyzer="char",strip_accents="unicode", lowercase=False)#, token_pattern=r'\w{1,}')
        xTr_counts = count_vec.fit_transform(train_text0)
        xTe_counts = count_vec.transform(test_text)
        H = RFC(n_estimators=trees).fit(xTr_counts, yTr0)
        predicted = H.predict(xTe_counts)
        acc = np.sum(predicted == yTe)/num_predictions
        if progress_output:
            print("Trial {}: {}".format(i+1, acc))
        accuracies.append(acc)
    print("Mean: {}, Std.dev: {}".format(np.mean(accuracies), np.std(accuracies)))
    #return accuracies

# A classifier that produces predictions solely based on whether or not the tweet text contains a link
# our baseline that our true predictor must surpass
def cheater_classifier(text_list):
    predictions = []
    for i in range(len(text_list)):
        #print(text_list[i], "|", re.match(http_pattern, text_list[i]))
        predictions.append(-1 if re.match(http_pattern, text_list[i]) else 1)
    return predictions

In [95]:
# train on entire set, test on submission csv
train_text, yTr = parse_file('train.csv')
test_text, yTe = parse_file('test.csv', hasLabels = False)

Processed 1090 lines.
Processed 301 lines.


In [96]:
# train on 70% of set, test on 30%
train_text, yTr = parse_file("train.csv")
train_text, yTr, test_text, yTe = split_data(0.7, train_text, yTr)
#print(np.sum(cheater_classifier(test_text) == yTe)/300)

Processed 1090 lines.


In [97]:
# Create bag of words for training set

train_text0 = pre_process_text(train_text)
#train_text0 = replace_all_http(train_text0, yTr)
#train_text0 = modify_text(train_text0, yTr, lambda s, y: re.sub(r"https://t.co/\S+", 'https://t.co/', s))
#train_text0 = modify_text(train_text0, yTr, lambda s, y: re.sub(r"<\S+>", '∑', s))
#train_text0 = modify_text(train_text0, yTr, lambda s, y: re.sub(r' ', '', s))

#count_vec = sk.text.CountVectorizer(analyzer="char",strip_accents="unicode", lowercase=False)
count_vec = sk.text.TfidfVectorizer(analyzer="char",strip_accents="unicode", lowercase=False)#, token_pattern=r'\w{1,}')
xTr_counts = count_vec.fit_transform(train_text0)  # feature vector
xTr_counts.shape

sum_words = xTr_counts.sum(axis=0)

word_freq = [(word, sum_words[0,idx]) for (word,idx) in count_vec.vocabulary_.items()]
word_freq = sorted(word_freq, key=lambda x:x[1], reverse=True)  # count of each word overall

# Create bag of words for testing set
xTe_counts = count_vec.transform(test_text)  # feature vector

print(xTe_counts.shape)
print(word_freq)

(327, 84)
[(' ', 382.92226057491297), ('e', 199.4582485668008), ('t', 176.0743631154441), ('a', 159.2662491907815), ('o', 155.48287787719113), ('n', 130.6373422071858), ('i', 128.01291619805), ('r', 127.1384183017574), ('s', 103.19368654095491), ('l', 96.56467781008011), ('/', 89.06366513431713), ('h', 84.78415106861591), ('d', 71.06749671783851), ('u', 67.4725027637438), ('p', 65.06493621287572), ('m', 62.545479531046794), ('c', 59.88126541328635), ('y', 56.36582603320831), ('g', 47.237177400515826), ('T', 46.89403514374512), ('w', 45.733235424038156), ('A', 43.6955634292396), ('.', 43.11776026166888), ('b', 37.458477956567314), ('f', 36.856246277612826), ('k', 36.61314656983876), (':', 35.84519755138385), ('#', 34.99448656248796), ('!', 32.47774603272562), ('@', 31.912694490697298), ('0', 28.592668770276216), ('I', 28.40146048712822), ('v', 27.63533756769912), (';', 27.50952537816148), ('M', 26.220559314938992), ('C', 26.07269142779909), ('N', 23.952150397478995), ('S', 23.4233698304

In [98]:
test_random_forest(xTr_counts, yTr, xTe_counts, yTe, trees=1000, M=10)

Trial 1: 0.8654434250764526
Trial 2: 0.8685015290519877
Trial 3: 0.8715596330275229
Trial 4: 0.8654434250764526
Trial 5: 0.8715596330275229
Trial 6: 0.8685015290519877
Trial 7: 0.8685015290519877
Trial 8: 0.8685015290519877
Trial 9: 0.8654434250764526
Trial 10: 0.8685015290519877
Mean: 0.8681957186544341, Std.dev: 0.0021406727828746307


In [340]:
write_submission('link-only-submission.csv', cheater_classifier(test_text))