In [250]:
#from twitter_preprocess import *
import csv
import argparse
import pandas as pd
import numpy as np
import emoji
import re
import string
import os
import statistics
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
    
TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=1)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=1)
            

In [187]:
def make_bin(curr_dictionary, median):
    swear_bins = [{},{},{}] #bins for < mean, == mean, > mean
    for tweet, count in curr_dictionary.items():
        if count == median:
            swear_bins[1][tweet] = count
        elif count < median:
            swear_bins[0][tweet] = count
        else:
            swear_bins[2][tweet] = count
    return swear_bins

In [188]:
# get unigram counts for data
def get_unigrams_splitBySpace():
    unigrams = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams



In [189]:
sp = get_unigrams_splitBySpace()


In [190]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()



In [191]:
# get unigram counts for data
def get_unigrams_nltkTokenizer():
    uni = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = tknzr.tokenize(row['tweet'])
        for word in s:
            uni[word] += 1  
    return uni

In [192]:
#generate n-gram, note here n = 5
import re
from nltk.util import ngrams

def get_ngrams(n):
    n_grams = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = tknzr.tokenize(row['tweet'])
        tokens = [token for token in s if token != ""]
        output = list(ngrams(tokens, n))
        n_grams = Counter(output)
    return n_grams


In [193]:
#Cannot bin
def get_bigrams():
    bigrams = Counter()
    tweets = train_data[['tweet']]
    start = "<s>"
    end = "</s>"
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        word = start + " " + s[0]
        bigrams[word] += 1
        for i in range(len(s)-1):
            word = s[i] + " " + s[i+1]
            bigrams[word] += 1
        word = s[len(s) - 1] + " " + end
        bigrams[word] += 1
    #print(bigrams)
    return bigrams

In [194]:
#Cannot Bin.
#Gets the average word counts for data
#maybe split tweet on more than just whitespace ie ;:,.')(
# def get_avg_wc():
#     wcs = {}
#     tweets = train_data[['tweet']]
#     for row_index,row in tweets.iterrows():
#         s = row['tweet'].split()
#         tot = 0.
#         for word in s:
#             if "http://" in word: continue #ignore hyperlinks
#             tot += len(word)
#         wcs[' '.join(s)] = tot / len(s)
#     return wcs


In [216]:
#Gets the count of '@'s in the data
def get_at_counts():
    ats = {}
    tweets = train_data[['tweet']]
    at_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        at_sum += count
        ats[' '.join(s)] = count
        
    val_list = list(ats.values())
    val_list.sort()
    median = statistics.median(val_list) #Get Median for binning
    
    return make_bin(ats, median)

In [184]:
#Gets the count of swear words in tweets
#potentially use regex's to catch purposeful mispellings & other nuances
def get_swear_counts():
    tweets = train_data[['tweet']]
    bad_words_set = set(open("bad-words.txt").read().split())
    bad_words_count = {}
    swear_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_sum+= tot_bad
        bad_words_count[" ".join(s)] = tot_bad
        
    val_list = list(bad_words_count.values())
    val_list.sort()
    median = statistics.median(val_list) #Get Median for binning
    
    return make_bin(bad_words_count, median)

In [181]:
def contains_mention():
    mentions = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        res = False
        for word in s:
            if '@' in word:
                mentions[' '.join(s)] = 1
                res = True
        if not res:
            mentions[' '.join(s)] = 0
    return mentions

In [210]:
def get_misspelling_counts():
    tweets = train_data[['tweet']]
    words_set = set(open("all_words.txt").read().split())
    words_set = set(item.lower() for item in words_set)  #Convert words to all lowercase
    misspell_count = {}
    misspell_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_misspelled = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() not in words_set:
                tot_misspelled+=1
        misspell_sum += tot_misspelled
        misspell_count[" ".join(s)] = tot_misspelled
    
    val_list = list(misspell_count.values())
    val_list.sort()
    median = statistics.median(val_list) #Get Median for binning
    
    return make_bin(misspell_count, median)

In [214]:
def get_hashtag_counts():
    hashtags_count = {}
    tweets = train_data[['tweet']]
    at_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '#' in word else 0, s))
        at_sum += count
        hashtags_count[' '.join(s)] = count
    val_list = list(hashtags_count.values())
    val_list.sort()
    median = statistics.median(val_list) #Get Median for binning
    
    return make_bin(hashtags_count, median)

In [234]:
def contains_hashtag():
    hashtags = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        s_str = ' '.join(s)
        if '#' in s_str:
            hashtags[s_str] = 1
            continue
        hashtags[s_str] = 0
    return hashtags

In [235]:
#Most samples evaluate to 0 
def contains_more_uppercase():
    maj_uppercase = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        s_str = ' '.join(s)
        total_caps = sum(map(lambda ch : 1 if ch.isupper() else 0, s_str))
        if total_caps > len(s_str) // 2:
            maj_uppercase[s_str] = 1
            continue
        maj_uppercase[s_str] = 0
    return maj_uppercase

In [246]:
def has_consecutive_punc():
    punc = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        s_str = ' '.join(s)
        res = False
        for word in s:
            if 'http://' in word: continue
            for i in range(len(word)-1):
                if word[i] in string.punctuation and word[i+1] in string.punctuation:
                    punc[s_str] = 1
                    res = True
        if not res:
            punc[s_str] = 0
    return punc

In [254]:
#NOT WORKING YET
# def contains_emoji():
#     has_emoji = {}
#     tweets = train_data[['tweet']]
#     for row_index,row in tweets.iterrows():
#         s = row['tweet'].split()
#         s_str = ' '.join(s)
#         res = False
#         print (emoji.UNICODE_EMOJI)
#         for emoji in emoji.UNICODE_EMOJI:
#             if emoji in s_str:
#                 has_emoji[s_str] = 1
#                 res = True
#                 break
#         if not res:
#             has_emoji[s_str] = 0
#     return has_emoji

In [255]:
#unigram_counts = get_unigrams_splitBySpace()
#unis = get_ngrams(1)
#bigrams = get_ngrams(2)
#trigrams = get_ngrams(3)
#uni_tokenizer_counts = get_unigrams_nltkTokenizer()
#bigram_counts = get_bigrams()
#avg_wc = get_avg_wc()
#get_at_counts()
#get_swear_counts()
#contains_mention()
#get_misspelling_counts()
#get_hashtag_counts()
#contains_more_uppercase()
#contains_hashtag()
#has_consecutive_punc()
#contains_emoji()    #NOT WORKING YET


UnboundLocalError: local variable 'emoji' referenced before assignment

In [186]:
#get top 100 unigrams and bigrams
top_unigrams = unis.most_common(100)
top_bigrams = bigrams.most_common(100)
top_trigrams = trigrams.most_common(100)
#top_unigrams = unigram_counts.most_common(100)
#top_tokenized = uni_tokenizer_counts.most_common(100)
#top_bigrams = bigram_counts.most_common(100)
data = pd.concat([train_data, dev_data])

NameError: name 'unis' is not defined

In [29]:
def process_ngram_tweets(tweets, model):
    for word in [u[0] for u in model]:
        if len(word) > 1:
            tweets[word] = tweets['tweet'].str.contains(word).astype(int)
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet))
        tot_bad = 0
        for word in tweet:            
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    return tweets[[col for col in tweets.columns if col!="tweet"]].values

In [31]:
tweets = data[['tweet']]
X_uni = process_ngram_tweets(tweets, "top_unigrams")
X_bi = process_ngram_tweets(tweets, "top_bigrams")
X_tri = process_ngram_tweets(tweets, "top_trigrams")
y = data['class'].values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [44]:

def eval_model(model):
    sample_size = len(model)
    if sample_size > 10:
        cv = KFold(n_splits=10, random_state=42, shuffle=False)
    else: 
        cv = KFold(n_splits=3, random_state=42, shuffle=False)
    LR_scores = []
    SVM_scores = []

    for train_index, test_index in cv.split(model):
        LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
        svm = SVC(gamma='auto') 
        X_train, X_test, y_train, y_test = model[train_index], model[test_index], y[train_index], y[test_index]
        LR.fit(X_train, y_train)
        LR_scores.append(LR.score(X_test, y_test))
        svm.fit(X_train, y_train)
        SVM_scores.append(svm.score(X_test, y_test))
    return LR_scores, SVM_scores


In [45]:
LR_uni, SVM_uni = eval_model(X_uni)
LR_bi, SVM_bi =eval_model(X_bi)
LR_tri, SVM_tri =eval_model(X_tri)

In [46]:
LR_uni 

[0.7628262826282628,
 0.7677767776777678,
 0.7781278127812782,
 0.7812781278127813,
 0.7712742008104457,
 0.7573165240882486,
 0.7730751913552454,
 0.7807294011706438,
 0.7726249437190454,
 0.7879333633498424]

In [47]:
SVM_uni


[0.7637263726372637,
 0.77002700270027,
 0.7772277227722773,
 0.7817281728172817,
 0.7726249437190454,
 0.7573165240882486,
 0.7730751913552454,
 0.7816298964430437,
 0.7739756866276452,
 0.7888338586222422]

In [48]:
LR_bi, SVM_bi

([0.7628262826282628,
  0.7677767776777678,
  0.7781278127812782,
  0.7812781278127813,
  0.7712742008104457,
  0.7573165240882486,
  0.7730751913552454,
  0.7807294011706438,
  0.7726249437190454,
  0.7879333633498424],
 [0.7637263726372637,
  0.77002700270027,
  0.7772277227722773,
  0.7817281728172817,
  0.7726249437190454,
  0.7573165240882486,
  0.7730751913552454,
  0.7816298964430437,
  0.7739756866276452,
  0.7888338586222422])

In [49]:
LR_tri, SVM_tritop_unigrams = unigram_counts.most_common(100)
top_tokenized = uni_tokenizer_counts.most_common(100)
data = pd.concat([train_data, dev_data])
top_bigrams = bigram_counts.most_common(100)
data['class']

([0.7628262826282628,
  0.7677767776777678,
  0.7781278127812782,
  0.7812781278127813,
  0.7712742008104457,
  0.7573165240882486,
  0.7730751913552454,
  0.7807294011706438,
  0.7726249437190454,
  0.7879333633498424],
 [0.7637263726372637,
  0.77002700270027,
  0.7772277227722773,
  0.7817281728172817,
  0.7726249437190454,
  0.7573165240882486,
  0.7730751913552454,
  0.7816298964430437,
  0.7739756866276452,
  0.7888338586222422])

In [267]:
def process_tweets(tweets):
    for word in [u[0] for u in top_unigrams]:
        tweets[word] = tweets['tweet'].str.contains(word).astype(int)
    #print(tweets)
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet_words))
        tot_bad = 0
        #print(tweet_words)
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet_words.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

In [291]:
def process_tokenized_tweets(tweets):
    for word in [u[0] for u in top_tokenized]:
        if len(word) > 1:
            tweets[word] = tweets['tweet'].str.contains(word).astype(int)
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet))
        tot_bad = 0
        for word in tweet:                #Use regexs? 
            #word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    X_tokenized = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X_tokenized

In [285]:
tweets = data[['tweet']]

In [292]:
X = process_tweets(tweets)
X_tokenized = process_tokenized_tweets(tweets)
y = data['class'].values

In [293]:
X_tokenized

array([[1, 0, 1, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 1, 0],
       [1, 1, 0, ..., 1, 1, 0],
       ...,
       [1, 0, 0, ..., 1, 1, 0],
       [1, 0, 1, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 1]])

In [18]:
cv = KFold(n_splits=10, random_state=42, shuffle=False)
LR_scores = []
SVM_scores = []
NB_scores = []

for train_index, test_index in cv.split(X):
    LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
    svm = SVC(gamma='auto') 
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    
    LR.fit(X_train, y_train)
    y_LR_predict = LR.predict(X_test)
    LR_scores.append(precision_recall_fscore_support(y_test, y_LR_predict, average='macro', labels=[0, 1, 2]))
                     
    svm.fit(X_train, y_train)
    y_svm_predict = svm.predict(X_test)
    SVM_scores.append(precision_recall_fscore_support(y_test, y_svm_predict, average='macro', labels=[0, 1, 2]))
    
    gnb = GaussianNB()
    y_NB_predict = gnb.fit(X_train, y_train).predict(X_test)
    NB_scores.append(precision_recall_fscore_support(y_test, y_NB_predict, average='macro', labels=[0, 1, 2]))
#naive bayes

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [20]:
def get_avg_precision(results):
    total = 0
    num_rows = 0
    for row in results:
        total += row[0]
        num_rows += 1
    return total / num_rows

In [294]:
cv = KFold(n_splits=10, random_state=42, shuffle=False)
LR_scores_tok = []
SVM_scores_tok = []

for train_index, test_index in cv.split(X_tokenized):
    LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
    svm = SVC(gamma='auto') 
    X_train, X_test, y_train, y_test = X_tokenized[train_index], X_tokenized[test_index], y[train_index], y[test_index]
    LR.fit(X_train, y_train)
    LR_scores_tok.append(LR.score(X_test, y_test))
    svm.fit(X_train, y_train)
    SVM_scores_tok.append(svm.score(X_test, y_test))
    

In [295]:
LR_scores_tok

[0.8361836183618362,
 0.8352835283528353,
 0.851935193519352,
 0.873987398739874,
 0.8536695182350292,
 0.8631247185952273,
 0.8572714993246285,
 0.8712291760468257,
 0.8635749662314273,
 0.8703286807744259]

In [273]:
LR_scores

[0.8618361836183618,
 0.8550855085508551,
 0.8672367236723673,
 0.8861386138613861,
 0.8716794236830256,
 0.875281404772625,
 0.8779828905898244,
 0.8928410625844214,
 0.8829356145880234,
 0.8793336334984241]

In [296]:
SVM_scores_tok 

[0.8186318631863186,
 0.8181818181818182,
 0.833033303330333,
 0.8424842484248425,
 0.819900945520036,
 0.8212516884286357,
 0.829806393516434,
 0.845114813147231,
 0.820351193156236,
 0.8460153084196308]

In [274]:
SVM_scoresLR_precision = get_avg_precision(LR_scores)
SVM_precision = get_avg_precision(SVM_scores)
NB_precision = get_avg_precision(NB_scores)
LR_precision, SVM_precision, NB_precision

[0.8604860486048604,
 0.8577857785778578,
 0.864986498649865,
 0.8771377137713772,
 0.8680774425934263,
 0.8671769473210266,
 0.8766321476812247,
 0.884736605132823,
 0.8757316524088249,
 0.8833858622242233]