In [14]:
import csv
import argparse
import pandas as pd
import numpy as np
import re
import string
import os
import statistics
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
    
TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=1)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=1)

tweets = train_data[['tweet']]

In [None]:
bad_words_set = set(open("bad-words.txt").read().split())
words_set = set(open("all_words.txt").read().split())
words_set = set(item.lower() for item in words_set)

## Features

In [15]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

Unigrams for the data splitting on spaces

In [16]:
def get_unigrams():
    unigrams = Counter()
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams

Make count features binary by finding median values over entire dataset

In [45]:
#split into quartiles?
def find_quartile_values(counts):
    counts.sort()
    first = np.quantile(counts, .25)
    second = np.quantile(counts, .5)
    third = np.quantile(counts, .75)
    return [first, second, third] 

In [46]:
def make_at_bins():
    at_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        at_counts.append(count)
    return find_quartile_values(at_counts)

In [48]:
def make_num_token_bins():
    lens = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = len(s)
        lens.append(count)
    
    return find_quartile_values(lens)
#     lens.sort()
#     median = statistics.median(lens) #Get Median for binning
    
#     return [0, median]

In [49]:
def make_swear_bins():
    bad_words_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        bad_words_counts.append(tot_bad)
    
    return find_quartile_values(bad_words_counts)
#     bad_words_counts.sort()
#     median = statistics.median(bad_words_counts) #Get Median for binning
    
#     return [0, median]

In [50]:
def make_mention_bins():
    mentions = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        lens.append(count)
    
    return find_quartile_values(mentions)
#     mentions.sort()
#     median = statistics.median(mentions) #Get Median for binning
    
#     return [0, median]

In [51]:
def make_hashtag_bins():
    hashtag_counts = []
    at_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '#' in word else 0, s))
        hashtag_counts.append(count)
    
    return find_quartile_values(hashtag_counts)
#     hashtag_counts.sort()
#     median = statistics.median(hashtag_counts) #Get Median for binning
    
#     return [0, median]

In [54]:
def make_misspelling_bins():
    misspell_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_misspelled = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() not in words_set:
                tot_misspelled+=1
        misspell_counts.append(tot_misspelled)
    
    return find_quartile_values(misspell_counts)
#     misspell_counts.sort()
#     median = statistics.median(misspell_counts)
#     return [0, median]

Bins contain the splits for which bin a tweet's feature counts will land in

In [55]:
num_token_bins = make_num_token_bins()
swear_bins = make_swear_bins()
at_bins = make_at_bins()
hashtag_bins = make_hashtag_bins()
misspell_bins =  make_misspelling_bins()

FileNotFoundError: [Errno 2] No such file or directory: 'all_words.txt'

More binary features

In [1]:
def contains_more_uppercase(tweet):
    total_caps = sum(map(lambda ch : 1 if ch.isupper() else 0, tweet))
    if total_caps > len(s_str) // 2:
        return 1
    return 0

In [2]:
def has_consecutive_punc(tweet):
    for word in tweet.split():
        if 'http://' in word: continue
        for i in range(len(word)-1):
            if word[i] in string.punctuation and word[i+1] in string.punctuation:
                return 1
    return 0

In [4]:
third_person = ['he', 'she', 'they', 'him', 'her', 'them', 'his', 'hers', 'their', 'theirs', 'themselves', 'himself', 'herself']
second_person = ['you', 'your', 'yours']
first_person =['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']

In [6]:
# Find if first word in tweet is a third person pronoun. Also check if number of third person pronouns is greater
# than first person pronouns
def get_pronouns(tweet):
    first_pronoun = 0
    third_and_second_greater_than_first = 0
    words = tweet.split()
    if words[0] in third_person or words[0] in second_person: 
        first_pronoun = 1
    
    first_person_count = 0
    third_second_person_count = 0
    for word in words:
        if word in third_person or word in second_person:
            third_second_person_count += 1
        elif word in first_person:
            first_person_count += 1
            
    if first_person_count < third_second_person_count:
        third_and_second_greater_than_first = 1
       
    return first_pronoun, third_and_second_greater_than_first

Cite lexicon:
    Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
        Proceedings of the ACM SIGKDD International Conference on Knowledge 
        Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
        Washington, USA,

In [10]:
pos_words = set(open("opinion-lexicon-English/positive-words.txt").read().split())
neg_words = set(open("opinion-lexicon-English/negative-words.txt").read().split())

In [11]:
#returns 1, 1 if negative words outnumber positive words and there are no positive words
def get_sentiment(tweet):
    pos_count = 0
    neg_count = 0
    for word in tweet.split():
        if word in pos_words:
            pos_count += 1
        if word in neg_words:
            neg_count += 1
    if neg_count > pos_count:
        if pos_count > 0:
            return 1, 0
        else:
            return 1, 1
    else:
        if pos_count > 0:
            return 0, 0
        else:
            return 0, 1

In [13]:
#returns one if a word in the tweet has non alphanumeric characters (not including punctuation at the end of a word)
def contains_non_alphanum(tweet):
    words = tweet.split()
    for word in words:
        if not string.isalnum(word):
            if word[-1] not in string.punctuation:
                return 1
    return 0

Find most common unigrams

In [13]:
unigram_counts = get_unigrams()
top_unigrams = unigram_counts.most_common(100)

In [14]:
#each count feature has four bins: 1 for < 25th percentile, 2 for < 50th percentile, 
#3 for < 75th percentile, and 4 for < 100th percentile
def find_bin(count, bin_name):
    for i in range(len(bin_name)):
        if count < bin_name[i]:
            return i + 1
    return len(bin_name)

Process the tweets by getting their feature representations

In [12]:
def process_tweets(tweets):
#     for word in [u[0] for u in top_unigrams]:
#         tweets[word] = tweets['tweet'].str.contains(word).astype(int)
    word_counts = []
    swear_counts = []
    at_counts = []
    contains_at = []
    hashtag_counts = []
    contains_hashtag = []
    consecutive_punc = []
    more_upper = []
    first_pronoun = []
    fewer_first_person = []
    more_negative = []
    no_positive = []
    contains_url = []
    not_alphanum = []
    misspellings = []
    
    for tweet in tweets['tweet']:
        #count tokens
        tweet_words = tweet.split()
        num_token_bin = find_bin(len(tweet_words), num_token_bins)
        word_counts.append(num_token_bin)
        
        #count swear words
        tot_bad = 0
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_bin = find_bin(tot_bad, swear_bins)
        swear_counts.append(swear_bin)
        
        #count mentions
        at_count = tweet.count('@')
        if at_count > 0:
            contains_at.append(1)
        else:
            contains_at.append(0)
        at_bin = find_bin(at_count, at_bins)
        at_counts.append(at_bin)
        
        #count hashtags
        hash_count = tweet.count('#')
        if hash_count > 0:
            contains_hashtag.append(1)
        else:
            contains_hashtag.append(0)
        hash_bin = find_bin(hash_count, hashtag_bins)
        hashtag_counts.append(hash_bin)
        
        #count misspellings
        misspell_count = 0
        for word in tweet_words:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() not in words_set:
                misspell_count+=1
        misspell_bin = find_bin(misspell_count, misspell_bins)
        miesspellings.append(misspell_bin)
        
        more_upper.append(has_more_upper(tweet))
        consecutive_punc.append(has_consecutive_punc(tweet))
        first, more = get_pronouns(tweet)
        first_pronoun.append(first)
        fewer_first_person.append(more)
        
        more_neg, pos = get_sentiment(tweet)
        more_negative.append(more_neg)
        no_positive.append(pos)
        
        if 'http://' in tweet:
            contains_url.append(1)
        else:
            contains_url.append(0)
            
        not_alphanum.append(contains_non_alphanum(tweet))

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    tweets['Mention'] = contains_at
    tweets['Hashtag Counts'] = hashtag_counts
    tweets['Contains Hashtag'] = contains_hashtag
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

Combine train and dev sets for k-fold cross validation

In [39]:
data = pd.concat([train_data, dev_data], sort=False)
tweets = data[['tweet']]

In [47]:
X = process_tweets(tweets)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [48]:
X.shape

(22214, 6)

## Baseline Models

In [49]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
cv = KFold(n_splits=10, random_state=42, shuffle=False)

Run 10-fold cross validation on combined training and dev sets on LR, SVM, and NB models

In [50]:
LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
lr_y_pred = cross_val_predict(LR, X, y, cv=cv)

In [51]:
print("LR:", classification_report(y, lr_y_pred))

LR:               precision    recall  f1-score   support

           0       0.00      0.00      0.00      1282
           1       0.88      0.88      0.88     17186
           2       0.55      0.74      0.63      3746

   micro avg       0.81      0.81      0.81     22214
   macro avg       0.48      0.54      0.51     22214
weighted avg       0.78      0.81      0.79     22214



  'precision', 'predicted', average, warn_for)


In [52]:
svm = SVC(gamma='auto') 
svm_y_pred = cross_val_predict(svm, X, y, cv=cv)
print(classification_report(y, svm_y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1282
           1       0.88      0.88      0.88     17186
           2       0.55      0.74      0.63      3746

   micro avg       0.81      0.81      0.81     22214
   macro avg       0.48      0.54      0.51     22214
weighted avg       0.78      0.81      0.79     22214



  'precision', 'predicted', average, warn_for)


In [53]:
gnb = GaussianNB()
nb_y_pred = cross_val_predict(gnb, X, y, cv=cv)
print(classification_report(y, nb_y_pred))

              precision    recall  f1-score   support

           0       0.06      0.81      0.11      1282
           1       0.73      0.00      0.00     17186
           2       0.55      0.74      0.63      3746

   micro avg       0.17      0.17      0.17     22214
   macro avg       0.45      0.52      0.25     22214
weighted avg       0.66      0.17      0.12     22214

