In [4]:
import csv
import argparse
import pandas as pd
import numpy as np
import re
import string
import os
import statistics
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
    
TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=1)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=1)

tweets = train_data[['tweet']]

## Features

In [14]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

Unigrams for the data splitting on spaces

In [46]:
def get_unigrams():
    unigrams = Counter()
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams

Make count features binary by finding median values over entire dataset

In [60]:
#split into quartiles?
def find_quartile_values(counts):
    counts.sort()
    median = statistics.median(counts)
    counts.find(median)
    upper_half = counts[median:]
    lower_half = counts[:median]
    lower_median = statistics.median(lower_half)
    upper_median = statistics.median(upper_half)
    return [0, lower_median, median, upper_median]
    

In [5]:
def make_at_bins():
    at_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        at_counts.append(count)
        
    at_counts.sort()
    median = statistics.median(at_counts) #Get Median for binning
    
    #return list of 0 and median to split values into above or below the median
    return [0, median]

In [6]:
def make_num_token_bins():
    lens = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = len(s)
        lens.append(count)
        
    lens.sort()
    median = statistics.median(lens) #Get Median for binning
    
    return [0, median]

In [7]:
def make_swear_bins():
    bad_words_set = set(open("bad-words.txt").read().split())
    bad_words_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        bad_words_counts.append(tot_bad)
        
    bad_words_counts.sort()
    median = statistics.median(bad_words_counts) #Get Median for binning
    
    return [0, median]

In [9]:
def make_mention_bins():
    mentions = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        lens.append(count)
        
    mentions.sort()
    median = statistics.median(mentions) #Get Median for binning
    
    return [0, median]

In [10]:
def make_hashtag_bins():
    hashtag_counts = []
    at_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '#' in word else 0, s))
        hashtag_counts.append(count)
    
    hashtag_counts.sort()
    median = statistics.median(hashtag_counts) #Get Median for binning
    
    return [0, median]

Bins contain the splits for which bin a tweet's feature counts will land in

In [11]:
num_token_bins = make_num_token_bins()
swear_bins = make_swear_bins()
at_bins = make_at_bins()
hashtag_bins = make_hashtag_bins()

In [47]:
unigram_counts = get_unigrams()

Find most common unigrams

In [48]:
top_unigrams = unigram_counts.most_common(100)

In [18]:
def find_bin(count, bin_name):
    for i in range(len(bin_name)):
        if count < bin_name[i]:
            return i
    return len(bin_name)

Process the tweets by getting their feature representations

In [49]:
def process_tweets(tweets):
    for word in [u[0] for u in top_unigrams]:
        tweets[word] = tweets['tweet'].str.contains(word).astype(int)

    word_counts = []
    swear_counts = []
    at_counts = []
    contains_at = []
    hashtag_counts = []
    contains_hashtag = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        num_token_bin = find_bin(len(tweet_words), num_token_bins)
        word_counts.append(num_token_bin)
        tot_bad = 0

        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_bin = find_bin(tot_bad, swear_bins)
        swear_counts.append(swear_bin)
        at_count = tweet_words.count('@')
        
        if at_count > 0:
            contains_at.append(1)
        else:
            contains_at.append(0)
        at_bin = find_bin(at_count, at_bins)
        at_counts.append(at_bin)
        
        hash_count = tweet_words.count('#')
        if hash_count > 0:
            contains_hashtag.append(1)
        else:
            contains_hashtag.append(0)
        hash_bin = find_bin(hash_count, hashtag_bins)
        hashtag_counts.append(hash_bin)

    #tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    tweets['Mention'] = contains_at
    tweets['Hashtag Counts'] = hashtag_counts
    tweets['Contains Hashtag'] = contains_hashtag
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

Combine train and dev sets for k-fold cross validation

In [51]:
data = pd.concat([train_data, dev_data], sort=False)
tweets = data[['tweet']]

In [52]:
X = process_tweets(tweets)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Baseline Models

In [63]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
cv = KFold(n_splits=10, random_state=42, shuffle=False)

Run 10-fold cross validation on combined training and dev sets on LR, SVM, and NB models

In [65]:
LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
lr_y_pred = cross_val_predict(LR, X, y, cv=cv)

In [66]:
print("LR:", classification_report(y, lr_y_pred))

LR:              precision    recall  f1-score   support

          0       0.37      0.03      0.05      1282
          1       0.91      0.95      0.93     17186
          2       0.73      0.82      0.77      3746

avg / total       0.85      0.87      0.85     22214



In [58]:
svm = SVC(gamma='auto') 
svm_y_pred = cross_val_predict(svm, X, y, cv=cv)
print(classification_report(y, svm_y_pred))

             precision    recall  f1-score   support

          0       0.32      0.01      0.02      1282
          1       0.89      0.96      0.92     17186
          2       0.75      0.73      0.74      3746

avg / total       0.83      0.86      0.84     22214



In [59]:
gnb = GaussianNB()
nb_y_pred = cross_val_predict(gnb, X, y, cv=cv)
print(classification_report(y, nb_y_pred))

             precision    recall  f1-score   support

          0       0.05      0.28      0.09      1282
          1       0.97      0.46      0.63     17186
          2       0.49      0.97      0.65      3746

avg / total       0.84      0.54      0.60     22214

