In [1]:
#from twitter_preprocess import *
import csv
import argparse
import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score

TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=0)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=0)


# get unigram counts for data
def get_unigrams():
    unigrams = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams

# 
def get_bigrams():
    bigrams = Counter()
    tweets = train_data[['tweet']]
    start = "<s>"
    end = "</s>"
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        word = start + " " + s[0]
        bigrams[word] += 1
        for i in range(len(s)-1):
            word = s[i] + " " + s[i+1]
            bigrams[word] += 1
        word = s[len(s) - 1] + " " + end
        bigrams[word] += 1
    #print(bigrams)
    return bigrams


#Gets the average word counts for data
#maybe split tweet on more than just whitespace ie ;:,.')(
def get_avg_wc():
    wcs = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot = 0.
        for word in s:
            if "http://" in word: continue #ignore hyperlinks
            tot += len(word)
        wcs[' '.join(s)] = tot / len(s)
    return wcs


#Gets the count of '@'s in the data
def get_at_counts():
    ats = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        ats[' '.join(s)] = count
            
    return ats


#Gets the count of swear words in tweets
#potentially use regex's to catch purposeful mispellings & other nuances
def get_swear_counts():
    tweets = train_data[['tweet']]
#     bad_word_list = open("en_bad_words.txt").read().replace("\n", "").split(',')
#     for i in range(len(bad_word_list)):
#         bad_word_list[i] = bad_word_list[i].strip(" ")
    bad_words_set = set(open("bad-words.txt").read().split())
    
    bad_words_count = {}
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
            bad_words_count[" ".join(s)] = tot_bad
            
    return bad_words_count
    
            

In [2]:
unigram_counts = get_unigrams()
bigram_counts = get_bigrams()
avg_wc = get_avg_wc()
at_counts = get_at_counts()
swear_counts = get_swear_counts()

In [3]:
#get top 100 unigrams and bigrams
top_unigrams = unigram_counts.most_common(100)
top_bigrams = bigram_counts.most_common(100)
data = pd.concat([train_data, dev_data])

In [4]:
def process_tweets(tweets):
    for word in [u[0] for u in top_unigrams]:
        tweets[word] = tweets['tweet'].str.contains(word).astype(int)
        
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet_words))
        tot_bad = 0
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet_words.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

In [5]:
tweets = data[['tweet']]
X = process_tweets(tweets)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
cv = KFold(n_splits=10, random_state=42, shuffle=False)
LR_scores = []
SVM_scores = []

for train_index, test_index in cv.split(X):
    LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
    svm = SVC(gamma='auto') 
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    LR.fit(X_train, y_train)
    y_LR_predict = LR.predict(X_test)
    LR_scores.append(f1_score(y_test, y_LR_predict, average='macro'))
    svm.fit(X_train, y_train)
    y_svm_predict = svm.predict(X_test)
    SVM_scores.append(f1_score(y_test, y_svm_predict, average='macro'))
    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
LR_scores

[0.5825480204977357,
 0.580861460183061,
 0.5769892279519048,
 0.6039752207974036,
 0.5817626339178062,
 0.6113663503736466,
 0.5802935292455237,
 0.6101370378759232,
 0.6041726614463215,
 0.601037700540643]

In [8]:
SVM_scores

[0.5627030106614033,
 0.5560457320017999,
 0.554447817125726,
 0.5731343592564496,
 0.5623827182294426,
 0.5679167471143542,
 0.569832741551306,
 0.5760983580822172,
 0.5775904211869101,
 0.5681794985221654]