In [93]:
#from twitter_preprocess import *
import csv
import argparse
import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=0)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=0)


# get unigram counts for data
def get_unigrams():
    unigrams = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams

# 
def get_bigrams():
    bigrams = Counter()
    tweets = train_data[['tweet']]
    start = "<s>"
    end = "</s>"
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        word = start + " " + s[0]
        bigrams[word] += 1
        for i in range(len(s)-1):
            word = s[i] + " " + s[i+1]
            bigrams[word] += 1
        word = s[len(s) - 1] + " " + end
        bigrams[word] += 1
    #print(bigrams)
    return bigrams


#Gets the average word counts for data
#maybe split tweet on more than just whitespace ie ;:,.')(
def get_avg_wc():
    wcs = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot = 0.
        for word in s:
            if "http://" in word: continue #ignore hyperlinks
            tot += len(word)
        wcs[' '.join(s)] = tot / len(s)
    return wcs


#Gets the count of '@'s in the data
def get_at_counts():
    ats = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        ats[' '.join(s)] = count
            
    return ats


#Gets the count of swear words in tweets
#potentially use regex's to catch purposeful mispellings & other nuances
def get_swear_counts():
    tweets = train_data[['tweet']]
#     bad_word_list = open("en_bad_words.txt").read().replace("\n", "").split(',')
#     for i in range(len(bad_word_list)):
#         bad_word_list[i] = bad_word_list[i].strip(" ")
    bad_words_set = set(open("bad-words.txt").read().split())
    
    bad_words_count = {}
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
            bad_words_count[" ".join(s)] = tot_bad
            
    return bad_words_count
    
            

In [19]:
unigram_counts = get_unigrams()
bigram_counts = get_bigrams()
avg_wc = get_avg_wc()
at_counts = get_at_counts()
swear_counts = get_swear_counts()

In [111]:
#get top 100 unigrams and bigrams
top_unigrams = unigram_counts.most_common(100)
top_bigrams = bigram_counts.most_common(100)
data = pd.concat([train_data, dev_data])

In [74]:
def process_tweets(tweets):
    for word in [u[0] for u in top_unigrams]:
        tweets[word] = tweets['tweet'].str.contains(word).astype(int)
        
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet_words))
        tot_bad = 0
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet_words.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

In [None]:
tweets = data[['tweet']]
X = process_tweets(tweets)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [120]:
cv = KFold(n_splits=10, random_state=42, shuffle=False)
scores = []

for train_index, test_index in cv.split(X):
    model = LogisticRegression(
        fit_intercept=True, solver='lbfgs')
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    print(X_train, y_train)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))




[[ 1  0  1 ...  7  1  0]
 [ 1  0  0 ...  9  0  0]
 [ 1  0  1 ...  7  4  0]
 ...
 [ 1  0  0 ... 11  0  0]
 [ 1  0  1 ... 20  2  0]
 [ 0  0  1 ...  8  1  0]] 2222     2.0
2223     2.0
2224     1.0
2225     1.0
2226     2.0
2227     2.0
2228     1.0
2229     0.0
2230     1.0
2231     0.0
2232     NaN
2233     1.0
2234     1.0
2235     1.0
2236     1.0
2237     1.0
2238     1.0
2239     2.0
2240     1.0
2241     NaN
2242     1.0
2243     1.0
2244     1.0
2245     1.0
2246     1.0
2247     1.0
2248     1.0
2249     2.0
2250     1.0
2251     1.0
        ... 
22184    1.0
22185    1.0
22186    1.0
22187    1.0
22188    2.0
22189    2.0
22190    1.0
22191    1.0
22192    1.0
22193    1.0
22194    2.0
22195    1.0
22196    2.0
22197    NaN
22198    1.0
22199    1.0
22200    1.0
22201    NaN
22202    1.0
22203    1.0
22204    1.0
22205    2.0
22206    1.0
22207    1.0
22208    2.0
22209    1.0
22210    NaN
22211    2.0
22212    2.0
22213    1.0
Name: class, Length: 19992, dtype: float64


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
from sklearn.model_selection import KFold

0.8816855753646677