In [2]:
#from twitter_preprocess import *
import csv
import argparse
import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC

TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=0)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=0)


# get unigram counts for data
def get_unigrams_splitBySpace():
    unigrams = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams


            

In [11]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
# get unigram counts for data
def get_unigrams_nltkTokenizer():
    uni = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = tknzr.tokenize(row['tweet'])
        for word in s:
            uni[word] += 1    
    return uni


In [4]:
# 
def get_bigrams():
    bigrams = Counter()
    tweets = train_data[['tweet']]
    start = "<s>"
    end = "</s>"
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        word = start + " " + s[0]
        bigrams[word] += 1
        for i in range(len(s)-1):
            word = s[i] + " " + s[i+1]
            bigrams[word] += 1
        word = s[len(s) - 1] + " " + end
        bigrams[word] += 1
    #print(bigrams)
    return bigrams

In [5]:

#Gets the average word counts for data
#maybe split tweet on more than just whitespace ie ;:,.')(
def get_avg_wc():
    wcs = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot = 0.
        for word in s:
            if "http://" in word: continue #ignore hyperlinks
            tot += len(word)
        wcs[' '.join(s)] = tot / len(s)
    return wcs


In [6]:
#Gets the count of '@'s in the data
def get_at_counts():
    ats = {}
    tweets = train_data[['tweet']]
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        ats[' '.join(s)] = count
            
    return ats

In [7]:
#Gets the count of swear words in tweets
#potentially use regex's to catch purposeful mispellings & other nuances
def get_swear_counts():
    tweets = train_data[['tweet']]
#     bad_word_list = open("en_bad_words.txt").read().replace("\n", "").split(',')
#     for i in range(len(bad_word_list)):
#         bad_word_list[i] = bad_word_list[i].strip(" ")
    bad_words_set = set(open("bad-words.txt").read().split())
    
    bad_words_count = {}
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
            bad_words_count[" ".join(s)] = tot_bad
            
    return bad_words_count
    

In [12]:
unigram_counts = get_unigrams_splitBySpace()
uni_tokenizer_counts = get_unigrams_nltkTokenizer()
bigram_counts = get_bigrams()
avg_wc = get_avg_wc()
at_counts = get_at_counts()
swear_counts = get_swear_counts()

In [13]:
#get top 100 unigrams and bigrams
top_unigrams = unigram_counts.most_common(100)
top_tokenized = uni_tokenizer_counts.most_common(100)
top_bigrams = bigram_counts.most_common(100)
data = pd.concat([train_data, dev_data])

In [14]:
def process_tweets(tweets):
    for word in [u[0] for u in top_unigrams]:
        tweets[word] = tweets['tweet'].str.contains(word).astype(int)
        
    word_counts = []
    swear_counts = []
    at_counts = []
    bad_words_set = set(open("bad-words.txt").read().split())

    for tweet in tweets['tweet']:
        tweet_words = tweet.split()
        word_counts.append(len(tweet_words))
        tot_bad = 0
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        swear_counts.append(tot_bad)
        at_count = tweet_words.count('@')
        at_counts.append(at_count)

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

In [15]:
tweets = data[['tweet']]
X = process_tweets(tweets)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
cv = KFold(n_splits=10, random_state=42, shuffle=False)
LR_scores = []
SVM_scores = []

for train_index, test_index in cv.split(X):
    LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
    svm = SVC(gamma='auto') 
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    LR.fit(X_train, y_train)
    LR_scores.append(LR.score(X_test, y_test))
    svm.fit(X_train, y_train)
    SVM_scores.append(svm.score(X_test, y_test))
    

In [13]:
LR_scores

[0.8618361836183618,
 0.8550855085508551,
 0.8667866786678667,
 0.8861386138613861,
 0.8716794236830256,
 0.875281404772625,
 0.8779828905898244,
 0.8923908149482215,
 0.8829356145880234,
 0.8793336334984241]

In [1]:
SVM_scores

NameError: name 'SVM_scores' is not defined