In [1]:
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn

from sklearn.feature_extraction.text import CountVectorizer as CV, TfidfVectorizer as TV
from sklearn.svm import LinearSVC as SVM
from sklearn.model_selection import train_test_split as TrTeS
from sklearn.metrics import accuracy_score as AS
from nltk.stem.porter import PorterStemmer as PS
from nltk.stem import WordNetLemmatizer as WNL

import re

In [2]:
DATASET_PATH      = "Datasets/"
DATASET_PATH_MORE = "twitter-datasets/"

POS_DATASET  = "train_pos"
NEG_DATASET  = "train_neg"

FULL_DATASETS_EXT = "_full"
EXT = ".txt"

TEST_FILE_ORIGINAL = "test_data.txt"

SUBMISSION_FILE = "submission.csv"

In [3]:
def simple_clean(t):
    t = t.replace("<user>", "")
    t = t.replace("<url>", "")
    t = t.replace("\n", "")
    t = re.compile("[.;:!\'?,\"()\[\]]").sub("", t.lower())
    
    return t
    
def clean(tweets):
    return [simple_clean(t) for t in tweets]

def read_tweets_file(pos=True, full=True):
    path = DATASET_PATH+DATASET_PATH_MORE
    
    if pos:
        path += POS_DATASET
    else:
        path += NEG_DATASET
    
    if full:
        path += FULL_DATASETS_EXT
    
    path += EXT
    
    tweets = []
    with open(path, 'r') as file:
        for line in file:
            tweets.append(line)
    
    return tweets

def get_tweets(full=True):
    return read_tweets_file(True,full), read_tweets_file(False,full)

def get_submission_tweets():
    tweets = []
    with open(DATASET_PATH+TEST_FILE_ORIGINAL, 'r') as file:
        for line in file:
            tweets.append(",".join(line.split(",")[1:]))
    
    return tweets

# simple

In [4]:
pos, neg = get_tweets()
pos, neg = clean(pos), clean(neg)
tr       = pos+neg

In [5]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [6]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = SVM(C=c, max_iter=60)
    svm.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8193968
Accuracy for 0.05: 0.8227648
Accuracy for 0.25: 0.82264
Accuracy for 0.5: 0.8213968
Accuracy for 1: 0.8182656


# 2-grams

In [7]:
cv3 = CV(binary=True, ngram_range=(1, 2))
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [8]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = SVM(C=c, max_iter=60)
    svm.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8511136
Accuracy for 0.05: 0.8555104
Accuracy for 0.25: 0.849704
Accuracy for 0.5: 0.8448864
Accuracy for 1: 0.839328


# 3-grams

In [9]:
cv3 = CV(binary=True, ngram_range=(1, 3))
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [10]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = SVM(C=c, max_iter=60)
    svm.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8579552
Accuracy for 0.05: 0.8612224
Accuracy for 0.25: 0.8570288
Accuracy for 0.5: 0.854112
Accuracy for 1: 0.8507808


# 4-grams

In [11]:
cv3 = CV(binary=True, ngram_range=(1, 4))
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [12]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = SVM(C=c, max_iter=60)
    svm.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.858704
Accuracy for 0.05: 0.8614944
Accuracy for 0.25: 0.8579728
Accuracy for 0.5: 0.8556976
Accuracy for 1: 0.853192


# TF-IDF

In [13]:
tv3 = TV(binary=True)
tv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [14]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = SVM(C=c, max_iter=60)
    svm.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8581696
Accuracy for 0.05: 0.861096
Accuracy for 0.25: 0.8576544
Accuracy for 0.5: 0.8552032
Accuracy for 1: 0.8523808
