In [1]:
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Rebind the module attribute so that later ``warnings.warn(...)`` lookups hit
# the no-op above and nothing is emitted into the cell outputs.
warnings.warn = warn

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.svm import LinearSVC as SVM
from sklearn.model_selection import train_test_split as TrTeS
from sklearn.metrics import accuracy_score as AS
from nltk.stem.porter import PorterStemmer as PS
from nltk.stem import WordNetLemmatizer as WNL

import re

In [2]:
# Directory prefixes; the training-file path is DATASET_PATH + DATASET_PATH_MORE + <stem>...
DATASET_PATH = "Datasets/"
DATASET_PATH_MORE = "twitter-datasets/"

# File stems selected by read_tweets_file for the positive / negative class.
POS_DATASET = "train_pos"
NEG_DATASET = "train_neg"

# Optional suffix appended to a stem when full=True, and the common file extension.
FULL_DATASETS_EXT = "_full"
EXT = ".txt"

# Test file, opened directly under DATASET_PATH by get_submission_tweets.
TEST_FILE_ORIGINAL = "test_data.txt"

# Not referenced by the cells shown here; presumably the output file name -- TODO confirm.
SUBMISSION_FILE = "submission.csv"

In [3]:
# Characters stripped from every tweet: . ; : ! ' ? , " ( ) [ ]
# Written as a raw string so that \[ and \] reach the regex engine verbatim
# instead of being invalid string-literal escapes, and compiled once instead
# of on every call.
_PUNCTUATION_RE = re.compile(r"""[.;:!'?,"()\[\]]""")


def simple_clean(t):
    """Normalise a single tweet.

    Removes the literal ``<user>`` and ``<url>`` placeholders and newline
    characters, lower-cases the text, then strips the punctuation matched by
    ``_PUNCTUATION_RE``. Whitespace around removed tokens is left untouched.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased tweet.
    """
    # Placeholders are removed before lower-casing, so only the exact
    # lower-case spellings are matched (same as the original order of steps).
    for token in ("<user>", "<url>", "\n"):
        t = t.replace(token, "")

    return _PUNCTUATION_RE.sub("", t.lower())
    
def clean(tweets):
    """Run ``simple_clean`` over every tweet and return the results as a list, in input order."""
    return list(map(simple_clean, tweets))

def remove_duplicates(tweets):
    """Clean ``tweets`` and drop repeated entries.

    Tweets are compared *after* cleaning, so two raw tweets that differ only
    in stripped punctuation or placeholders collapse into one.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweets.

    Returns
    -------
    list of str
        Unique cleaned tweets, in order of first appearance.
    """
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    # The previous list(set(...)) produced an ordering that changes between
    # interpreter runs (string hashes are randomised), which made every
    # downstream split and score irreproducible.
    return list(dict.fromkeys(clean(tweets)))

def normalize_stemming(tweets):
    """Clean and de-duplicate ``tweets``, then stem each whitespace-separated word.

    Returns a list with one space-joined string of stemmed words per unique
    cleaned tweet, in the order produced by ``remove_duplicates``.
    """
    stemmer = PS()
    stemmed_tweets = []
    for tweet in remove_duplicates(tweets):
        stemmed_tweets.append(" ".join(map(stemmer.stem, tweet.split())))
    return stemmed_tweets

def normalize_lemmatization(tweets):
    """Clean and de-duplicate ``tweets``, then lemmatize each whitespace-separated word.

    Returns a list with one space-joined string of lemmatized words per unique
    cleaned tweet, in the order produced by ``remove_duplicates``.
    """
    lemmatizer = WNL()
    lemmatized_tweets = []
    for tweet in remove_duplicates(tweets):
        lemmatized_tweets.append(" ".join(map(lemmatizer.lemmatize, tweet.split())))
    return lemmatized_tweets

def get_stop_words(pos, neg, n=10):
    """Return the ``n`` most frequent words across both tweet collections.

    Words are the whitespace-separated tokens of each tweet; every occurrence
    counts (a word repeated inside one tweet is counted each time).

    Parameters
    ----------
    pos, neg : iterable of str
        Tweets of the two classes. They are only iterated, never concatenated.
    n : int, optional
        How many words to return. Defaults to 10, the previously hard-coded
        value.

    Returns
    -------
    list of str
        Up to ``n`` words ordered by descending frequency; words with equal
        counts keep the order in which they were first seen.
    """
    # Local imports keep this cell runnable without touching the import cell.
    from collections import Counter
    from itertools import chain

    counts = Counter(w for t in chain(pos, neg) for w in t.split())
    return [w for w, _ in counts.most_common(n)]

def read_tweets_file(pos=True, full=True):
    """Read one training tweet file and return its lines.

    The path is assembled as ``DATASET_PATH + DATASET_PATH_MORE + stem +
    suffix + EXT`` where ``stem`` is ``POS_DATASET`` or ``NEG_DATASET`` and
    ``suffix`` is ``FULL_DATASETS_EXT`` or empty.

    Parameters
    ----------
    pos : bool, optional
        Select the positive file when true, the negative file otherwise.
    full : bool, optional
        Append ``FULL_DATASETS_EXT`` to the stem when true.

    Returns
    -------
    list of str
        One entry per line of the file, line terminators included.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    stem = POS_DATASET if pos else NEG_DATASET
    suffix = FULL_DATASETS_EXT if full else ""
    path = DATASET_PATH + DATASET_PATH_MORE + stem + suffix + EXT

    # Decode explicitly instead of relying on the platform's locale encoding,
    # so the same file yields the same text on every machine.
    # NOTE(review): assumes the dataset is UTF-8 encoded -- confirm.
    with open(path, 'r', encoding='utf-8') as file:
        return list(file)

def get_tweets(full=True):
    """Load both training files and return them as a ``(positive, negative)`` tuple.

    ``full`` is forwarded unchanged to ``read_tweets_file`` for both files.
    """
    positive = read_tweets_file(True, full)
    negative = read_tweets_file(False, full)
    return positive, negative

def get_submission_tweets():
    """Read the test file and return each line with its first comma-separated field removed.

    The file is ``DATASET_PATH + TEST_FILE_ORIGINAL``. The leading field is
    presumably a row id -- verify against the data file.

    Returns
    -------
    list of str
        For every line, the text after the first comma (line terminator
        included); an empty string for a line that contains no comma.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the test file is UTF-8 encoded -- confirm.
    with open(DATASET_PATH + TEST_FILE_ORIGINAL, 'r', encoding='utf-8') as file:
        # partition splits on the first comma only, so commas inside the
        # tweet itself are preserved.
        return [line.partition(",")[2] for line in file]

# dirty data

In [4]:
# Load the full training sets untouched and stack them, positives first.
pos, neg = get_tweets()
tr = [*pos, *neg]

In [5]:
# Binary bag-of-words: a feature is 1 when the token occurs in the tweet.
cv3 = CV(binary=True)
# fit_transform learns the vocabulary and encodes `tr` in one pass instead of
# tokenising the whole corpus twice (fit, then transform).
tra = cv3.fit_transform(tr)

# `tr` is pos followed by neg: +1 for the first len(pos) rows, -1 for the rest.
labels = [1] * len(pos) + [-1] * len(neg)

# 75% train / 25% held out. The fixed seed makes the split, and therefore the
# accuracies printed below, repeatable across runs.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [6]:
# Sweep the regularisation parameter C and print held-out accuracy for each value.
# NOTE(review): max_iter=30 is a very small iteration cap, and any convergence
# warning is hidden by the warnings.warn override in the first cell -- confirm
# the cap is intentional.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # Seeded so the solver's internal shuffling does not change results between runs.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8187904
Accuracy for 0.05: 0.8218192
Accuracy for 0.25: 0.8216064
Accuracy for 0.5: 0.8196304
Accuracy for 1: 0.8129216


# clean data

In [7]:
# Reload the full training sets and pass each class through `clean` before stacking.
pos, neg = map(clean, get_tweets())
tr = [*pos, *neg]

In [8]:
# Binary bag-of-words over the cleaned tweets.
cv3 = CV(binary=True)
# Single pass: learn the vocabulary and encode `tr` together rather than
# running fit and transform separately over the same corpus.
tra = cv3.fit_transform(tr)

# Rows follow the pos-then-neg order of `tr`: +1 labels first, then -1.
labels = [1] * len(pos) + [-1] * len(neg)

# 75/25 split with a fixed seed so the evaluation set is the same on every run.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [9]:
# Same C sweep as for the raw data, now on the cleaned tweets.
# NOTE(review): max_iter=30 is a very small iteration cap and convergence
# warnings are silenced by the warnings.warn override -- confirm it is intended.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # Fixed seed keeps the solver's shuffling, and so the score, repeatable.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.820032
Accuracy for 0.05: 0.8230944
Accuracy for 0.25: 0.8227296
Accuracy for 0.5: 0.8206592
Accuracy for 1: 0.8117648


# no duplicates

In [10]:
# Reload the full training sets; each class is cleaned and de-duplicated on its own.
pos, neg = map(remove_duplicates, get_tweets())
tr = [*pos, *neg]

In [11]:
# Binary bag-of-words over the de-duplicated tweets.
cv3 = CV(binary=True)
# One pass over the corpus instead of fit followed by transform.
tra = cv3.fit_transform(tr)

# `tr` is pos followed by neg, so the labels are a run of +1 then a run of -1.
labels = [1] * len(pos) + [-1] * len(neg)

# 75/25 split; seeded so the held-out rows do not change from run to run.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [12]:
# C sweep on the de-duplicated tweets.
# NOTE(review): max_iter=30 is a very small iteration cap and convergence
# warnings are silenced by the warnings.warn override -- confirm it is intended.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # Seed the classifier so repeated runs print the same accuracy.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.811226148409894
Accuracy for 0.05: 0.8135724381625442
Accuracy for 0.25: 0.81210777385159
Accuracy for 0.5: 0.8095335689045936
Accuracy for 1: 0.7998321554770318


# stemming

In [13]:
# Reload the full training sets; each class is cleaned, de-duplicated and stemmed.
pos, neg = map(normalize_stemming, get_tweets())
tr = [*pos, *neg]

In [14]:
# Binary bag-of-words over the stemmed tweets.
cv3 = CV(binary=True)
# Learn the vocabulary and encode `tr` in a single pass.
tra = cv3.fit_transform(tr)

# Positives come first in `tr`: len(pos) labels of +1, then len(neg) of -1.
labels = [1] * len(pos) + [-1] * len(neg)

# 75/25 split with a fixed seed for a repeatable evaluation set.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [15]:
# C sweep on the stemmed tweets.
# NOTE(review): max_iter=30 is a very small iteration cap and convergence
# warnings are silenced by the warnings.warn override -- confirm it is intended.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # random_state pins the solver's shuffling so the printed score is repeatable.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8056272084805653
Accuracy for 0.05: 0.8078922261484099
Accuracy for 0.25: 0.8075424028268551
Accuracy for 0.5: 0.804821554770318
Accuracy for 1: 0.795291519434629


# lemmatization

In [16]:
# Reload the full training sets; each class is cleaned, de-duplicated and lemmatized.
pos, neg = map(normalize_lemmatization, get_tweets())
tr = [*pos, *neg]

In [17]:
# Binary bag-of-words over the lemmatized tweets.
cv3 = CV(binary=True)
# fit_transform avoids tokenising the corpus once for fit and again for transform.
tra = cv3.fit_transform(tr)

# Label rows by position: the first len(pos) rows of `tr` are positive.
labels = [1] * len(pos) + [-1] * len(neg)

# 75/25 split; the seed keeps the partition identical across runs.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [18]:
# C sweep on the lemmatized tweets.
# NOTE(review): max_iter=30 is a very small iteration cap and convergence
# warnings are silenced by the warnings.warn override -- confirm it is intended.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # Seeded for run-to-run repeatability of the fitted model.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8091448763250884
Accuracy for 0.05: 0.8118904593639577
Accuracy for 0.25: 0.81086925795053
Accuracy for 0.5: 0.8085335689045936
Accuracy for 1: 0.7976925795053004


# stop words

In [19]:
# Reload the full training sets, cleaned and de-duplicated per class, for the stop-word run.
pos, neg = map(remove_duplicates, get_tweets())
tr = [*pos, *neg]

In [20]:
# Binary bag-of-words that ignores the words returned by get_stop_words
# (the most frequent tokens across both classes).
cv3 = CV(binary=True, stop_words=get_stop_words(pos, neg))
# Learn the vocabulary and encode `tr` in a single pass.
tra = cv3.fit_transform(tr)

# `tr` is pos followed by neg: +1 for the first len(pos) rows, -1 for the rest.
labels = [1] * len(pos) + [-1] * len(neg)

# 75/25 split, seeded so the held-out set is reproducible.
tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75, random_state=42)

In [21]:
# C sweep with the most frequent words excluded from the vocabulary.
# NOTE(review): max_iter=30 is a very small iteration cap and convergence
# warnings are silenced by the warnings.warn override -- confirm it is intended.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    # Fixed seed so this sweep prints the same numbers on every run.
    svm = SVM(C=c, max_iter=30, random_state=42)
    svm.fit(tr_x, tr_labels)
    print("Accuracy for {}: {}".format(c, AS(te_labels, svm.predict(te_x))))

Accuracy for 0.01: 0.8084840989399293
Accuracy for 0.05: 0.8109010600706714
Accuracy for 0.25: 0.8095017667844523
Accuracy for 0.5: 0.8072968197879858
Accuracy for 1: 0.8010901060070671
