In [1]:
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split as TrTeS
from sklearn.metrics import accuracy_score as AS
from nltk.stem.porter import PorterStemmer as PS
from nltk.stem import WordNetLemmatizer as WNL

import re

In [2]:
DATASET_PATH      = "Datasets/"
DATASET_PATH_MORE = "twitter-datasets/"

POS_DATASET  = "train_pos"
NEG_DATASET  = "train_neg"

FULL_DATASETS_EXT = "_full"
EXT = ".txt"

TEST_FILE_ORIGINAL = "test_data.txt"

SUBMISSION_FILE = "submission.csv"

In [3]:
def simple_clean(t):
    t = t.replace("<user>", "")
    t = t.replace("<url>", "")
    t = t.replace("\n", "")
    t = re.compile("[.;:!\'?,\"()\[\]]").sub("", t.lower())
    
    return t
    
def clean(tweets):
    return [simple_clean(t) for t in tweets]

def remove_duplicates(tweets):
    return clean(list(set(tweets)))

def normalize_stemming(tweets):
    s = PS()
    return [" ".join([s.stem(w) for w in t.split()]) for t in remove_duplicates(tweets)]

def normalize_lemmatization(tweets):
    l = WNL()
    return [" ".join([l.lemmatize(w) for w in t.split()]) for t in remove_duplicates(tweets)]

def get_stop_words(pos, neg):
    stop_words = {}
    for t in pos+neg:
        for w in t.split():
            if w in stop_words:
                stop_words[w] += 1
            else:
                stop_words[w]  = 1
                
    stop_words = [(k,v) for k,v in stop_words.items()]
    stop_words.sort(key=lambda t : t[1], reverse=True)
    
    return [t[0] for t in stop_words][:10]

def read_tweets_file(pos=True, full=True):
    path = DATASET_PATH+DATASET_PATH_MORE
    
    if pos:
        path += POS_DATASET
    else:
        path += NEG_DATASET
    
    if full:
        path += FULL_DATASETS_EXT
    
    path += EXT
    
    tweets = []
    with open(path, 'r') as file:
        for line in file:
            tweets.append(line)
    
    return tweets

def get_tweets(full=True):
    return read_tweets_file(True,full), read_tweets_file(False,full)

def get_submission_tweets():
    tweets = []
    with open(DATASET_PATH+TEST_FILE_ORIGINAL, 'r') as file:
        for line in file:
            tweets.append(",".join(line.split(",")[1:]))
    
    return tweets

# dirty data

In [4]:
pos, neg = get_tweets()
tr       = pos+neg

In [5]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [6]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.7960112
Accuracy for 0.05: 0.7993824
Accuracy for 0.25: 0.8000896
Accuracy for 0.5: 0.8002784
Accuracy for 1: 0.7989648


# clean data

In [7]:
pos, neg = get_tweets()
pos, neg = clean(pos), clean(neg)
tr       = pos+neg

In [8]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [9]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.8032704
Accuracy for 0.05: 0.8045456
Accuracy for 0.25: 0.8044304
Accuracy for 0.5: 0.8034944
Accuracy for 1: 0.8040016


# no duplicates

In [10]:
pos, neg = get_tweets()
pos, neg = remove_duplicates(pos), remove_duplicates(neg)
tr       = pos+neg

In [11]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [12]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.7958162224442014
Accuracy for 0.05: 0.7965085858345622
Accuracy for 0.25: 0.7965508675683246
Accuracy for 0.5: 0.7965614380017653
Accuracy for 1: 0.7965138710512825


# stemming

In [13]:
pos, neg = get_tweets()
pos, neg = normalize_stemming(pos), normalize_stemming(neg)
tr       = pos+neg

In [14]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [15]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.7937620348789068
Accuracy for 0.05: 0.7939223531194229
Accuracy for 0.25: 0.7939047357303552
Accuracy for 0.5: 0.7938465983464319
Accuracy for 1: 0.7939205913805162


# lemmatization

In [16]:
pos, neg = get_tweets()
pos, neg = normalize_lemmatization(pos), normalize_lemmatization(neg)
tr       = pos+neg

In [17]:
cv3 = CV(binary=True)
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [18]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.7945107739142844
Accuracy for 0.05: 0.7966142901689683
Accuracy for 0.25: 0.797054724895661
Accuracy for 0.5: 0.797438783977337
Accuracy for 1: 0.7973859318101338


# stop words

In [19]:
pos, neg = get_tweets()
pos, neg = remove_duplicates(pos), remove_duplicates(neg)
tr       = pos+neg

In [20]:
cv3 = CV(binary=True, stop_words=get_stop_words(pos,neg))
cv3.fit(tr)

tra = cv3.transform(tr)

labels = [1 if i < len(pos) else -1 for i in range(len(pos)+len(neg))]

tr_x, te_x, tr_labels, te_labels = TrTeS(tra, labels, train_size=0.75)

In [21]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LR(C=c, max_iter=30)
    lr.fit(tr_x, tr_labels)
    print ("Accuracy for {}: {}".format(c, AS(te_labels, lr.predict(te_x))))

Accuracy for 0.01: 0.7968944066551449
Accuracy for 0.05: 0.7981470030178588
Accuracy for 0.25: 0.7997854202011554
Accuracy for 0.5: 0.7998964097522819
Accuracy for 1: 0.7995652028378091
