In [None]:
import os
import re
from collections import Counter
from itertools import chain

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Root directory prefix for every dataset path built in this notebook.
DATASET_PATH      = "Datasets/"
# Sub-directory appended right after DATASET_PATH (both keep their trailing "/",
# because paths are built by plain string concatenation).
DATASET_PATH_MORE = "twitter-datasets/"

# Base file names (without suffix/extension) of the two training files.
POS_DATASET  = "train_pos"
NEG_DATASET  = "train_neg"

# Suffix appended to the base name when the "full" variant is requested,
# followed by the extension shared by the training files.
FULL_DATASETS_EXT = "_full"
EXT = ".txt"

# Complete file name (extension included) of the test file.
TEST_FILE_ORIGINAL = "test_data.txt"

In [None]:
def read_tweets_file(train, pos, full):
    """Read one of the tweet files and return its lines as a list.

    The file lives under ``DATASET_PATH + DATASET_PATH_MORE``.

    Args:
        train: If true, read a training file; otherwise read the test file
            (``TEST_FILE_ORIGINAL``), in which case ``pos`` and ``full`` are
            ignored.
        pos: Training only. True selects ``POS_DATASET``, false selects
            ``NEG_DATASET``.
        full: Training only. If true, ``FULL_DATASETS_EXT`` is appended to the
            base file name.

    Returns:
        list[str]: One entry per line, trailing newline included. For the
        test file, everything up to and including the first comma is dropped
        (presumably a leading id column -- confirm against the data) and the
        remainder of the line is returned unchanged. A test line without any
        comma yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = DATASET_PATH + DATASET_PATH_MORE

    if train:
        path += POS_DATASET if pos else NEG_DATASET
        if full:
            path += FULL_DATASETS_EXT
        path += EXT
    else:
        path += TEST_FILE_ORIGINAL

    # Explicit encoding: the result must not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        if train:
            return list(file)
        # partition() splits on the first comma only, so commas inside the
        # tweet text survive (they used to be rewritten as apostrophes).
        return [line.partition(",")[2] for line in file]

def save_tweets_file(directory, tweets, train, pos, full):
    """Write tweets, one per line, to a file named like the source dataset.

    The target path is ``DATASET_PATH + DATASET_PATH_MORE + directory``
    followed by the same file name ``read_tweets_file`` would use for the
    given ``train`` / ``pos`` / ``full`` flags. Missing parent directories are
    created.

    Args:
        directory: Sub-directory for the output. It is concatenated as-is, so
            it has to end with a path separator (e.g. ``"cleaned/"``).
        tweets: Iterable of strings; a newline is appended to each one.
        train: If true, use a training file name; otherwise
            ``TEST_FILE_ORIGINAL`` (``pos`` and ``full`` are then ignored).
        pos: Training only. True selects ``POS_DATASET``, false selects
            ``NEG_DATASET``.
        full: Training only. If true, ``FULL_DATASETS_EXT`` is appended to the
            base file name.

    Raises:
        OSError: If the directory cannot be created or the file not written.
    """
    path = DATASET_PATH + DATASET_PATH_MORE + directory

    if train:
        path += POS_DATASET if pos else NEG_DATASET
        if full:
            path += FULL_DATASETS_EXT
        path += EXT
    else:
        path += TEST_FILE_ORIGINAL

    # open(..., 'w') does not create intermediate directories by itself.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(t + "\n" for t in tweets)
            
def get_train_pos_full():
    """Return the lines of the full positive training file (see read_tweets_file)."""
    return read_tweets_file(
        train=True,
        pos=True,
        full=True,
    )

def get_train_neg_full():
    """Return the lines of the full negative training file (see read_tweets_file)."""
    return read_tweets_file(
        train=True,
        pos=False,
        full=True,
    )

def get_train_pos():
    """Return the lines of the non-full positive training file (see read_tweets_file)."""
    return read_tweets_file(
        train=True,
        pos=True,
        full=False,
    )

def get_train_neg():
    """Return the lines of the non-full negative training file (see read_tweets_file)."""
    return read_tweets_file(
        train=True,
        pos=False,
        full=False,
    )

def get_test():
    """Return the tweets of the test file (see read_tweets_file with train=False)."""
    return read_tweets_file(
        train=False,
        pos=False,
        full=False,
    )

In [None]:
def simple_clean(t):
    """Normalize a single tweet string.

    Steps, in order:
      1. ``<user>`` becomes ``he`` and ``<url>`` becomes ``it``.
      2. ``" #"`` becomes ``" "`` and newlines are removed.
      3. The text is lower-cased and the characters ``. ; : ! ' ? , " ( ) [ ]``
         are deleted.
      4. Runs of two or more whitespace characters collapse to one space.
      5. A single leading space, if present, is removed.

    Args:
        t: The raw tweet.

    Returns:
        str: The cleaned tweet. It is the empty string when nothing is left
        after cleaning (e.g. a blank or punctuation-only line).
    """
    t = t.replace("<user>", "he")
    t = t.replace("<url>", "it")
    # Only a '#' preceded by a space is dropped; a hashtag at the very start
    # of the tweet keeps its '#'.
    t = t.replace(" #", " ")
    t = t.replace("\n", "")
    # Raw strings: "\[" and "\s" are invalid escapes in ordinary literals.
    t = re.sub(r'''[.;:!'?,"()\[\]]''', "", t.lower())
    t = re.sub(r'\s{2,}', ' ', t)

    # startswith() instead of t[0]: indexing raised IndexError on "".
    return t[1:] if t.startswith(" ") else t
    
def clean(tweets):
    """Apply simple_clean to every tweet and return the results as a new list."""
    return list(map(simple_clean, tweets))
    
def normalize_stemming(tweets):
    """Stem each whitespace-separated token of every tweet with a PorterStemmer.

    Tokens are re-joined with single spaces, so the original spacing of a
    tweet is not preserved. Returns a new list with one string per tweet.
    """
    stemmer = PorterStemmer()
    stemmed_tweets = []
    for tweet in tweets:
        stems = map(stemmer.stem, tweet.split())
        stemmed_tweets.append(" ".join(stems))
    return stemmed_tweets

def normalize_lemmatization(tweets):
    """Lemmatize each whitespace-separated token of every tweet with a WordNetLemmatizer.

    ``lemmatize`` is called with the word only (no part-of-speech argument).
    Tokens are re-joined with single spaces. Returns a new list with one
    string per tweet.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_tweets = []
    for tweet in tweets:
        lemmas = map(lemmatizer.lemmatize, tweet.split())
        lemmatized_tweets.append(" ".join(lemmas))
    return lemmatized_tweets

In [None]:
# Load the raw "full" positive / negative training tweets and the test tweets.
train_pos_full = get_train_pos_full()
train_neg_full = get_train_neg_full()

test = get_test()

In [None]:
# Apply simple_clean to every tweet; the raw lists are left untouched.
cleaned_train_pos_full = clean(train_pos_full)
cleaned_train_neg_full = clean(train_neg_full)

cleaned_test = clean(test)

In [None]:
# Persist the cleaned tweets under the "cleaned/" sub-directory.
save_tweets_file("cleaned/", cleaned_train_pos_full, train=True, pos=True, full=True)
save_tweets_file("cleaned/", cleaned_train_neg_full, train=True, pos=False, full=True)
save_tweets_file("cleaned/", cleaned_test, train=False, pos=False, full=False)

In [None]:
# Stemmed variant of the cleaned tweets ("s_" prefix).
s_cleaned_train_pos_full = normalize_stemming(cleaned_train_pos_full)
s_cleaned_train_neg_full = normalize_stemming(cleaned_train_neg_full)

s_cleaned_test = normalize_stemming(cleaned_test)

In [None]:
# Persist the stemmed tweets under "cleaned/stemming/".
save_tweets_file("cleaned/stemming/", s_cleaned_train_pos_full, train=True, pos=True, full=True)
save_tweets_file("cleaned/stemming/", s_cleaned_train_neg_full, train=True, pos=False, full=True)
save_tweets_file("cleaned/stemming/", s_cleaned_test, train=False, pos=False, full=False)

In [None]:
# Lemmatized variant of the cleaned tweets ("l_" prefix).
l_cleaned_train_pos_full = normalize_lemmatization(cleaned_train_pos_full)
l_cleaned_train_neg_full = normalize_lemmatization(cleaned_train_neg_full)

l_cleaned_test = normalize_lemmatization(cleaned_test)

In [None]:
# Persist the lemmatized tweets under "cleaned/lemmatization/".
save_tweets_file("cleaned/lemmatization/", l_cleaned_train_pos_full, train=True, pos=True, full=True)
save_tweets_file("cleaned/lemmatization/", l_cleaned_train_neg_full, train=True, pos=False, full=True)
save_tweets_file("cleaned/lemmatization/", l_cleaned_test, train=False, pos=False, full=False)

In [None]:
def get_stop_words(pos, neg):
    """Count word occurrences across both corpora, most frequent first.

    Despite the name, no filtering happens here: every distinct
    whitespace-separated token is returned together with its total count.

    Args:
        pos: Iterable of tweet strings.
        neg: Iterable of tweet strings; processed after ``pos``.

    Returns:
        list[tuple[str, int]]: ``(word, count)`` pairs sorted by descending
        count. Words with equal counts keep the order in which they were
        first seen.
    """
    counts = Counter()
    # chain() walks both inputs without building a concatenated copy, and
    # accepts any pair of iterables rather than only two lists.
    for tweet in chain(pos, neg):
        counts.update(tweet.split())

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

def save_stop_words(directory, stop_words):
    """Write ``(word, count)`` pairs to a text file, one ``"word count"`` per line.

    Args:
        directory: Despite the name, this is the output *file* path relative
            to ``DATASET_PATH + DATASET_PATH_MORE``
            (e.g. ``"cleaned/stop_words.txt"``).
        stop_words: Iterable of ``(word, count)`` pairs; ``word`` must be a
            string, ``count`` is converted with ``str()``.

    Raises:
        OSError: If the directory cannot be created or the file not written.
    """
    path = DATASET_PATH + DATASET_PATH_MORE + directory

    # open(..., 'w') does not create intermediate directories by itself.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(k + " " + str(v) + "\n" for k, v in stop_words)

In [None]:
# Export one word-frequency table per preprocessing variant, computed from
# the corresponding positive + negative training tweets.
for stop_words_file, pos_tweets, neg_tweets in [
    ("cleaned/stop_words.txt", cleaned_train_pos_full, cleaned_train_neg_full),
    ("cleaned/stemming/stop_words.txt", s_cleaned_train_pos_full, s_cleaned_train_neg_full),
    ("cleaned/lemmatization/stop_words.txt", l_cleaned_train_pos_full, l_cleaned_train_neg_full),
]:
    save_stop_words(stop_words_file, get_stop_words(pos_tweets, neg_tweets))