In [1]:
import numpy as np
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from helpers import *

# Do some cleaning before embedding and co-occurrences

In [2]:
def clean_txt(filename, unique, spelling, spelling_dict, negation, filtered, common):
    """
    Create a cleaned copy of a text file of tweets (one tweet per line).

    The result is written next to the input, with "_clean" inserted before
    the file extension (e.g. "tweets.txt" -> "tweets_clean.txt"). An existing
    file with that name is overwritten.

    <filename> is the file we want to clean
    <unique>, a boolean, to say if we want to remove identical tweets
        (the first occurrence is kept, so the output order is deterministic)
    <spelling>, a boolean, to say if we want to replace some words in the tweets
    <spelling_dict>, a dictionary, contains which words should be replaced and by what
    <negation>, a boolean, to say if we want to transform verbs like don't into do not
    <filtered>, a boolean, to say if we want to remove common words
    <common>, a list of common words to remove

    The spelling, negation and filtering steps are delegated to
    transform_spelling, transform_negation and remove_words, which are
    defined outside this function and receive each tweet as a list of tokens.
    """
    # Imported locally so the function does not depend on `os` leaking in
    # through a star import.
    import os

    # Derive the output name from the extension rather than str.replace:
    # for a name without ".txt" the replace was a no-op, the output name
    # equalled the input name, and the input was deleted before being read.
    root, ext = os.path.splitext(filename)
    new_filename = root + "_clean" + ext

    # Read everything up front; `with` closes the handle even if a later
    # step raises.
    with open(filename, "r") as prev_f:
        # Drop the line terminator before de-duplicating so that a final
        # line without "\n" still matches an identical earlier tweet.
        tweets = [line.rstrip("\n") for line in prev_f]

    if unique:
        # Order-preserving de-duplication (a plain set() would shuffle the
        # tweets differently from run to run).
        seen = set()
        deduped = []
        for t in tweets:
            if t not in seen:
                seen.add(t)
                deduped.append(t)
        tweets = deduped

    # Split each tweet on single spaces into its tokens.
    tweets = [t.split(" ") for t in tweets]

    if spelling:
        tweets = [transform_spelling(t, spelling_dict) for t in tweets]

    if negation:
        tweets = [transform_negation(t) for t in tweets]

    if filtered:
        tweets = [remove_words(common, t) for t in tweets]

    # Mode "w" creates the file or truncates an existing one, so no prior
    # removal is needed.
    with open(new_filename, "w") as new_f:
        for t in tweets:
            new_f.write(" ".join(t) + "\n")

In [54]:
#test
# Small fixtures: two word replacements and two tokens to drop.
spelling_test = dict(bla="ble", blo="blu")
common_test = ["!", "plop"]

# Run every cleaning step on the sample file.
clean_txt(
    filename="clean_test.txt",
    unique=True,
    spelling=True,
    spelling_dict=spelling_test,
    negation=True,
    filtered=True,
    common=common_test,
)

In [4]:
# Clean the test set with every step except de-duplication.
# NOTE(review): spelling_dict and common are only assigned in a later cell
# (In [7]), so that cell has to run before this one.
clean_txt(
    filename="Datasets/test_data.txt",
    unique=False,
    spelling=True,
    spelling_dict=spelling_dict,
    negation=True,
    filtered=True,
    common=common,
)

# Create embeddings and co-occurrences

In [None]:
#Create embeddings with glove_solution
from glove_solution import main as glove_main

In [None]:
# Raw data
# Option 1 of 2 for the file handed to glove_main(file_name) further down
# (presumably the co-occurrence data built from the uncleaned tweets - TODO confirm).
# Run either this cell or the "Cleaned data" cell, not both.
file_name = 'cooc.pkl'

In [None]:
# Cleaned data
# Option 2 of 2 for the file handed to glove_main(file_name) further down
# (presumably the co-occurrence data built from the cleaned tweets - TODO confirm).
# If both option cells are run, this later assignment wins.
file_name = 'cooc_clean.pkl'

In [None]:
# Call glove_solution.main (imported as glove_main) on whichever file_name
# was assigned last.
glove_main(file_name)

In [2]:
from pattern.en import *

In [3]:
# Load the embedding array from embeddings.npy and display its shape
# (the recorded output for this cell is (21161, 250)).
word_embeddings = np.load("embeddings.npy")
word_embeddings.shape

(21161, 250)

In [4]:
# Load the vocabulary from vocab_cut.txt.
# NOTE(review): this rebinds the name `words_list` from the loader function to
# its return value, so running this cell a second time without restarting the
# kernel will try to call the loaded result instead of the loader.
words_list = words_list("vocab_cut.txt")

In [5]:
# Read the positive and negative training tweets with the tweets_txt helper
# (defined outside this notebook; presumably it returns the tweets as text - TODO confirm).
tweets_pos_txt = tweets_txt("Datasets/twitter-datasets/train_pos.txt")
tweets_neg_txt = tweets_txt("Datasets/twitter-datasets/train_neg.txt")

# Do some data cleaning after embedding and co-occurrences

In [59]:
#Tests
# Four sample tweets, three of which are identical, to exercise each helper.
test_tweets = [
    "Premier tweet do plop don't",
    "Deuxième blah cannot tweet can't blah",
    "Premier tweet do plop don't",
    "Premier tweet do plop don't",
]

#Remove duplicates :
unique = remove_duplicated_tweets_txt(test_tweets)
print(unique)

#Spelling :
# Each tweet is split on spaces before the replacements are applied.
spelling_test = {"blah": "bleb", "plop": "splash"}
spelling = [transform_spelling(tweet.split(" "), spelling_test) for tweet in unique]
print(spelling)

#Negation
negated = [transform_negation(tokens) for tokens in spelling]
print(negated)

#Filter :
common_test = ["tweet", "Premier"]
filtered = [remove_words(common_test, tokens) for tokens in negated]
print(filtered)

["Deuxième blah cannot tweet can't blah" "Premier tweet do plop don't"]
[['Deuxième', 'bleb', 'cannot', 'tweet', "can't", 'bleb'], ['Premier', 'tweet', 'do', 'splash', "don't"]]
[['Deuxième', 'bleb', 'cannot', 'tweet', 'can', 'not', 'bleb'], ['Premier', 'tweet', 'do', 'splash', 'do', 'not']]
[['Deuxième', 'bleb', 'cannot', 'can', 'not', 'bleb'], ['do', 'splash', 'do', 'not']]


In [6]:
#Remove duplicates
# De-duplicate each class separately with remove_duplicated_tweets_txt; the
# results are the inputs of the filtered tweet_means cell further down.
unique_tweets_pos_txt = remove_duplicated_tweets_txt(tweets_pos_txt)
unique_tweets_neg_txt = remove_duplicated_tweets_txt(tweets_neg_txt)

In [7]:
#Spelling changes
# Informal token -> replacement token.
spelling_dict = {
    "u": "you",
    "dont": "don't",
    "cant": "can't",
    "r": "are",
    "wont": "won't",
}

#Common words to remove
# Punctuation, the <url> placeholder and a few frequent function words.
common = [
    '"', ',', '.', ')', '(', '-',
    "<url>", "a", "the", "of", "to",
    "it", "this", "that", "these", "there",
]

In [8]:
#Filter tweets
# Run tweet_means on the de-duplicated positive and negative tweets.
# tweet_means is defined outside this notebook; presumably the trailing
# arguments mirror clean_txt's (spelling, spelling_dict, negation, filtered,
# common) options - TODO confirm against its definition.
# word_embeddings[0].shape[0] is the length of one embedding row.
tweets_pos_vec_clean = tweet_means(unique_tweets_pos_txt, word_embeddings,
                             words_list, word_embeddings[0].shape[0],
                             True, spelling_dict, 
                             True, 
                             True, common)
tweets_neg_vec_clean = tweet_means(unique_tweets_neg_txt, word_embeddings, 
                             words_list, word_embeddings[0].shape[0], 
                             True, spelling_dict, 
                             True, 
                             True, common)

# Persist both results; np.save appends ".npy" to these names, which is how
# the training section loads them back.
np.save('tweets_pos_vec_clean', tweets_pos_vec_clean)
np.save('tweets_neg_vec_clean', tweets_neg_vec_clean)

In [None]:
#Means for tweets not filtered
# Same tweet_means call as the filtered variant, but on the tweets as loaded
# and without the optional cleaning arguments.
tweets_pos_vec = tweet_means(
    tweets_pos_txt,
    word_embeddings, words_list, word_embeddings[0].shape[0])
# Fixed: this call used `tweets_new_txt`, a name that no cell shown here
# assigns; the negative tweets are loaded as `tweets_neg_txt`.
tweets_neg_vec = tweet_means(
    tweets_neg_txt,
    word_embeddings, words_list, word_embeddings[0].shape[0])

# Persist both results (np.save appends ".npy").
np.save('tweets_pos_vec', tweets_pos_vec)
np.save('tweets_neg_vec', tweets_neg_vec)

In [None]:
# NOTE(review): these two calls repeat the np.save lines that end the
# previous cell, so this cell looks redundant.
np.save('tweets_pos_vec', tweets_pos_vec)
np.save('tweets_neg_vec', tweets_neg_vec)

# TRAINING TIME

In [9]:
# Load the arrays saved by the filtered pipeline (the "*_clean.npy" files).
# NOTE(review): they are bound to tweets_pos_vec / tweets_neg_vec, the same
# names the unfiltered tweet_means cell assigns, so whichever cell ran last
# decides what these names hold.
tweets_pos_vec = np.load('tweets_pos_vec_clean.npy')
tweets_neg_vec = np.load('tweets_neg_vec_clean.npy')

In [None]:
#TO USE IF ALGO ASKS LABEL TO BE 1 AND -1
# Positive rows come first, then negative rows; y follows the same order
# with +1 for positives and -1 for negatives.
X = np.concatenate([tweets_pos_vec, tweets_neg_vec])
y = np.concatenate([
    np.ones(len(tweets_pos_vec)),
    -np.ones(len(tweets_neg_vec)),
])

In [10]:
#TO USE IF ALGO ASKS LABEL TO BE 1 AND 0
# Positive rows come first, then negative rows; y follows the same order
# with 1 for positives and 0 for negatives.
X = np.concatenate([tweets_pos_vec, tweets_neg_vec])
y = np.concatenate([
    np.ones(len(tweets_pos_vec)),
    np.zeros(len(tweets_neg_vec)),
])

In [None]:
# Scorer callable for cross_val_score's `scoring` argument; scikit-learn
# calls such a callable as scorer(estimator, X, y).
def accuracy(estimator, X, y):
    """Return the fraction of samples in X whose prediction equals y."""
    predicted = estimator.predict(X)
    matches = y == predicted
    return np.mean(matches)

In [None]:
# Seed NumPy's global RNG. The notebook imports numpy as `np`, so the
# previous `numpy.random.seed(42)` referred to an unbound name.
np.random.seed(42)
# Placeholder estimator: the original cell left this as `???`, which is not
# valid Python. Swap in whichever classifier should be cross-validated.
clf = LogisticRegression()
# Five random 70/30 train/test splits with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
# X and y are passed positionally *before* the keyword arguments; the
# original mixed `estimator=clf` with later positionals, a SyntaxError.
scores = cross_val_score(clf, X, y, scoring=accuracy, cv=cv)

In [11]:
# VERSION SAMUEL : single shuffled hold-out split, then fit and score each classifier

j = 0.9  # fraction of the samples used for training

# Seed the global NumPy RNG so the permutation is the same on every run
# (scikit-learn estimators created with random_state=None draw from this
# global generator as well).
np.random.seed(42)
indices = np.random.permutation(len(y))

# Index through the permutation instead of rebinding X and y: the cell then
# gives the same split when re-run, and the second shuffle the original
# performed after splitting (which affected neither split) is not needed.
split = int(j*len(y))
train_X = X[indices[:split]]
train_y = y[indices[:split]]
test_X = X[indices[split:]]
test_y = y[indices[split:]]

def label_accuracy(y1, y2):
    """Return the fraction of positions at which y1 and y2 are equal.

    Named differently from the `accuracy(estimator, X, y)` scorer defined
    for cross_val_score so that running this cell does not shadow it.
    """
    return np.mean(y1==y2)

def fit_and_report(name, model):
    """Fit `model` on the training split, print its test accuracy, return it."""
    model.fit(train_X, train_y)
    print(name + " : " + str(label_accuracy(test_y, model.predict(test_X))))
    return model

lsvc = fit_and_report("SVM", svm.LinearSVC(max_iter=10000))
sgdcl = fit_and_report("SGD", SGDClassifier())
lr = fit_and_report("LR", LogisticRegression())
mlp = fit_and_report("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000))
# The original print for this model lacked the " : " separator, so the score
# was glued to the label.
mlp = fit_and_report(
    "MLP : hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu'",
    MLPClassifier(hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu'))

SVM : 0.6342579826835052
SGD : 0.6387249765620692




LR : 0.6327689847239839
MLP : 0.6565378040037501
MLP : hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu'0.6565378040037501


In [None]:
# Notes: accuracy after cleaning (after embedding) + Samuel pipeline (embedding = 20 features)

# Spelling changes
spelling_dict = {
    "u": "you",
    "dont": "don't",
    "cant": "can't",
    "r": "are",
    "wont": "won't",
}

# Common words removed (this variant also drops "<user>")
common = [
    '"', ',', '.', ')', '(', '-',
    "<user>", "<url>", "a", "the", "of", "to",
    "it", "this", "that", "these", "there",
]
# Means computed with spelling, negation and clean set to true

# Results : (none recorded)


In [None]:
# Notes: accuracy after cleaning (after embedding) + Samuel pipeline (embedding = 20 features)
# Without removing user -> better

# Spelling changes
spelling_dict = {
    "u": "you",
    "dont": "don't",
    "cant": "can't",
    "r": "are",
    "wont": "won't",
}

# Common words removed ("<user>" is kept in the tweets here)
common = [
    '"', ',', '.', ')', '(', '-',
    "<url>", "a", "the", "of", "to",
    "it", "this", "that", "these", "there",
]
# Means computed with spelling, negation and clean set to true

# Results : (none recorded)


In [None]:
# Accuracy after clean (after embedding) + Samuel pipeline (embedding = 250 features)
# Without removing user -> better

# Spelling changes
spelling_dict = {"u" : "you", "dont" : "don't", "cant" : "can't", "r" : "are", "wont" : "won't"}

# Common words removed
common = ['"', ',', '.', ')', '(', '-',
          "<url>", "a", "the", "of", "to",
          "it", "this", "that", "these", "there"]
# Means with spelling, negation and clean to true

# Results (pasted output, kept as comments: as bare text these lines made the
# cell a SyntaxError):
# SVM : 0.6342579826835052
# SGD : 0.6387249765620692
# LR : 0.6327689847239839
# MLP : 0.6565378040037501
# MLP : hidden_layer_sizes=(400,200,100), max_iter=1000, activation='relu'0.6565378040037501