In [1]:
import numpy as np
import pandas as pd
import text_representation as repre
import helpers
import preprocessing as preproc

#### Preprocessing

In [2]:
# Read the raw positive and negative training tweets as lists of lines.
# The `with` statement closes both files when the block exits, so no explicit
# close() calls are needed.
with open('../twitter-datasets/train_pos.txt', 'r') as pos_in, \
        open('../twitter-datasets/train_neg.txt', 'r') as neg_in:
    pos_lines = pos_in.readlines()
    neg_lines = neg_in.readlines()

In [3]:
# Cleaning steps applied to every positive tweet, in this exact order.
pos_cleaning_steps = (
    preproc.remove_unnecessary,
    preproc.replace_contraction,
    preproc.replace_numbers,
    preproc.replace_emoji,
    # preproc.replace_elongated_word is currently disabled
)

pos_processed = []
for raw_tweet in pos_lines:
    cleaned = raw_tweet
    for step in pos_cleaning_steps:
        cleaned = step(cleaned)
    pos_processed.append(cleaned)

# Persist the cleaned tweets through the project helper.
helpers.write_file(pos_processed, 'train_pos_processed')

In [4]:
# Cleaning steps applied to every negative tweet, in this exact order.
neg_cleaning_steps = (
    preproc.remove_unnecessary,
    preproc.replace_contraction,
    preproc.replace_numbers,
    preproc.replace_emoji,
    # preproc.replace_elongated_word is currently disabled
)

neg_processed = []
for raw_tweet in neg_lines:
    cleaned = raw_tweet
    for step in neg_cleaning_steps:
        cleaned = step(cleaned)
    neg_processed.append(cleaned)

# Persist the cleaned tweets through the project helper.
helpers.write_file(neg_processed, 'train_neg_processed')

In [5]:
# Number of cleaned positive tweets, then negative tweets, one per line.
print(len(pos_processed), len(neg_processed), sep='\n')

100000
100000


#### Text Representation

In [6]:
# Load the embeddings through the project helper.
# NOTE(review): `embeddings` is not referenced by any other cell visible in
# this notebook — confirm whether this load is still needed.
embeddings = repre.load_embeddings()

In [7]:
# Lookup passed to the feature builders below; each token found in it
# contributes its value to the tweet's feature row.
# NOTE(review): presumably maps word -> embedding vector of width 20. If it
# maps word -> integer index instead, that scalar would be broadcast across the
# whole row — confirm against text_representation.
vocab_dict = repre.create_dict_from_provided_vocabulary()

In [8]:
# Build the feature matrix for the positive tweets from the processed file.
# NOTE(review): the path is assumed to be where helpers.write_file stored
# 'train_pos_processed', and 20 presumably matches the embedding width — confirm.
train_pos_path = '../twitter-datasets/train_pos_processed.txt'
pos_features = repre.create_tweet_features(train_pos_path, vocab_dict, shape_of_word_embeddings=20)

In [10]:
# Build the feature matrix for the negative tweets from the processed file.
# NOTE(review): the path is assumed to be where helpers.write_file stored
# 'train_neg_processed', and 20 presumably matches the embedding width — confirm.
train_neg_path = '../twitter-datasets/train_neg_processed.txt'
neg_features = repre.create_tweet_features(train_neg_path, vocab_dict, shape_of_word_embeddings=20)

In [11]:
# Drop row 0 of each feature matrix. The create_tweet_features defined later in
# this notebook starts its output with a placeholder row that is not a tweet;
# presumably repre.create_tweet_features does the same — TODO confirm.
# WARNING: this cell is not idempotent — every re-run discards one more row,
# so run it exactly once after the features are (re)built.
pos_features = pos_features[1:]
neg_features = neg_features[1:]

In [12]:
# Row counts of the positive and negative feature matrices.
pos_shape, neg_shape = len(pos_features), len(neg_features)

In [13]:
# Stack positives on top of negatives to form the design matrix.
X = np.vstack([pos_features, neg_features])

# Labels: start with -1 everywhere, then mark the leading positive rows as +1.
n_samples = pos_features.shape[0] + neg_features.shape[0]
y = np.full(n_samples, -1.0)
y[:pos_shape] = 1

In [73]:
# Sanity check: list the distinct label values present in y.
np.unique(y)

array([-1.,  1.])

In [116]:
# Display the number of negative feature rows.
# NOTE(review): the recorded output (99990) differs from the 100000 negative
# lines preprocessed above, and the execution counts are non-sequential — this
# may reflect stale kernel state; confirm with Restart & Run All.
neg_shape

99990

#### Predictions

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

In [75]:
# Fit a default-parameter logistic regression on the full training set. This
# fitted model is the one used for the test-set predictions further down.
clf = LogisticRegression().fit(X, y)

In [76]:
# 4-fold cross-validation with shuffling; the fixed random_state makes the fold
# assignment reproducible.
# Alternative splitter (disabled):
# rs = ShuffleSplit(n_splits=4, test_size=.25, random_state=0)
kf = KFold(n_splits=4, shuffle=True, random_state=0)

In [77]:
# Score the classifier on each of the folds defined by `kf`.
scores = cross_val_score(clf, X, y, cv=kf)

# Report the mean fold accuracy with a two-standard-deviation spread.
mean_accuracy = scores.mean()
two_sigma = 2 * scores.std()
print("Accuracy: %0.4f (+/- %0.4f)" % (mean_accuracy, two_sigma))

Accuracy: 0.5966 (+/- 0.0039)


##### Load test set and predict

In [107]:
test_path = '../twitter-datasets/test_data.txt'

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(test_path, 'r') as test_in:
    test_lines = test_in.readlines()

# Keep only the text after the first comma of every line (the leading field is
# presumably a tweet id — confirm). Lines without a comma become ''.
test_lines = [line.partition(',')[2] for line in test_lines]

# Apply the same cleaning steps, in the same order, as for the training tweets.
# (preproc.replace_elongated_word is currently disabled.)
test_processed = []
for line in test_lines:
    pro_line = preproc.remove_unnecessary(line)
    pro_line = preproc.replace_contraction(pro_line)
    pro_line = preproc.replace_numbers(pro_line)
    pro_line = preproc.replace_emoji(pro_line)
    test_processed.append(pro_line)

# Persist the cleaned test tweets through the project helper.
helpers.write_file(test_processed, 'test_data_processed')

In [108]:
def create_tweet_features(path_to_file, vocab_dict, shape_of_word_embeddings):
    '''Build one averaged feature vector per line of a tweet file.

    Each line is split on single spaces. Every token found in ``vocab_dict``
    contributes its value; tokens that are not found contribute zeros. The
    tweet's feature vector is the mean over all of its tokens, including the
    ones that were not found.

    Parameters
    ----------
    path_to_file : str
        Path of the text file to read, one tweet per line.
    vocab_dict : dict
        Maps a token to the value written into that token's row (presumably
        an embedding vector of length ``shape_of_word_embeddings`` — confirm).
    shape_of_word_embeddings : int
        Width of the feature vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_lines + 1, shape_of_word_embeddings)``.
        Row 0 is an all-zero placeholder that is not a tweet; it is kept
        because existing callers strip it with ``features[1:]``.

    Notes
    -----
    The last token of a line keeps its trailing newline, so it only matches
    keys that contain that newline too. This is left unchanged so that the
    features stay comparable with ones computed earlier.
    NOTE(review): this appears to duplicate ``repre.create_tweet_features``,
    which built the training features — confirm the two stay in sync.
    '''
    # Placeholder first row: zeros instead of uninitialised memory, so the
    # result is deterministic even before callers drop it.
    rows = [np.zeros(shape=shape_of_word_embeddings)]
    with open(path_to_file, 'r') as file_in:
        for line in file_in:
            tweet = line.split(' ')
            tweet_features = np.zeros(shape=(len(tweet), shape_of_word_embeddings))
            for i, token in enumerate(tweet):
                if token in vocab_dict:
                    tweet_features[i] = vocab_dict[token]
            rows.append(np.mean(tweet_features, axis=0))
    # Stack once at the end; stacking inside the loop re-copies the whole
    # array for every tweet.
    return np.vstack(rows)

In [109]:
# Build the test-set feature matrix from the processed test file.
# NOTE(review): this calls the create_tweet_features defined in this notebook,
# whereas the training features came from repre.create_tweet_features —
# confirm both compute identical features.
test_processed_path = '../twitter-datasets/test_data_processed.txt'
test_features = create_tweet_features(test_processed_path, vocab_dict, shape_of_word_embeddings=20)

In [110]:
# test_features

In [111]:
# Drop row 0: create_tweet_features starts its result with a placeholder row
# that is not a tweet.
# WARNING: this cell is not idempotent — every re-run discards one more row,
# so run it exactly once after test_features is (re)built.
test_features = test_features[1:]
print(test_features.shape)

(10000, 20)


In [112]:
# Predict test labels with the classifier that was fitted on the full training set.
y_pred = clf.predict(test_features)

In [113]:
# Hand the predictions to the project helper; presumably it writes the
# submission CSV (output path and format are defined in helpers — confirm).
helpers.create_submission_csv(y_pred)

In [114]:
# Sanity check: list the distinct predicted label values.
np.unique(y_pred)

array([-1.,  1.])