In [1]:
import numpy as np
import pandas as pd
import text_representation as repre
import helpers
import preprocessing as preproc
import time

#### Preprocessing

In [None]:
# Run the preprocessing step on both raw training files, positive first.
for raw_train_path in (
    '../twitter-datasets/train_pos.txt',
    '../twitter-datasets/train_neg.txt',
):
    preproc.do_preprocessing(raw_train_path)

In [2]:
# Fetch the processed training set as a (lines, y) pair from the preprocessing module.
# NOTE(review): the meaning of the positional False flag is defined in
# preproc.return_processed_trainset_and_y and is not visible here - confirm.
# NOTE(review): `y` is rebound further down when labels are rebuilt from the
# embedding feature matrices; `lines` is only consumed by the CountVectorizer cell.
lines, y = preproc.return_processed_trainset_and_y(False)

#### Text Representation

In [3]:
# Wall-clock start of the text-representation stage; the elapsed-time cell
# at the end of this section subtracts this value from time.time().
start_time = time.time()

In [None]:
# Load the embeddings and the vocabulary dictionary through the
# text_representation helpers.
# NOTE(review): `embeddings` is not referenced again in the visible cells -
# confirm whether the feature builders read it some other way or it can go.
embeddings = repre.load_embeddings()
# vocab_dict is passed to every create_*_tweet_features call below.
vocab_dict = repre.create_dict_from_provided_vocabulary()

In [None]:
# Build the feature matrix for the processed positive tweets.
train_pos_path = '../twitter-datasets/train_pos_processed.txt'
pos_features = repre.create_tweet_features(
    train_pos_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # must agree with the value used for the other feature cells
)

In [None]:
# Build the feature matrix for the processed negative tweets.
train_neg_path = '../twitter-datasets/train_neg_processed.txt'
neg_features = repre.create_tweet_features(
    train_neg_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # must agree with the value used for the other feature cells
)

In [None]:
# Discard the first row of each training feature matrix.
# NOTE(review): presumably row 0 is a placeholder left over from how
# create_tweet_features accumulates rows - confirm against that helper.
# WARNING: this cell rebinds the same names it reads, so it is not idempotent;
# running it a second time silently drops one more (real) row from each matrix.
pos_features = pos_features[1:]
neg_features = neg_features[1:]

In [None]:
# Row counts of the positive and negative feature matrices; pos_shape is the
# split index used when the label vector is filled in.
pos_shape, neg_shape = pos_features.shape[0], neg_features.shape[0]

In [None]:
# Stack the positive rows on top of the negative rows.
X = np.vstack((pos_features, neg_features))

# One label per stacked row: +1 for the first pos_shape rows, -1 for the rest.
# pos_shape is expected to equal pos_features.shape[0] (set in the cell above).
total_rows = pos_features.shape[0] + neg_features.shape[0]
y = np.where(np.arange(total_rows) < pos_shape, 1.0, -1.0)

In [None]:
# Sanity check: show the distinct label values present in y
# (the label-building cell assigns only 1 and -1).
np.unique(y)

In [None]:
# Report how long the text-representation stage took since start_time was set.
elapsed_seconds = round(time.time() - start_time)
elapsed_time = divmod(elapsed_seconds, 60)  # (whole minutes, leftover seconds)
whole_minutes, leftover_seconds = elapsed_time
print('------\nElapsed time: {m} min {s} sec\n'.format(m=whole_minutes, s=leftover_seconds))

#### Predictions

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words token counts for the processed training lines.
#
# The counts are deliberately stored under their own name instead of
# rebinding `X`. `X` already holds the embedding-based feature matrix whose
# rows line up with the label vector `y` built alongside it, and the test
# features used for the final prediction are produced by the same embedding
# pipeline. Overwriting `X` with this sparse count matrix would make the
# classifier train on a different feature space (one column per vocabulary
# term) than the one it is later asked to predict on, and would pair the
# counts with labels that were constructed for the embedding rows.
#
# NOTE(review): to train on bag-of-words instead, fit on
# (X_counts, labels returned together with `lines`) and transform the test
# text with `vectorizer.transform` before predicting.
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(lines)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Logistic regression with scikit-learn defaults, trained on the full training set.
clf = LogisticRegression()
clf.fit(X, y)  # fit() returns the estimator itself, so clf is the fitted model

In [None]:
# 4-fold cross-validation with shuffling; the fixed seed keeps the folds
# reproducible between runs. (An earlier draft used
# ShuffleSplit(n_splits=4, test_size=.25, random_state=0) instead.)
kf = KFold(
    n_splits=4,
    random_state=0,
    shuffle=True,
)

In [None]:
# Score the classifier on each fold produced by kf, then summarise the fold
# scores as mean +/- two standard deviations.
scores = cross_val_score(clf, X, y, cv=kf)
mean_accuracy = scores.mean()
accuracy_spread = scores.std() * 2
print("Accuracy: %0.4f (+/- %0.4f)" % (mean_accuracy, accuracy_spread))

#### Load test set and predict

In [None]:
# Run the preprocessing step on the raw test file.
# NOTE(review): test_file=True presumably switches do_preprocessing to the
# test-file layout - confirm in the preprocessing module.
preproc.do_preprocessing('../twitter-datasets/test_data.txt', test_file=True)

In [None]:
# Build the feature matrix for the processed test tweets, reusing the
# vocabulary dictionary created for the training features.
test_processed_path = '../twitter-datasets/test_data_processed.txt'
test_features = repre.create_test_tweet_features(
    test_processed_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # same value as passed for the training feature cells
)

In [None]:
# Discard the first row of the test feature matrix, mirroring what is done
# for the training feature matrices.
# NOTE(review): presumably row 0 is a placeholder from the feature builder - confirm.
# WARNING: not idempotent - re-running this cell drops another (real) row.
test_features = test_features[1:]
# Show the resulting shape as a quick check on the number of test rows.
print(test_features.shape)

In [None]:
# Predict a label for every test row.
# NOTE(review): clf must have been fitted on features built the same way as
# test_features (same pipeline, same number of columns); confirm that the X
# used at fit time is the embedding matrix and not another representation.
y_pred = clf.predict(test_features)

In [None]:
# Hand the predictions to the helpers module to produce the submission CSV.
# NOTE(review): output location and format are decided inside
# helpers.create_submission_csv - not visible here.
helpers.create_submission_csv(y_pred)

In [None]:
# Sanity check: show which distinct labels the classifier actually predicted.
np.unique(y_pred)