In [1]:
import numpy as np
import pandas as pd
import text_representation as repre
import helpers
import preprocessing as preproc
import time

#### Preprocessing

In [2]:
# Preprocess both raw training files, positive file first, then negative.
# NOTE(review): later cells read the matching `*_processed.txt` files —
# presumably written by this step; confirm in preprocessing.py.
for raw_train_path in ('../twitter-datasets/train_pos.txt',
                       '../twitter-datasets/train_neg.txt'):
    preproc.do_preprocessing(raw_train_path)

#### Text Representation

In [4]:
# Start the wall-clock timer; the elapsed time is reported once the feature
# matrices have been built.
start_time = time.time()

# NOTE(review): `embeddings` is not referenced again in the visible cells —
# confirm it is still needed before keeping this load.
embeddings = repre.load_embeddings()
# Vocabulary dictionary passed to the tweet-feature builders below.
vocab_dict = repre.create_dict_from_provided_vocabulary()

In [5]:
# Build the feature matrix for the positive training tweets from the
# preprocessed file, using the vocabulary dictionary loaded above.
train_pos_path = '../twitter-datasets/train_pos_processed.txt'
pos_features = repre.create_tweet_features(
    train_pos_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # NOTE(review): presumably the embedding dimension — confirm
)

In [6]:
# Build the feature matrix for the negative training tweets from the
# preprocessed file, using the vocabulary dictionary loaded above.
train_neg_path = '../twitter-datasets/train_neg_processed.txt'
neg_features = repre.create_tweet_features(
    train_neg_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # NOTE(review): presumably the embedding dimension — confirm
)

In [7]:
# Discard row 0 of each training feature matrix.
# NOTE(review): presumably row 0 is a placeholder left by
# create_tweet_features — confirm. This cell is NOT idempotent: every re-run
# drops one more row, so rebuild the features before running it again.
pos_features, neg_features = pos_features[1:], neg_features[1:]

In [8]:
# Number of rows per class; pos_shape is the boundary between the positive
# and negative rows when the label vector is filled in.
pos_shape, neg_shape = pos_features.shape[0], neg_features.shape[0]

In [9]:
# Stack the positive rows on top of the negative rows and build the matching
# label vector: +1 for the first pos_shape rows, -1 for every row after them.
X = np.vstack([pos_features, neg_features])
n_samples = pos_features.shape[0] + neg_features.shape[0]
y = np.where(np.arange(n_samples) < pos_shape, 1.0, -1.0)

In [10]:
# Sanity check: show the distinct label values present in y.
np.unique(y)

array([-1.,  1.])

In [11]:
# Display the number of negative feature rows.
neg_shape

99985

In [12]:
# Report the wall-clock time since start_time as whole minutes and seconds.
elapsed_seconds = round(time.time() - start_time)
elapsed_time = divmod(elapsed_seconds, 60)
minutes, seconds = elapsed_time
print('------\nElapsed time: {m} min {s} sec\n'.format(m=minutes, s=seconds))

------
Elapsed time: 3 min 22 sec



#### Predictions

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

In [14]:
# Logistic-regression baseline with default hyper-parameters, trained on the
# full training matrix. `clf` is the fitted estimator used for the test
# predictions further down.
log_reg = LogisticRegression()
clf = log_reg.fit(X, y)

In [15]:
# 4-fold cross-validation splitter; shuffling with a fixed seed keeps the
# fold assignment reproducible between runs.
kf = KFold(
    n_splits=4,
    shuffle=True,
    random_state=0,
)

In [16]:
# Score the classifier on every KFold split, then report the mean score
# together with a spread of two standard deviations.
scores = cross_val_score(clf, X, y, cv=kf)
mean_accuracy = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.4f (+/- %0.4f)" % (mean_accuracy, two_std))

Accuracy: 0.5992 (+/- 0.0033)


###### LOAD TEST SET AND PREDICT

In [17]:
# Preprocess the raw test tweets, passing test_file=True to do_preprocessing.
# NOTE(review): what test_file=True changes is not visible here — see
# preprocessing.py.
preproc.do_preprocessing('../twitter-datasets/test_data.txt', test_file=True)

In [18]:
# Build the feature matrix for the preprocessed test tweets with the same
# vocabulary dictionary and embedding size as the training features.
test_processed_path = '../twitter-datasets/test_data_processed.txt'
test_features = repre.create_test_tweet_features(
    test_processed_path,
    vocab_dict,
    shape_of_word_embeddings=20,  # NOTE(review): presumably the embedding dimension — confirm
)

In [19]:
# Drop row 0 of the test features, as is done for the training features.
# NOTE(review): presumably row 0 is a placeholder left by
# create_test_tweet_features — confirm. This cell is NOT idempotent: every
# re-run drops one more row.
test_features = test_features[1:]
# Print the resulting shape as a quick check of the number of test rows.
print(test_features.shape)

(10000, 20)


In [20]:
# Predict a label for every row of the test feature matrix with the fitted
# classifier.
y_pred = clf.predict(test_features)

In [21]:
# Hand the predictions to the project helper that creates the submission CSV.
# NOTE(review): output path and file format are defined in helpers.py — not
# visible here.
helpers.create_submission_csv(y_pred)

In [22]:
# Sanity check: show the distinct values among the predicted labels.
np.unique(y_pred)

array([-1.,  1.])

In [4]:
# Read the raw (unprocessed) positive and negative training files into lists
# with one entry per line. The `with` block closes both files on exit, so no
# explicit close() calls are needed.
with open('../twitter-datasets/train_pos.txt') as pos_in, \
        open('../twitter-datasets/train_neg.txt') as neg_in:
    pos_lines = pos_in.readlines()
    neg_lines = neg_in.readlines()

# Positives first, then negatives — the label vector built for this corpus
# relies on that order.
lines = pos_lines + neg_lines

In [24]:
# Labels for the raw-text corpus: start with -1 everywhere, then mark the
# leading block of positive lines with +1.
n_positive = len(pos_lines)
y = np.full(len(lines), -1.0)
y[:n_positive] = 1

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Smoke test: fit a default TF-IDF vectorizer on the first five positive
# tweets and inspect the learned vocabulary and the matrix shape. The result
# is kept under its own name so the training matrix `X` is not overwritten by
# a 5-row sample.
vectorizer = TfidfVectorizer()
X_sample = vectorizer.fit_transform(pos_lines[:5])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print(X_sample.shape)

['15', 'about', 'and', 'battle', 'because', 'believe', 'bestest', 'birthday', 'box', 'brother', 'but', 'casper', 'crakkbitch', 'crop', 'doin', 'don', 'dumb', 'dunno', 'eveerrr', 'even', 'follow', 'gift', 'god', 'hope', 'in', 'is', 'just', 'justin', 'keep', 'knows', 'lil', 'logic', 'looved', 'mama', 'me', 'mention', 'my', 'name', 'not', 'only', 'or', 'out', 'photo', 'put', 'read', 'sir', 'so', 'thang', 'thanks', 'that', 'the', 'tmr', 'trip', 'tsk', 'url', 'user', 'visiting', 'will', 'won', 'ya', 'you', 'your']
(5, 62)


In [21]:
# TF-IDF configuration for the full corpus: lower-cased unigrams, English
# stop words removed, sub-linear term-frequency scaling, L2-normalised rows,
# and no cap on the vocabulary size.
vectorizer = TfidfVectorizer(
    lowercase=True,
    use_idf=True,
    max_features=None,
    stop_words='english',
    norm='l2',
    ngram_range=(1, 1),
    sublinear_tf=True,
)

In [22]:
# Fit the configured vectorizer on all raw training lines (positives followed
# by negatives) and replace X with the resulting TF-IDF matrix.
X = vectorizer.fit_transform(lines)

In [25]:
# Sanity check: show the distinct label values present in y.
np.unique(y)

array([-1.,  1.])

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out a validation split. X_train / X_validation / y_train /
# y_validation are not defined in any visible cell, so without this the loop
# below fails with a NameError on a fresh kernel.
# NOTE(review): the 25% stratified hold-out mirrors the 4-fold CV used for
# the logistic-regression baseline — adjust if a different split was intended.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.25, random_state=4, stratify=y)

# Validation accuracy of a random forest as a function of the forest size.
scores_trees = []
num_of_trees = list(range(10, 500, 50))  # 10, 60, ..., 460 trees
for trees in num_of_trees:
    forest = RandomForestClassifier(n_estimators=trees, random_state=4, n_jobs=-1)
    print('Running RF ' + str(trees) + ' trees...')
    forest.fit(X_train, y_train)
    # Kept separate from `y_pred` so the submission predictions are not
    # overwritten in the kernel.
    validation_pred = forest.predict(X_validation)
    accuracy = metrics.accuracy_score(y_validation, validation_pred)
    scores_trees.append(accuracy)
    print(accuracy)