In [1]:
import numpy as np
import pandas as pd
import text_representation as repre
import helpers
import preprocessing as preproc
import time

#### Preprocessing

In [2]:
preproc.do_preprocessing('../twitter-datasets/train_pos.txt')
preproc.do_preprocessing('../twitter-datasets/train_neg.txt')

100%|██████████| 100000/100000 [02:57<00:00, 562.13it/s]
100%|██████████| 100000/100000 [03:28<00:00, 478.89it/s]


In [3]:
lines, y = preproc.return_processed_trainset_and_y(False)

#### Text Representation

In [5]:
lines

['dunoduno justinjustin read mymy mention .. justinjustin andand god knows thatthat ,, hope youyou follow #believe#believe\n',
 'becausebecause youryour logic dumb ,, even crop youryour name youryour photo .. tsk ..\n',
 '"" put casper box !! "" lovedloved thethe battle !! #crakbitch#crakbitch\n',
 'thanks sir >> >> trip lillil mama .. keep doindoin yaya thangthang !!\n',
 'visiting mymy brother tmrtmr thethe bestest birthday gift everever !! !! !!\n',
 'yay !! !! #lifecompleted#lifecompleted .. tweet // facebokfacebok toto let know please\n',
 '## dnextalbumtitlednextalbumtitle :: feel forfor youyou // rolercoasterrolercoaster ofof life .. song coceptcocept :: life ,, #yolo#yolo ,, becoming famous ?? << #folowmeplz#folowmeplz !! << x\n',
 'workinworkin hard hardly workinworkin rtrt hardeharde withwith mymy future coworkercoworker\n',
 'saw .. replying bit ..\n',
 'thisthis belong\n',
 'andand toto cheer #nationals#nationals ??\n',
 'wewe send invitation toto shop on-line !! youyou fin

In [None]:
start_time = time.time()

In [None]:
embeddings = repre.load_embeddings()
vocab_dict = repre.create_dict_from_provided_vocabulary()

In [None]:
train_pos_path = '../twitter-datasets/train_pos_processed.txt'
pos_features = repre.create_tweet_features(train_pos_path, vocab_dict, shape_of_word_embeddings=20)

In [None]:
train_neg_path = '../twitter-datasets/train_neg_processed.txt'
neg_features = repre.create_tweet_features(train_neg_path, vocab_dict, shape_of_word_embeddings=20)

In [None]:
pos_features = pos_features[1:]
neg_features = neg_features[1:]

In [None]:
pos_shape = pos_features.shape[0]
neg_shape = neg_features.shape[0]

In [None]:
X = np.vstack((pos_features, neg_features))
y = np.zeros(shape=(pos_features.shape[0] + neg_features.shape[0]))
y[:pos_shape] = 1
y[pos_shape:] = -1

In [None]:
np.unique(y)

In [None]:
elapsed_time = divmod(round((time.time() - start_time)), 60)
print('------\nElapsed time: {m} min {s} sec\n'.format(m=elapsed_time[0], s=elapsed_time[1]))

#### Predictions

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(lines)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

In [None]:
clf = LogisticRegression().fit(X, y)

In [None]:
# rs = ShuffleSplit(n_splits=4, test_size=.25, random_state=0)
kf = KFold(n_splits=4, shuffle=True, random_state=0)

In [None]:
scores = cross_val_score(clf, X, y, cv=kf)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

###### LOAD TEST SET AND PREDICT

In [None]:
preproc.do_preprocessing('../twitter-datasets/test_data.txt', test_file=True)

In [None]:
test_processed_path = '../twitter-datasets/test_data_processed.txt'
test_features = repre.create_test_tweet_features(test_processed_path, vocab_dict, shape_of_word_embeddings=20)

In [None]:
test_features = test_features[1:]
print(test_features.shape)

In [None]:
y_pred = clf.predict(test_features)

In [None]:
helpers.create_submission_csv(y_pred)

In [None]:
np.unique(y_pred)

In [None]:
with open('../twitter-datasets/train_pos.txt') as pos_in, open(
                            '../twitter-datasets/train_neg.txt') as neg_in:
    pos_lines = pos_in.readlines()
    neg_lines = neg_in.readlines()
    pos_in.close()
    neg_in.close()
lines = pos_lines + neg_lines

In [None]:
y = np.zeros(shape=(len(lines)))
y[:len(pos_lines)] = 1
y[len(pos_lines):] = -1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(pos_lines[:5])
print(vectorizer.get_feature_names())
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
print(X.shape)

In [None]:
vectorizer = TfidfVectorizer(lowercase=True, use_idf=True, max_features=None,
                             stop_words='english', norm= 'l2', ngram_range=(1,1), sublinear_tf=True)

In [None]:
X = vectorizer.fit_transform(lines)

In [None]:
np.unique(y)

In [None]:
scores_trees = []
num_of_trees = list(range(10, 500, 50))
for trees in num_of_trees:
    forest = RandomForestClassifier(n_estimators=trees, random_state=4, n_jobs=-1)
    print('Running RF ' + str(trees) + ' trees...')
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_validation)
    diff = metrics.accuracy_score(y_validation, y_pred)
    scores_trees.append(diff)
    print(diff)