In [27]:
import re
import csv
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [28]:
# Display names of the two classes:
#   'No'  - tweet is not related to the JBS "delations" (plea-bargain testimonies)
#   'Yes' - tweet is related to them
# Presumably indexed by the integer label parsed from the CSV (0 / 1) - TODO confirm.
targets = ['No', 'Yes']

# Patterns used by clean_text(), in the order they are applied.
RE_NUMBERS = re.compile(r'[0-9]+')          # one or more ASCII digits
RE_NOT_LETTERS = re.compile(r'[^a-zA-Z]+')  # one or more characters outside a-z / A-Z
RE_SPACES = re.compile(r'[^\S\f\t\n\r]+')   # whitespace runs, excluding \f \t \n \r


def clean_text(text):
    """Normalise a raw tweet into lowercase, space-separated ASCII words.

    Steps, in order:
      1. every run of digits becomes the placeholder token ' NUM '
         (which ends up as 'num' after the final lowercasing);
      2. every run of characters that are not ASCII letters becomes one space;
      3. remaining whitespace runs matched by RE_SPACES become one space;
      4. the result is lowercased.

    Leading/trailing spaces are not trimmed.

    NOTE(review): step 2 also removes accented letters, splitting words such
    as 'ação' into 'a o'. If the tweets are not plain-ASCII text (not visible
    from here), confirm this is intended.

    Args:
        text: the raw tweet text (str).

    Returns:
        The cleaned, lowercased string.
    """
    # The patterns are looked up at call time, exactly like direct references would be.
    substitutions = (
        (RE_NUMBERS, ' NUM '),
        (RE_NOT_LETTERS, ' '),
        (RE_SPACES, ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.lower()

In [29]:
# Location of the semicolon-separated tweets file.
TWEETS_CSV_PATH = 'tweets_1000_day/all_tweets_labeled_2.csv'

# Load the file as a list of rows, each row being the list of its ';'-separated fields.
# Later cells read row[0], row[1] (the tweet text) and row[2] (the label).
#
# With newline='' Python hands every line back with its original terminator still
# attached, so the terminator is removed here; otherwise it would stay glued to the
# last field of each row. Blank lines are skipped because they would produce a
# one-field row that later fails on row[1].
#
# NOTE(review): a plain split(';') does not understand quoting, so a tweet whose text
# itself contains ';' yields extra fields - confirm the file never contains such rows.
rows = []
with open(TWEETS_CSV_PATH, newline='', encoding='utf-8') as csvfile:
    # Iterate the file lazily instead of materialising it twice via readlines().
    rows.extend(
        line.rstrip('\r\n').split(';')
        for line in csvfile
        if line.strip()
    )

In [32]:
# Cleaning tweets
# Each entry is [first column (unchanged), cleaned tweet text, label string].
# The label is taken from the third field, which may still carry the line terminator
# and stray whitespace. strip() removes both '\r\n' and '\n' endings, so an unlabeled
# row always ends up as '' regardless of the file's line-ending convention
# (replacing only '\r\n' left '\n' behind for LF-terminated files).
tweets = [[row[0], clean_text(row[1]), row[2].strip()] for row in rows]
# Cleaned texts only, in the same order as `tweets`.
tweets_text = [tweet[1] for tweet in tweets]
print('Tweets\' set size: ' + str(len(tweets)))

Tweets' set size: 243002


In [39]:
# A tweet belongs to the training set when its label (third column) is not empty.
# Three parallel lists are built in one pass:
#   tweets_training_set        - the full [first column, cleaned text, label] entries
#   tweets_training_set_text   - the cleaned text of each labeled tweet
#   tweets_training_set_target - the label of each labeled tweet, parsed as int
tweets_training_set = []
tweets_training_set_text = []
tweets_training_set_target = []
for entry in tweets:
    label = entry[2]
    if label == '':
        continue  # unlabeled tweet: not used for training
    tweets_training_set.append(entry)
    tweets_training_set_text.append(entry[1])
    tweets_training_set_target.append(int(label))
print('Training set size: ' + str(len(tweets_training_set)))

Training set size: 454


In [40]:
# Bag of words
# Learn the vocabulary from the labeled tweets and encode them as a matrix of
# token counts; the printed shape is (number of labeled tweets, vocabulary size).
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(tweets_training_set_text)
print(x_train_counts.shape)

(454, 2083)


In [41]:
# Machine Learning
# Fit a multinomial Naive Bayes (NB) classifier on the bag-of-words counts of the
# labeled tweets, using their integer labels as targets.
clf = MultinomialNB()
clf = clf.fit(x_train_counts, tweets_training_set_target)

In [42]:
# Pipeline: vectorizing and classifying can be chained into a single estimator,
# which replaces the separate steps above with less code.
# The step names 'vect' and 'clf' are arbitrary labels for the two stages.
# 'text_clf' is the model used from here on.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
text_clf = text_clf.fit(tweets_training_set_text, tweets_training_set_target)

In [44]:
# Accuracy of the NB pipeline measured on the very tweets it was trained on:
# the fraction of training tweets whose predicted label equals the given label.
# NOTE(review): this is training accuracy, not an estimate on unseen tweets.
predicted = text_clf.predict(tweets_training_set_text)
acc = (predicted == np.asarray(tweets_training_set_target)).mean()
print(acc)

0.911894273128
