In [1]:
from utils import dataset
from utils import nlp
from utils import classifiers

%load_ext autoreload
%autoreload 2

## Parse input

In [2]:
# Read the tweets identified by 'DatasetFavCon' through the project's dataset helper
# (presumably a dataset directory or file name — confirm in utils/dataset.py).
tweets = dataset.read_tweets('DatasetFavCon')
# Preview the first five records; in the output below each one is a dict with
# 'tweet_id', 'category' and 'text' keys.
tweets[:5]

[{'tweet_id': '5956',
  'category': 'NEUTRAL',
  'text': 'PUBLICIDAD SUBLIMINAL http://t.co/Tj1FJI0R1l http://t.co/LbjFbyqI1i vía @gaceta_es #27S #Catalunya'},
 {'tweet_id': '5436',
  'category': 'AGAINST',
  'text': '@CiudadanosCs de #Cataluña, gracias por "resistir" y trabajar 365 días con mano tendida para construir, no para romper nada. #ApoderadosCs'},
 {'tweet_id': '6083',
  'category': 'NEUTRAL',
  'text': 'Hay vida más allá del #27S #firamedievalspm # santaperpetua #mascosta http://t.co/50lS2VRVTG'},
 {'tweet_id': '4405',
  'category': 'NEUTRAL',
  'text': 'viendo en VIVO en #Periscope: Sede Ciudadanos:hablamos con los militantes #27S https://t.co/xjXDtX11bH'},
 {'tweet_id': '7920', 'category': 'NEUTRAL', 'text': 'Paciencia #27S'}]

## Pre-processing dataset

In [3]:
# Text-cleaning steps from utils.nlp, handed as one list to nlp.process_unit for
# every tweet (presumably applied in the listed order — confirm in utils/nlp.py).
pipeline = [
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    nlp.remove_hashtags,
    nlp.tokenize,
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem,
]

# One processed result per tweet, built from the tweet's 'text' field only.
stem_sentences = [
    nlp.process_unit(pipeline, record['text'])
    for record in tweets
]
# Preview the first two processed tweets.
stem_sentences[:2]

[['publicidad', 'sublimin'],
 ['cataluña',
  'gracia',
  'resistir',
  'trabajar',
  '365',
  'día',
  'mano',
  'tendida',
  'construir',
  'romper',
  'apoderadosc']]

In [4]:
# Pass the processed token lists to classifiers.bag_of_words_for_svm; the returned
# mapping is read below through its 'vocabulary_size' and 'sentences' keys.
result = classifiers.bag_of_words_for_svm(stem_sentences)
print(result['vocabulary_size'])

# Keep the encoded sentences for the classifier (presumably one feature vector per
# tweet — confirm in utils/classifiers.py) and preview the first one.
sentences = result['sentences']
sentences[0]

3025


array([0., 0., 0., ..., 0., 0., 0.])

In [5]:
# Collect each tweet's 'category' value, in the same order as `tweets`.
labels = [record['category'] for record in tweets]
# Peek at the first three labels.
labels[:3]

['NEUTRAL', 'AGAINST', 'NEUTRAL']

## Predicting with an SVM

In [6]:
# Settings forwarded positionally to classifiers.classify_with_svm (presumably the
# held-out fraction and the solver's iteration cap — confirm in utils/classifiers.py).
TEST_SIZE, MAX_ITERATIONS = 0.33, 500
# Encoded sentences and their categories under the conventional X / y names.
X, y = sentences, labels

result = classifiers.classify_with_svm(X, y, TEST_SIZE, MAX_ITERATIONS)
# Display the returned summary (the output below has 'total_correct' and 'accuracy').
result



{'total_correct': 189, 'accuracy': 0.6608391608391608}