In [1]:
from utils import dataset
from utils import nlp
from utils import classifiers

# Enable IPython's autoreload extension in mode 2, so imported modules
# (here the local `utils` helpers) are reloaded before each cell execution
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Parse input

In [2]:
# Read the 'DatasetCatSpa' dataset through the project's dataset helper.
# Per the preview output, each record is a dict with 'tweet_id', 'category'
# and 'text' keys.
tweets = dataset.read_tweets('DatasetCatSpa')
# Preview only the first five records rather than dumping the whole dataset.
tweets[:5]

[{'tweet_id': '4717',
  'category': 'SPA',
  'text': "Con el 56,3% escrutado: JPS 62 escaños, C's 25, PSC 17, PP 11, CSQP 10, CUP 10. #eleccionescatalanas"},
 {'tweet_id': '6025',
  'category': 'SPA',
  'text': '.@miqueliceta lxs catalanes necesitan recuperar el estado de bienestar x eso irán hoy a votar #27S https://t.co/2W62g81dQd'},
 {'tweet_id': '6123',
  'category': 'SPA',
  'text': 'Creéis que un sistema judicial catalán sacará a la luz escándalos económicos de la familia Pujol? No, les darán privilegios y lo sabeis #27S'},
 {'tweet_id': '3810',
  'category': 'CAT',
  'text': "Cues per anar a votar, això si que m'agrada!!!! #27S"},
 {'tweet_id': '2193',
  'category': 'CAT',
  'text': 'Amb #SomriureCUP seguint l.escrutini @Elforn. @CUPGirona @cupnacional http://t.co/JfLRkFDOv8'}]

## Pre-processing the dataset

In [3]:
# Ordered list of text-processing steps handed to nlp.process_unit.
# The order is significant: the remove_* helpers come before nlp.tokenize,
# the reject_* filters and nlp.stem come after it.
# NOTE(review): judging by the names, the first group works on the raw string
# and the second on the token list — confirm in utils/nlp.py.
pipeline = [
    # before tokenisation
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    nlp.remove_hashtags,
    nlp.tokenize,
    # after tokenisation
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem
]

# Run every tweet's text through the pipeline; one processed entry per tweet,
# in the same order as `tweets`.
stem_sentences = [
    nlp.process_unit(pipeline, entry['text'])
    for entry in tweets
]
# Show the first two processed tweets as a sanity check.
stem_sentences[0:2]

[['56,3',
  'escrutado',
  'jp',
  '62',
  'escaño',
  "c'",
  '25',
  'psc',
  '17',
  'pp',
  '11',
  'csqp',
  '10',
  'cup',
  '10',
  'eleccionescatalana'],
 ['lx',
  'catalan',
  'necesitan',
  'recuperar',
  'bienestar',
  'x',
  'irán',
  'hoy',
  'votar',
  '27']]

In [4]:
# Build the bag-of-words representation of the stemmed tweets.
# The returned mapping is stored under its own name (`bow_result`) instead of
# the generic `result`, because the SVM cell further down assigns its score
# dict to `result`; sharing the name silently replaced this object with one of
# a different kind.
bow_result = classifiers.bag_of_words_for_svm(stem_sentences)
print(bow_result['vocabulary_size'])

# Feature vectors, one per tweet; display the first one as a quick check.
sentences = bow_result['sentences']
sentences[0]

3124


array([0., 0., 0., ..., 0., 0., 0.])

In [5]:
# Collect the language label ('category') of every tweet, in dataset order,
# so labels line up index-for-index with the entries of `tweets`.
labels = [
    entry['category']
    for entry in tweets
]
# Peek at the first three labels.
labels[0:3]

['SPA', 'SPA', 'SPA']

## Predicting with an SVM

In [6]:
# Tunable settings for the classification run.
# NOTE(review): TEST_SIZE is presumably the fraction of samples held out for
# evaluation and MAX_ITERATIONS the solver's iteration cap — confirm in
# utils/classifiers.py.
TEST_SIZE = 0.33
MAX_ITERATIONS = 500

# Conventional feature-matrix / target names for the classifier inputs.
X, y = sentences, labels

# Run the SVM classification and display the returned summary
# (the output shows 'total_correct' and 'accuracy' keys).
result = classifiers.classify_with_svm(
    X,
    y,
    TEST_SIZE,
    MAX_ITERATIONS,
)
result



{'total_correct': 265, 'accuracy': 0.9265734265734266}