In [13]:
import utils.functions as fun
from utils import dataset
from utils import nlp

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parse input

In [14]:
# Load the tweets of the 'DatasetCatSpa' dataset through the project helper.
# Per the recorded preview, each item is a dict with the keys 'tweet_id',
# 'category' and 'text'.
tweets = dataset.read_tweets('DatasetCatSpa')
# Preview only the first five records instead of dumping the whole collection.
tweets[:5]

[{'tweet_id': '1395',
  'category': 'CAT',
  'text': 'Gallina de piel. Emocionada. #SomLaHistoria #27S #catalunya #democracia'},
 {'tweet_id': '2931',
  'category': 'CAT',
  'text': 'Bon dia, aquest #Diumenge és molt millor que qualsevol #Divendres. Somrient. #27S'},
 {'tweet_id': '6246',
  'category': 'SPA',
  'text': '#27s al otro lado de las urnas... https://t.co/lHU7QkdWF2'},
 {'tweet_id': '2323',
  'category': 'CAT',
  'text': "9:03 cua fins la cantonada per votar a Dominiques de l' ensenyament, carrer Mallorca. La força d'un poble! #27S #27S2015"},
 {'tweet_id': '2091',
  'category': 'CAT',
  'text': 'Acabo de rebre les paperetes per anar al Cons Esp a votar abans de les 14h del 25S. Gràcies ESP. Visca la vostra democràcia. #VotaPerMi #27S'}]

## Pre-processing dataset

In [15]:
# Ordered list of text-cleaning steps; it is handed to nlp.process_unit
# together with the raw text of each tweet.
# NOTE(review): presumably every step is applied in sequence to the output of
# the previous one — confirm in utils.nlp.process_unit.
pipeline = [
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    # NOTE(review): the recorded sample output still contains hashtag words
    # (e.g. 'somlahistoria'), so this step seems to drop only the '#' marker
    # rather than the whole hashtag — confirm.
    nlp.remove_hashtags,
    nlp.tokenize,
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem
]

# Run the pipeline on the 'text' field of every tweet; the results keep the
# order of `tweets` (the preview shows one list of lowercase tokens per tweet).
stem_sentences = [nlp.process_unit(pipeline, tweet['text']) for tweet in tweets]
# Show the first two processed tweets as a sanity check.
stem_sentences[:2]

[['gallina',
  'piel',
  'emocionada',
  'somlahistoria',
  '27',
  'catalunya',
  'democracia'],
 ['bon',
  'dia',
  'aquest',
  'diumeng',
  'és',
  'molt',
  'millor',
  'qualsevol',
  'divendr',
  'somrient',
  '27']]

In [16]:
# Vectorise the stemmed token lists with the project's bag-of-words helper.
# The returned mapping gets its own descriptive name instead of the generic
# `result`, which the SVM cell also assigns for an unrelated value; sharing
# one name made the variable's content depend on which cell ran last.
bag_of_words = fun.bag_of_words_for_svm(stem_sentences)
print(bag_of_words['vocabulary_size'])

# Vectorised tweets used as classifier input (the preview of the first entry
# shows a float array); presumably one vector per tweet — confirm in
# utils.functions.bag_of_words_for_svm.
sentences = bag_of_words['sentences']
sentences[0]

13911


array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
# Target labels: the 'category' value of every tweet (e.g. 'CAT' or 'SPA' in
# the preview), kept in the same order as `tweets`.
labels = [tweet['category'] for tweet in tweets]
# Preview the first three labels.
labels[:3]

['CAT', 'CAT', 'SPA']

## Predicting with an SVM

In [18]:
# Settings forwarded to the project's SVM helper.
# NOTE(review): presumably TEST_SIZE is the fraction of samples held out for
# evaluation and MAX_ITERATIONS the solver's iteration cap — confirm in
# utils.functions.classify_with_svm.
TEST_SIZE = 0.33
MAX_ITERATIONS = 500
# Features: the bag-of-words output; targets: the tweets' 'category' values.
X = sentences
y = labels

# NOTE(review): no random seed is passed here; if classify_with_svm splits or
# shuffles the data randomly, the reported accuracy may differ between runs —
# confirm and fix a seed if so.
result = fun.classify_with_svm(X, y, TEST_SIZE, MAX_ITERATIONS)
# Display the returned metrics (the recorded run shows 'total_correct' and 'accuracy').
result



{'total_correct': 2645, 'accuracy': 0.9514388489208633}