In [1]:
from utils import dataset
from utils import nlp
from utils import classifiers

%load_ext autoreload
%autoreload 2

## Parse input

In [2]:
# Load the tweets through the project helper.
# NOTE(review): how 'DatasetCatSpa' is resolved (directory, file prefix, ...) is
# decided inside utils/dataset and is not visible from this notebook.
tweets = dataset.read_tweets('DatasetCatSpa')
# Preview the first five records; in the output below each record is a dict
# with 'tweet_id', 'category' (e.g. 'SPA', 'CAT') and 'text' keys.
tweets[:5]

[{'tweet_id': '4717',
  'category': 'SPA',
  'text': "Con el 56,3% escrutado: JPS 62 escaños, C's 25, PSC 17, PP 11, CSQP 10, CUP 10. #eleccionescatalanas"},
 {'tweet_id': '6025',
  'category': 'SPA',
  'text': '.@miqueliceta lxs catalanes necesitan recuperar el estado de bienestar x eso irán hoy a votar #27S https://t.co/2W62g81dQd'},
 {'tweet_id': '6123',
  'category': 'SPA',
  'text': 'Creéis que un sistema judicial catalán sacará a la luz escándalos económicos de la familia Pujol? No, les darán privilegios y lo sabeis #27S'},
 {'tweet_id': '3810',
  'category': 'CAT',
  'text': "Cues per anar a votar, això si que m'agrada!!!! #27S"},
 {'tweet_id': '2193',
  'category': 'CAT',
  'text': 'Amb #SomriureCUP seguint l.escrutini @Elforn. @CUPGirona @cupnacional http://t.co/JfLRkFDOv8'}]

## Pre-processing dataset

In [3]:
# Pre-processing steps handed to `nlp.run_pipeline`, listed in the order given.
# NOTE(review): what each step does is defined in utils/nlp and is not visible
# here; presumably the pipeline applies them one after another — confirm.
preprocessing_steps = [
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    nlp.remove_hashtags,
    nlp.tokenize,
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem,
    nlp.bag_of_words,
]

data = nlp.run_pipeline(tweets, preprocessing_steps)

# Peek at the first two processed items; in the output below each one is a
# (token -> True dict, category label) pair.
data[:2]

[({'56,3': True,
   'escrutado': True,
   'jp': True,
   '62': True,
   'escaño': True,
   "c'": True,
   '25': True,
   'psc': True,
   '17': True,
   'pp': True,
   '11': True,
   'csqp': True,
   '10': True,
   'cup': True,
   'eleccionescatalana': True},
  'SPA'),
 ({'lx': True,
   'catalan': True,
   'necesitan': True,
   'recuperar': True,
   'bienestar': True,
   'x': True,
   'irán': True,
   'hoy': True,
   'votar': True,
   '27': True},
  'SPA')]

## Predicting with a NaiveBayesClassifier

In [4]:
# Second argument passed to `classify_with_naive_bayes` below.
# NOTE(review): presumably the fraction of `data` held out for testing
# (0.2 = 20%) — confirm against utils/classifiers.
TEST_SIZE = 0.2

# Run the Naive Bayes helper on the pre-processed data. `result` is indexed by
# 'accuracy' in this cell and by 'classifier' in the analysis cell.
# NOTE(review): no random seed is set anywhere in this notebook; if the helper
# shuffles or splits randomly, the accuracy shown below will vary between
# runs — confirm and seed if so.
result = classifiers.classify_with_naive_bayes(data, TEST_SIZE)
result['accuracy']

0.9132947976878613

## Analysis of the results

In [5]:
# Take the classifier object out of the result dict returned by the helper.
# NOTE(review): the printed table below matches the format of NLTK's
# NaiveBayesClassifier — presumably that is the object type; confirm.
classifier = result['classifier']
# Print the 10 most informative features; each output row shows a feature and
# the ratio between the two categories (e.g. "CAT : SPA = 61.0 : 1.0").
classifier.show_most_informative_features(10)

Most Informative Features
                       i = True              CAT : SPA    =     61.0 : 1.0
                     per = True              CAT : SPA    =     42.5 : 1.0
                       ¿ = True              SPA : CAT    =     11.9 : 1.0
      eleccionescatalana = True              SPA : CAT    =     11.7 : 1.0
                     dia = True              CAT : SPA    =     10.3 : 1.0
                   l6cat = True              SPA : CAT    =      9.6 : 1.0
              juntspelsi = True              CAT : SPA    =      9.5 : 1.0
                    molt = True              CAT : SPA    =      9.5 : 1.0
                   artur = True              SPA : CAT    =      9.4 : 1.0
                   iceta = True              SPA : CAT    =      7.3 : 1.0
