In [1]:
from utils import dataset
from utils import nlp
from utils import classifiers

%load_ext autoreload
%autoreload 2

## Parse input

In [2]:
# Load the labelled tweets from the 'DatasetMaFe' source through the project's
# dataset helper (whether that name is a directory or a file is not visible here).
tweets = dataset.read_tweets('DatasetMaFe')
# Preview the first five records. As the output shows, each record is a dict
# with 'tweet_id', 'category' and 'text' keys.
tweets[:5]

[{'tweet_id': '4120',
  'category': 'MALE',
  'text': 'A Catalunya hi ha més votants del PP que de la CUP. #eleccionescatalanas'},
 {'tweet_id': '7605',
  'category': 'FEMALE',
  'text': 'Récord histórico de participación en los comicios catalanes #27s #jornadahistórica #eleccionescatalanas'},
 {'tweet_id': '3656',
  'category': 'MALE',
  'text': "A #santsadurni Meses 4/19 JxSi: 702 CUP: 153 CSQEP: 141 C's: 183 PSC: 230 Pp: 103 Unió: 27 Altres: 5 Blanc: 1 Nuls: 1 #GuanyemJunts"},
 {'tweet_id': '1975',
  'category': 'FEMALE',
  'text': 'Avui sereu determinants. #27S2015 #27s #27SEbre #catalunya #independència #iaios #catalonia #vote… https://t.co/3z0mFaO8UB'},
 {'tweet_id': '4349',
  'category': 'MALE',
  'text': 'De momento Catalunya si que es pot es un hostiazo estrepitoso si es que el que mucho abarca poco aprieta #eleccionescatalanas'}]

## Pre-processing dataset

In [3]:
# Turn the raw tweets into classifier-ready samples by handing them, together
# with the list of pre-processing steps, to nlp.run_pipeline.
# Presumably the steps are applied in list order — confirm in utils.nlp.
# The preview below shows the result as (feature dict, category) pairs.
#
# NOTE(review): hashtag words (e.g. 'eleccionescatalana') still appear as
# features in the output, so remove_hashtags apparently strips only the '#'
# marker rather than the whole tag — confirm this is intended.
# NOTE(review): the stemmed tokens look suffix-stripped ('més' -> 'mé',
# 'votants' -> 'votant'); check that the stemmer suits Catalan/Spanish text.
data = nlp.run_pipeline(tweets, [
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    nlp.remove_hashtags,
    nlp.tokenize,
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem,
    nlp.bag_of_words
])
# Preview the first two processed samples.
data[:2]

[({'catalunya': True,
   'hi': True,
   'mé': True,
   'votant': True,
   'pp': True,
   'cup': True,
   'eleccionescatalana': True},
  'MALE'),
 ({'récord': True,
   'histórico': True,
   'participación': True,
   'comicio': True,
   'catalan': True,
   '27': True,
   'jornadahistórica': True,
   'eleccionescatalana': True},
  'FEMALE')]

## Predicting with a NaiveBayesClassifier

In [4]:
# Value passed to the classifier helper as the test-set size
# (presumably the held-out fraction of `data` — confirm in utils.classifiers).
TEST_SIZE = 0.2

# Run the Naive Bayes helper on the pre-processed samples. `result` is read by
# key: 'accuracy' here and 'classifier' in the analysis cell.
# NOTE(review): no random seed is set in the visible cells. If the helper
# splits train/test randomly, the accuracy shown below will change between
# runs — confirm, and seed the split if so.
result = classifiers.classify_with_naive_bayes(data, TEST_SIZE)
# Display the accuracy reported by the helper.
result['accuracy']

0.6127167630057804

## Analysis of the results

In [5]:
# Take the classifier object out of the result and print its 10 most
# informative features.
# NOTE(review): the output format matches NLTK's NaiveBayesClassifier;
# presumably each row is the ratio of the feature's likelihood between the two
# categories — confirm against the helper's implementation.
classifier = result['classifier']
classifier.show_most_informative_features(10)

Most Informative Features
                   csqep = True             MALE : FEMALE =      6.6 : 1.0
                     fer = True             MALE : FEMALE =      5.9 : 1.0
                 colegio = True           FEMALE : MALE   =      4.8 : 1.0
                      63 = True             MALE : FEMALE =      4.5 : 1.0
                     pue = True             MALE : FEMALE =      4.5 : 1.0
                       7 = True             MALE : FEMALE =      4.5 : 1.0
       elvotdelamevavida = True           FEMALE : MALE   =      4.2 : 1.0
                  seguir = True           FEMALE : MALE   =      4.2 : 1.0
                escrutat = True             MALE : FEMALE =      3.9 : 1.0
                   poden = True             MALE : FEMALE =      3.8 : 1.0
