In [1]:
from utils import dataset
from utils import nlp
from utils import classifiers

%load_ext autoreload
%autoreload 2

## Parse input

In [2]:
# Load the labelled tweets and preview the first five records.
# Each previewed record is a dict with 'tweet_id', 'category' and 'text' keys.
DATASET_NAME = 'DatasetMaFe'
tweets = dataset.read_tweets(DATASET_NAME)
tweets[:5]

[{'tweet_id': '4120',
  'category': 'MALE',
  'text': 'A Catalunya hi ha més votants del PP que de la CUP. #eleccionescatalanas'},
 {'tweet_id': '7605',
  'category': 'FEMALE',
  'text': 'Récord histórico de participación en los comicios catalanes #27s #jornadahistórica #eleccionescatalanas'},
 {'tweet_id': '3656',
  'category': 'MALE',
  'text': "A #santsadurni Meses 4/19 JxSi: 702 CUP: 153 CSQEP: 141 C's: 183 PSC: 230 Pp: 103 Unió: 27 Altres: 5 Blanc: 1 Nuls: 1 #GuanyemJunts"},
 {'tweet_id': '1975',
  'category': 'FEMALE',
  'text': 'Avui sereu determinants. #27S2015 #27s #27SEbre #catalunya #independència #iaios #catalonia #vote… https://t.co/3z0mFaO8UB'},
 {'tweet_id': '4349',
  'category': 'MALE',
  'text': 'De momento Catalunya si que es pot es un hostiazo estrepitoso si es que el que mucho abarca poco aprieta #eleccionescatalanas'}]

## Pre-processing dataset

In [3]:
# List of nlp steps passed to nlp.process_unit together with each tweet's text.
# NOTE(review): the preview of the result still contains hashtag words such as
# 'eleccionescatalana', so remove_hashtags presumably strips only the '#'
# marker rather than the whole tag — confirm in utils.nlp.
pipeline = [
    nlp.remove_old_style_retweet_text,
    nlp.remove_hyperlinks,
    nlp.remove_hashtags,
    nlp.tokenize,
    nlp.reject_stopwords,
    nlp.reject_emoticons,
    nlp.reject_punctuations,
    nlp.stem,
]

# Pull out the raw text first, then run every text through the pipeline.
tweet_texts = [tweet['text'] for tweet in tweets]
stem_sentences = [nlp.process_unit(pipeline, text) for text in tweet_texts]
stem_sentences[:2]

[['catalunya', 'hi', 'mé', 'votant', 'pp', 'cup', 'eleccionescatalana'],
 ['récord',
  'histórico',
  'participación',
  'comicio',
  'catalan',
  '27',
  'jornadahistórica',
  'eleccionescatalana']]

In [4]:
# Turn the processed tweets into feature vectors for the SVM.
# The helper returns a mapping; only its 'vocabulary_size' and 'sentences'
# entries are used here.
# Bound to `bag_of_words` rather than the generic `result`, because a later
# cell stores the classifier output in `result`.
bag_of_words = classifiers.bag_of_words_for_svm(stem_sentences)
print(bag_of_words['vocabulary_size'])

# One feature vector per tweet; show the first as a sanity check.
sentences = bag_of_words['sentences']
sentences[0]

3044


array([0., 0., 0., ..., 0., 0., 0.])

In [5]:
# Target labels: the 'category' field of every tweet, in dataset order.
labels = [record['category'] for record in tweets]
labels[:3]

['MALE', 'FEMALE', 'MALE']

## Predicting with an SVM

In [6]:
# Tunable settings forwarded to classifiers.classify_with_svm.
TEST_SIZE = 0.33       # presumably the fraction of samples held out for testing — confirm in utils.classifiers
MAX_ITERATIONS = 500   # presumably the SVM solver's iteration cap — confirm in utils.classifiers
X = sentences
y = labels

# Features and labels are derived from `tweets` in separate cells; fail fast
# if they ever stop lining up one-to-one.
assert len(X) == len(y), "feature vectors and labels must have the same length"

# NOTE(review): no random seed is passed here, so whether the train/test split
# (and therefore the reported accuracy) is reproducible depends on
# classify_with_svm — confirm.
result = classifiers.classify_with_svm(X, y, TEST_SIZE, MAX_ITERATIONS)
result



{'total_correct': 166, 'accuracy': 0.5804195804195804}