In [1]:
import pandas as pd
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
stopwords = [word.strip() for word in open('stopwords-pt.txt', mode='r', encoding='utf8')]

def load_dataset(filename, train=True):
    raw_df = pd.read_csv(filename)
    if train:
        raw_df = raw_df.rename(columns={'Classificacao': 'label'})
    return raw_df

def describe_dataset(dataframe):
    print(dataframe.label.unique())
    print(dataframe.label.describe())
    print(dataframe.groupby('label')['Text'].count())

def write_predictions(dataframe, predictions, out_path):
    count = 0

    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions to %s' % out_path)
        out_file.write('Id,Category\n')
        idx = 0
        for result, (idx, row) in zip(predictions, dataframe.iterrows()):
            count += 1
            out_file.write('{},{}\n'.format(row['Id'], result))
            idx += 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    out_file.close()
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())
    
def train_svm_model(train_df, test_df = None, submission_df = None, submission_name: str = None):
    processed_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                                  ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42))])

    processed_clf_svm = processed_clf_svm.fit(train_df['Text'], train_df['label'])
    if test_df is not None:
        predicted_svm = processed_clf_svm.predict(test_df['Text'])
        print(balanced_accuracy_score(test_df['label'], predicted_svm))
    if submission_name is not None:
        predictions = processed_clf_svm.predict(submission_df['Text'])
        write_predictions(submission_df, predictions, 'submissions_' + submission_name + '.csv')
    return processed_clf_svm, predictions

def grid_search_model(train_df):
    pipeline = Pipeline([('vect', CountVectorizer(stop_words=stopwords)), ('tfidf', TfidfTransformer()), 
                         ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42))])
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 
                  'vect__stop_words': [stopwords, None], 
                  'tfidf__use_idf': (True, False), 
                  'tfidf__norm': ('l1', 'l2'), 
                  'clf-svm__alpha': (1e-2, 1e-3), 
                  'clf-svm__penalty': ('none', 'l2', 'l1', 'elasticnet')}
    gs_clf = GridSearchCV(pipeline, parameters, scoring='balanced_accuracy', n_jobs=-1, cv=10, iid=True, verbose=True)
    gs_clf = gs_clf.fit(train_df['Text'], train_df['label'])
    print(gs_clf.best_score_)
    print(gs_clf.best_params_)
    return gs_clf

In [3]:
train_pd = load_dataset('train.csv')
test_pd = load_dataset('test.csv')

In [4]:
train_pd

Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,label,Observação,Id
0,Mon Jan 09 15:27:43 +0000 2017,Dois são detidos ao tentar jogar celulares e d...,,,,Michele #beta #sdv,michelexmbeta,0,Positivo,,6272
1,Sun Jan 08 02:14:34 +0000 2017,me matan esas minas q cambian 554 veces su fot...,,,Núñez - C.A.B.A.,Gaby Messina,gabymessina36,0,Neutro,,1644
2,Sat Feb 11 09:49:11 +0000 2017,Líderes de motim em presídio de Minas Gerais s...,,,"Hollywood, CA",Wendie Rower,Wendie_Rower,0,Positivo,,7956
3,Thu Jan 05 14:43:03 +0000 2017,#Mídia: Press Release from Business Wire : Di...,,,SP,Marcello Binder,binderbr,0,Neutro,,85
4,Wed Feb 08 22:52:10 +0000 2017,Vacinação contra febre amarela é intensificada...,,,,fodido,eufodeu,0,Positivo,,6006
...,...,...,...,...,...,...,...,...,...,...,...
6554,Thu Jan 26 14:31:45 +0000 2017,Rio faz bloqueio contra febre amarela em munic...,,,Goiás - Brasil,Altair Tavares,altairtavares,0,Positivo,,5735
6555,Fri Feb 10 18:13:01 +0000 2017,Governador Fernando Pimentel entrega 401 veícu...,,,Santos Dumont - MG -Brasil,POSSANTE ON LINE,possanteonline,0,Positivo,,5192
6556,Thu Jan 05 17:19:20 +0000 2017,Secretaria de Educação faz reformulações para ...,,,Minas Gerais - Brasil,Uberlândia,PrefeituraUdia,0,Positivo,,5391
6557,Thu Jan 05 14:15:02 +0000 2017,E governo ainda quer indenizar a família dos b...,,,,Graça Azeredo,azeredo_mg,0,Neutro,,861


In [5]:
describe_dataset(train_pd)

['Positivo' 'Neutro' 'Negativo']
count         6559
unique           3
top       Positivo
freq          2639
Name: label, dtype: object
label
Negativo    1970
Neutro      1950
Positivo    2639
Name: Text, dtype: int64


In [6]:
train_split_pd, test_split_pd = train_test_split(train_pd, stratify=train_pd['label'], test_size=0.2)

In [7]:
# train_split_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('train.json', orient='records', lines=True)
# test_split_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('test.json', orient='records', lines=True)
# train_pd[['Text', 'label']].rename(columns={'Text': 'text'}).to_json('train_full.json', orient='records', lines=True)
# test_pd[['Id', 'Text']].rename(columns={'Text': 'text'}).to_json('test_submission.json', orient='records', lines=True)

In [10]:
train_split_pd.to_csv('train_split.csv', index=None)
test_split_pd.to_csv('test_split.csv', index=None)

In [11]:
!head -n 2 train_split.csv

Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,label,Observação,Id
Wed Jan 04 21:42:14 +0000 2017,RT @carlinibh: Cavalo emociona ao se despedir de dono em enterro - Nacional - Estado de Minas https://t.co/8IA3RQ8v99,,,BRASIL,RICARDO COSTA,RICARDO_COSTA01,3,Neutro,,2809


In [8]:
describe_dataset(train_split_pd)

['Neutro' 'Positivo' 'Negativo']
count         5247
unique           3
top       Positivo
freq          2111
Name: label, dtype: object
label
Negativo    1576
Neutro      1560
Positivo    2111
Name: Text, dtype: int64


In [9]:
describe_dataset(test_split_pd)

['Neutro' 'Positivo' 'Negativo']
count         1312
unique           3
top       Positivo
freq           528
Name: label, dtype: object
label
Negativo    394
Neutro      390
Positivo    528
Name: Text, dtype: int64


In [14]:
grid_search_model = grid_search_model(train_pd)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1280 out of 1280 | elapsed:  5.8min finished


0.9630562219687785
{'clf-svm__alpha': 0.001, 'clf-svm__penalty': 'none', 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2), 'vect__stop_words': None}


In [19]:
grid_search_model.best_params_

{'clf-svm__alpha': 0.001,
 'clf-svm__penalty': 'none',
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

In [20]:
best_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))), ('tfidf', TfidfTransformer(use_idf=False)), 
                     ('clf-svm', SGDClassifier(loss='hinge', alpha=0.001, penalty=None, max_iter=2000, tol=1e-5, random_state=42))])

best_svm = best_svm.fit(train_pd['Text'], train_pd['label'])
predictions = best_svm.predict(test_pd['Text'])
write_predictions(test_pd, predictions, 'submissions_gridsearch.csv')

Saving predictions to submissions_gridsearch.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_gridsearch.csv


In [22]:
model, predictions = train_svm_model(train_split_pd, test_split_pd, test_pd, 'no_stopwords_svm_tfidf')

0.9515114045570897
Saving predictions to submissions_no_stopwords_svm_tfidf.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_no_stopwords_svm_tfidf.csv


In [55]:
model, predictions = train_svm_model(train_split_pd, test_split_pd, test_pd, 'stopwords_svm_tfidf')

0.9515849632600902
Saving predictions to submissions_stopwords_svm_tfidf.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_stopwords_svm_tfidf.csv


In [59]:
model, predictions = train_svm_model(train_df=train_pd, submission_df=test_pd, submission_name='stopwords_svm_tfidf_full')

Saving predictions to submissions_stopwords_svm_tfidf_full.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Finished predicting 1640 sentences
Results saved in /media/discoD/repositorios/deeplearningufg/nlp/competicao1/submissions_stopwords_svm_tfidf_full.csv


In [5]:
test_pd

Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,Observação,Id
0,Thu Jan 05 12:00:34 +0000 2017,RT @JDanieldf: Pedindo para que MG reaja? Reag...,,,Balneário Camboriú - SC,Mirela Franz,MiLick74,27,,3568
1,Fri Jan 06 11:54:50 +0000 2017,Homem que matou ex-mulher e jogou corpo em cis...,,,Belo Horizonte MG Brasil,Ricardo Carlini,carlinibh,1,,1323
2,Sat Feb 11 15:51:14 +0000 2017,"New post: ""Três adolescentes são apreendidos p...",,,,Camila Maciel Serrão,CamilaMacielSer,0,,7976
3,Wed Jan 04 18:08:43 +0000 2017,RT @AnaPaulaVolei: Mais 2 helicópteros!!A cara...,,,Sao Paulo,CLAUDIA DELAFIORI,cdelafiori,444,,2408
4,Wed Jan 04 18:12:12 +0000 2017,"RT @UOLNoticias: Custaram R$ 21,8 milhões: Mes...",,,Jaboatão dos Guararapes,Rodrigo Calabria,CalabriaRodrigo,141,,4435
...,...,...,...,...,...,...,...,...,...,...
1635,Sat Jan 07 12:51:37 +0000 2017,RT @ivo123zarate3: Me hace mal ver en instagra...,,,M e r c e d e s (Ctes),Ere Esse♡,RamoohSilvero,12,,3536
1636,Wed Jan 25 14:18:35 +0000 2017,@PMMG190 - Militares da 22ª Cia prendem autore...,,,Minas Gerais,POLÍCIA MILITAR MG,pmmg190,0,,6881
1637,Mon Jan 09 11:18:11 +0000 2017,Cadeia em Manaus tem 4 mortos; Estados pedem a...,,,,Lenilda Miranda,nilda_ap,0,,627
1638,Fri Jan 06 13:02:26 +0000 2017,Reforma da Previdência será feita no primeiro ...,,,Brasil,marli silvera dziadz,marlidzdz59,0,,2165
