In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import normalize
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from pathlib import Path

import pickle, re
import nltk

nltk.download('rslp')

from nltk import word_tokenize
from io import StringIO
import numpy as np

[nltk_data] Downloading package rslp to /home/pedro/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [60]:
validation_data_path = 'df_valid.jsonl'
train_data_path = 'df_train.jsonl'

In [61]:
train_df = pd.read_json(train_data_path, orient='records', lines=True)
dev_df = pd.read_json(validation_data_path, orient='records', lines=True)

In [62]:
train_df

Unnamed: 0,label,text
0,comida,Casa da Barra Funda tem clima roceiro e receit...
1,educacao,Professores de SP decidem manter greve; grupo ...
2,empreendedorsocial,"Em segunda edição, concurso paga R$ 35 mil par..."
3,equilibrioesaude,Usar maconha por anos não faz tão mal para a s...
4,ciencia,Baleia-azul percorre 5.200 km e revela a cient...
5,empreendedorsocial,Líderes inovadores se reunem em rede para comp...
6,turismo,"Conheça Pandora, atração milionária da Disney ..."
7,empreendedorsocial,Fiesp organiza quarta edição de maratona hacke...
8,turismo,Praia do Forte mistura natureza com resorts es...
9,turismo,App de agência avisa quando visto e passaporte...


In [63]:
count_vect = CountVectorizer()

In [64]:
train_counts = count_vect.fit_transform(train_df['text'])
train_counts.shape

(7894, 94918)

In [65]:
tfidf_transformer = TfidfTransformer()

In [66]:
train_tfidf = tfidf_transformer.fit_transform(train_counts)
train_tfidf.shape

(7894, 94918)

In [67]:
stopwords = [word.strip() for word in open('stopwords-pt.txt', mode='r', encoding='utf8')]
stemmer = nltk.stem.RSLPStemmer()

# Punctuation list
punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

# ##### #
# Regex #
# ##### #
re_remove_brackets = re.compile(r'\{.*\}')
re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
re_transform_numbers = re.compile(r'\d', re.UNICODE)
re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
# Different quotes are used.
re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
re_tree_dots = re.compile(u'…', re.UNICODE)
# Differents punctuation patterns are used.
re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
                       (punctuations, punctuations), re.UNICODE)
re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
                         (punctuations, punctuations), re.UNICODE)
re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
re_changehyphen = re.compile(u'–')
re_doublequotes_1 = re.compile(r'(\"\")')
re_doublequotes_2 = re.compile(r'(\'\')')
re_trim = re.compile(r' +', re.UNICODE)

In [68]:
def append_title_text(row):
    connector = '. ' if not row['title'].endswith('.') else ' '
    return row['title'].strip() + connector + row['text'].strip()

def clean_text(text):
    """Apply all regex above to a given string."""
    text = text.lower()
    text = text.replace('\xa0', ' ')
    text = re_tree_dots.sub('...', text)
    text = re.sub('\.\.\.', '', text)
    text = re_remove_brackets.sub('', text)
    text = re_changehyphen.sub('-', text)
    text = re_remove_html.sub(' ', text)
    text = re_transform_numbers.sub('0', text)
    text = re_transform_url.sub('URL', text)
    text = re_transform_emails.sub('EMAIL', text)
    text = re_quotes_1.sub(r'\1"', text)
    text = re_quotes_2.sub(r'"\1', text)
    text = re_quotes_3.sub('"', text)
    text = re.sub('"', '', text)
    text = re_dots.sub('.', text)
    text = re_punctuation.sub(r'\1', text)
    text = re_hiphen.sub(' - ', text)
    text = re_punkts.sub(r'\1 \2 \3', text)
    text = re_punkts_b.sub(r'\1 \2 \3', text)
    text = re_punkts_c.sub(r'\1 \2', text)
    text = re_doublequotes_1.sub('\"', text)
    text = re_doublequotes_2.sub('\'', text)
    text = re_trim.sub(' ', text)
    return text.strip()

def normalize_text(row):
    text = clean_text(row['text'])
    text = ' '.join([stemmer.stem(token) for token in word_tokenize(text)])
    return text

def process_original_dataset(filename, train=True):
    raw_df = pd.read_csv(filename)
    raw_df['text'] = raw_df.apply(lambda row: append_title_text(row), axis=1)
    del raw_df['title']
    if train:
        raw_df = raw_df.rename(columns={'category': 'label'})
    else:
        del raw_df['Unnamed: 0']
    return raw_df

def get_normalized_dataset(filename, train=True):
    raw_df = pd.read_csv(filename)
    raw_df['text'] = raw_df.apply(lambda row: append_title_text(row), axis=1)
    del raw_df['title']
    if train:
        raw_df = raw_df.rename(columns={'category': 'label'})
    else:
        del raw_df['Unnamed: 0']
    raw_df['text'] = raw_df.apply(lambda row: normalize_text(row), axis=1)
    return raw_df

def get_jackson_dataset(filename, train=True):
    raw_df = pd.read_csv(filename)
    raw_df['text'] = raw_df.apply(lambda row: append_title_text(row), axis=1)
    del raw_df['title']
    return raw_df

def write_predictions(predictions, out_path):
    count = 0

    with open(out_path, mode='w', encoding='utf-8') as out_file:
        print('Saving predictions for %s' % validation_data_path)
        out_file.write('id,category\n')
        idx = 0
        for result in predictions:
            count += 1
            out_file.write(str(idx) + ',' + result + '\n')
            idx += 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    out_file.close()
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())
    
def train_model(train_df, dev_df, submission_name):
    text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=stopwords)), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-4, random_state=42))])
    text_clf_svm = text_clf_svm.fit(train_df['text'], train_df['label'])
    predicted_svm = text_clf_svm.predict(train_df['text'])
    print(balanced_accuracy_score(train_df['label'], predicted_svm))
    write_predictions(text_clf_svm.predict(dev_df['text']), 'submissions_' + submission_name + '.csv')

In [69]:
train_model(train_df, dev_df, 'text_clf_svm')

0.9942634155127885
Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Pr

In [70]:
dev_raw_df = process_original_dataset('df_valid.csv', False)
dev_raw_df

Unnamed: 0,text
0,"Vitrine de Dilma, Pronatec terá orçamento 65% ..."
1,"Por direitos autorais e publicidade, 'youtuber..."
2,Rótulos de alimentos terão que alertar sobre l...
3,Sociedade britânica de compositores processa S...
4,"Por Fies, aluna madruga na porta da FMU, mas s..."
5,Cientistas aguardam nascimento raro de 'dragõe...
6,Aplicativo mostra quem está no mesmo voo que v...
7,Pesquisas indicam que sono na direção pode ser...
8,Brasil lançará missão à Lua até 2020 para estu...
9,Cientistas testam vacina contra colesterol e d...


In [71]:
train_raw_df = process_original_dataset('df_train.csv')
train_raw_df

Unnamed: 0,text,label
0,Casa da Barra Funda tem clima roceiro e receit...,comida
1,Professores de SP decidem manter greve; grupo ...,educacao
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia
5,Líderes inovadores se reunem em rede para comp...,empreendedorsocial
6,"Conheça Pandora, atração milionária da Disney ...",turismo
7,Fiesp organiza quarta edição de maratona hacke...,empreendedorsocial
8,Praia do Forte mistura natureza com resorts es...,turismo
9,App de agência avisa quando visto e passaporte...,turismo


In [72]:
train_model(train_raw_df, dev_raw_df, 'text_clf_svm_raw')

0.9941868927089931
Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Pr

In [73]:
train_normal_df = get_normalized_dataset('df_train.csv')
dev_normal_df = get_normalized_dataset('df_valid.csv')
train_model(train_normal_df, dev_normal_df, 'text_clf_svm_normal')

0.989794802052089
Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Pre

In [74]:
train_normal_df.text[0]

'cas da barr fund tem clim roc e receit sabor . no qu qu pão 00 , cas de clim roc em uma rua tranquil da barr fund , o final é um bom começ . o café co é de um lot limit , de grã de uma microrreg min , de sol de orig vulcân , com delic sab herbáce . se é final da tard , um pão de queij robust ( r $ 0,00 o tradic ) , com dos respeit de queij , do val do jequitinhonh , em min , pod ser boa companh . se é par encerr o almoç , vai bem o brig ( r $ 0,00 ) , aquel pux , com os da infânc , com leit condens e nescau . é tent também suger uma fati do bol cas ( r $ 0,00 ) - imagin : fub com goiab ; banana-pass , açúc mascav e ave , cobert com doc de leit ; ceno , em prat de antig . meu palpit , porém , é que aind falt par que os bol sej irresist : em tod as visit , est sec . a pequen cas é embal por músic popul brasil , tem pouc mes adorn com croch e lembr o acolh da avó , reforç pel presenç de uma moring com águ da qual os client pod se serv , no minúscul quint . o tom hospital avanç par a co ,

In [75]:
col = ['label', 'text']
df = train_raw_df[col]
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,label,text,label_id
0,comida,Casa da Barra Funda tem clima roceiro e receit...,0
1,educacao,Professores de SP decidem manter greve; grupo ...,1
2,empreendedorsocial,"Em segunda edição, concurso paga R$ 35 mil par...",2
3,equilibrioesaude,Usar maconha por anos não faz tão mal para a s...,3
4,ciencia,Baleia-azul percorre 5.200 km e revela a cient...,4


In [76]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words=stopwords)
features = tfidf.fit_transform(df.text).toarray()
labels = df.label_id
features.shape

(7894, 52565)

In [77]:
label_to_id

{'comida': 0,
 'educacao': 1,
 'empreendedorsocial': 2,
 'equilibrioesaude': 3,
 'ciencia': 4,
 'turismo': 5,
 'sobretudo': 6,
 'tec': 7,
 'ambiente': 8}

In [78]:
N = 2
for label, label_id in sorted(label_to_id.items()):
    features_chi2 = chi2(features, labels == label_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(label))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'ambiente':
  . Most correlated unigrams:
. aquecimento
. desmatamento
  . Most correlated bigrams:
. aquecimento global
. acordo paris
# 'ciencia':
  . Most correlated unigrams:
. espacial
. cientistas
  . Most correlated bigrams:
. agência espacial
. mensageiro sideral
# 'comida':
  . Most correlated unigrams:
. restaurante
. chef
  . Most correlated bigrams:
. nina horta
. tel 11
# 'educacao':
  . Most correlated unigrams:
. alunos
. ensino
  . Most correlated bigrams:
. ministério educação
. ensino médio
# 'empreendedorsocial':
  . Most correlated unigrams:
. empreendedores
. empreendedor
  . Most correlated bigrams:
. prêmio empreendedor
. empreendedor social
# 'equilibrioesaude':
  . Most correlated unigrams:
. saúde
. pacientes
  . Most correlated bigrams:
. saúde folha
. editoria saúde
# 'sobretudo':
  . Most correlated unigrams:
. apartamentos
. m²
  . Most correlated bigrams:
. eduardo sodré
. colaboração folha
# 'tec':
  . Most correlated unigrams:
. usuários
. apple
  . M

In [79]:
# for i in range(2, 21):
#     tfidf = TfidfVectorizer(sublinear_tf=True, min_df=i, ngram_range=(1, 2), stop_words=stopwords)
#     features = tfidf.fit_transform(df.text).toarray()

#     clf_2 = MultinomialNB().fit(features, df.label)
#     print('Results for minimum of %d documents: %s' % (i, balanced_accuracy_score(df['label'], clf_2.predict(features))))

In [80]:
# models = [
#     RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
#     LinearSVC(random_state=0),
#     MultinomialNB(),
#     LogisticRegression(random_state=0),
#     SGDClassifier(random_state=0)
# ]
# CV = 5
# cv_df = pd.DataFrame(index=range(CV * len(models)))
# entries = []
# for model in models:
#     model_name = model.__class__.__name__
#     accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
#     for fold_idx, accuracy in enumerate(accuracies):
#         entries.append((model_name, fold_idx, accuracy))
# cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# sns.boxplot(x='model_name', y='accuracy', data=cv_df)
# sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
#               size=8, jitter=True, edgecolor="gray", linewidth=2)
# plt.show()

In [81]:
# cv_df.groupby('model_name').accuracy.mean()

In [82]:
# for i in range(2, 21):
#     tfidf = TfidfVectorizer(sublinear_tf=True, min_df=i, ngram_range=(1, 2), stop_words=stopwords)
#     features = tfidf.fit_transform(df.text).toarray()

#     clf_2 = LinearSVC(random_state=0).fit(features, df.label)
#     print('Results for minimum of %d documents: %s' % (i, balanced_accuracy_score(df['label'], clf_2.predict(features))))

In [83]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, ngram_range=(1, 2), stop_words=stopwords)
features = tfidf.fit_transform(df.text).toarray()

clf_2 = LinearSVC(random_state=0).fit(features, df.label)
balanced_accuracy_score(df['label'], clf_2.predict(features))

0.9986459917070434

In [84]:
N = 2
for label, label_id in sorted(label_to_id.items()):
    indices = np.argsort(clf_2.coef_[label_id])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(label))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

# 'ambiente':
  . Top unigrams:
       . turistas
       . turismo
  . Top bigrams:
       . silvio cioffi
       . companhias aéreas
# 'ciencia':
  . Top unigrams:
       . social
       . empreendedor
  . Top bigrams:
       . empreendedor social
       . prêmio empreendedor
# 'comida':
  . Top unigrams:
       . amazônia
       . desmatamento
  . Top bigrams:
       . acordo paris
       . mudanças climáticas
# 'educacao':
  . Top unigrams:
       . cientistas
       . pesquisadores
  . Top bigrams:
       . maurício tuffani
       . darwin deus
# 'empreendedorsocial':
  . Top unigrams:
       . chef
       . restaurante
  . Top bigrams:
       . nina horta
       . tel 11
# 'equilibrioesaude':
  . Top unigrams:
       . educação
       . ensino
  . Top bigrams:
       . ensino médio
       . ministério educação
# 'sobretudo':
  . Top unigrams:
       . paulo
       . colaboração
  . Top bigrams:
       . colaboração folha
       . eduardo sodré
# 'tec':
  . Top unigrams:
       . a

In [85]:
train_normal_df = get_jackson_dataset('df_jackson/df_train_jackson.csv')
dev_normal_df = get_jackson_dataset('df_jackson/df_valid_jackson.csv')

In [86]:
text_clf_svc = Pipeline([('tfidf_vect', TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words=stopwords)), 
                         ('clf-svc', LinearSVC(random_state=0, max_iter=3000, tol=1e-6))])
text_clf_svc = text_clf_svc.fit(train_normal_df['text'], train_normal_df['category'])
predicted_svc = text_clf_svc.predict(train_normal_df['text'])
balanced_accuracy_score(train_normal_df['category'], predicted_svc)

0.9986459917070434

In [87]:
write_predictions(text_clf_svc.predict(dev_normal_df['text']), 'submissions_jackson.csv')

Saving predictions for df_valid.jsonl
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 senten

In [89]:
f1_score(train_normal_df['category'], predicted_svc, average='micro')

0.9991132505700532

In [None]:
text_clf_svc

In [None]:
write_predictions(text_clf_svc.predict(dev_normal_df['text']), 'submissions_normal_linear_svc.csv')

In [None]:
text_clf_svc.get_params()['clf-svc'].coef_.shape

In [None]:
N = 2
for label, label_id in sorted(label_to_id.items()):
    indices = np.argsort(text_clf_svc.get_params()['clf-svc'].coef_[label_id])
    feature_names = np.array(text_clf_svc.get_params()['tfidf_vect'].get_feature_names())[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(label))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
text_clf_svc = Pipeline([('tfidf_vect', TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words=stopwords)), ('clf-svc', LinearSVC(random_state=0))])

In [None]:
text_clf_svc = text_clf_svc.fit(train_df['text'], train_df['label'])
predicted_svc = text_clf_svc.predict(train_df['text'])
balanced_accuracy_score(train_df['label'], predicted_svc)

In [None]:
write_predictions(text_clf_svc.predict(dev_df['text']), 'submissions_linear_svc.csv')

In [None]:
N = 2
for text, label in sorted(l.items()):
    indices = np.argsort(text_clf_svc.coef_[category_id])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(Product))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    táprint("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
tfidf_3 = 
features_3 = tfidf_3.fit_transform(df.text).toarray()

clf_3 = LinearSVC(random_state=0).fit(features_3, df.label)
balanced_accuracy_score(df['label'], clf_3.predict(features_3))

In [None]:
features_3.shape

In [None]:
dev_features = tfidf_3.fit(dev_df['text'])
dev_features.shape

In [None]:
write_predictions(clf_3.predict(dev_features), 'submissions_linear_svc.csv')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
X_train_tfidf.shape

In [None]:
df_count = count_vect.transform(df['text'])

In [None]:
df_count.shape

In [None]:
train_raw_df.to_csv('train_text_label.csv', index=None)

In [90]:
dev_raw_df.to_csv('dev_text.csv', index=None)