In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load('pt')


df = pd.read_csv('enunciados_novos.csv')
df.head()

Unnamed: 0,Id,Enunciado,Tópico,Contexto
0,994,Meu primeiro programa\nEscreva um programa que...,Ambientação,Outros
1,996,Impressão de caracteres na tela\nEscreva um pr...,Ambientação,Outros
2,1326,Adição\nSosígenes e Jocasta foram a um restaur...,Ambientação,Comercial
3,1327,Subtração\nHeráclito e Fredegunda foram a um r...,Ambientação,Comercial
4,1328,Multiplicação\nVinte amigos foram a um rodízio...,Ambientação,Comercial


In [2]:
df['Tópico'].value_counts()

Estrutura condicional encadeada        55
Estrutura de repetição por contagem    52
Estrutura condicional composta         44
Estrutura de repetição por condição    43
Vetor                                  41
Matrizes                               36
Operadores aritmétricos                29
Strings                                20
Ambientação                            16
Estrutura sequencial                   12
Name: Tópico, dtype: int64

In [3]:
df['Contexto'].value_counts()

Matemático         94
Jogos              40
Comercial          40
Escolar            30
Outros             23
Física             14
Esporte            13
Pessoa             11
Bancário           10
Data e hora        10
Trânsito            9
RH                  8
Português           7
Filmes e séries     7
Geografia           6
População           6
Consumo             5
Saúde               4
Computacional       3
Segurança           3
Pesquisa            2
Imposto             2
Loteria             1
Name: Contexto, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
X = df['Enunciado']
y = df['Contexto']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

In [5]:
class LemmaTokenizer(object):
    def __call__(self, text):
        doc=nlp(text)
        tokens=[]
        for token in doc:
            if(token.text.isalpha() and len(token.text)>=2):
                tokens+=[token]
        return [t.lemma_ for t in tokens if t not in nlp.Defaults.stop_words]
pipeline= LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=0.001,
    C=10.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=10,
    class_weight=None,
    verbose=0,
    random_state=None,
    max_iter=10000,
)
# RUN THIS CELL TO ADD STOPWORDS TO THE LINEAR SVC PIPELINE:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=nlp.Defaults.stop_words,
                            tokenizer=LemmaTokenizer())),
                     ('clf',pipeline),
])
text_clf_lsvc2.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'acerca', 'ademais', 'adeus',
                                             'agora', 'ainda', 'algo',
                                             'alguma...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7f501c108cc0>,
                                 use_id

In [6]:
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

[[ 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  0  0  0  0  1  0  0  0  1  0  0  0]
 [ 0  0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 11  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 30  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  5  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0  0  0

In [7]:
print(metrics.classification_report(y_test,predictions))

               precision    recall  f1-score   support

     Bancário       1.00      1.00      1.00         3
    Comercial       1.00      0.88      0.93        16
Computacional       0.00      0.00      0.00         0
  Data e hora       1.00      0.60      0.75         5
      Escolar       1.00      1.00      1.00         7
      Esporte       1.00      1.00      1.00         6
       Física       1.00      1.00      1.00         5
    Geografia       1.00      1.00      1.00         1
        Jogos       0.85      0.92      0.88        12
   Matemático       0.88      0.97      0.92        31
       Outros       0.71      0.83      0.77         6
       Pessoa       1.00      0.50      0.67         2
    População       0.00      0.00      0.00         1
    Português       1.00      1.00      1.00         3
           RH       0.67      1.00      0.80         2
        Saúde       1.00      1.00      1.00         1
    Segurança       1.00      1.00      1.00         1
     Trân

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [8]:
print(metrics.accuracy_score(y_test,predictions))

0.9047619047619048


In [9]:
X1 = df['Enunciado']
y = df['Contexto']
vect= TfidfVectorizer(stop_words=nlp.Defaults.stop_words,tokenizer=LemmaTokenizer())
X=vect.fit_transform(X1)
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(pipeline, X, y, scoring='accuracy', cv=10)

  'stop_words.' % sorted(inconsistent))


In [10]:
print('Acc.: %.2f [+/-%.2f]' % (cv_score.mean(), cv_score.std()))

Acc.: 0.86 [+/-0.07]


In [11]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
y_pred=cross_val_predict(pipeline,X,y,cv=10)
print(confusion_matrix(y,y_pred))



[[ 9  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0 36  0  1  0  0  0  0  0  0  0  1  0  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  1  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  7  0  0  0  0  0  0  1  0  0  1  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0 28  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  6  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0 38  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0 90  2  0  1  0  0  0  0

In [12]:
print(classification_report(y,y_pred))

                 precision    recall  f1-score   support

       Bancário       1.00      0.90      0.95        10
      Comercial       0.92      0.90      0.91        40
  Computacional       0.67      0.67      0.67         3
        Consumo       0.80      0.80      0.80         5
    Data e hora       0.64      0.70      0.67        10
        Escolar       0.90      0.93      0.92        30
        Esporte       1.00      1.00      1.00        13
Filmes e séries       1.00      0.86      0.92         7
         Física       0.87      0.93      0.90        14
      Geografia       1.00      1.00      1.00         6
        Imposto       1.00      1.00      1.00         2
          Jogos       0.86      0.95      0.90        40
        Loteria       0.00      0.00      0.00         1
     Matemático       0.80      0.96      0.87        94
         Outros       0.71      0.52      0.60        23
       Pesquisa       0.00      0.00      0.00         2
         Pessoa       0.88    

  'precision', 'predicted', average, warn_for)


In [13]:
X = df['Enunciado']
y = df['Tópico']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=nlp.Defaults.stop_words,
                            tokenizer=LemmaTokenizer())),
                     ('clf',pipeline),
])
text_clf_lsvc2.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'acerca', 'ademais', 'adeus',
                                             'agora', 'ainda', 'algo',
                                             'alguma...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x7f501a66d9b0>,
                                 use_id

In [15]:
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

[[ 3  0  0  1  0  0  0  0  0  0]
 [ 0  9  1  0  1  1  0  2  0  1]
 [ 0  2 13  0  1  0  1  1  0  0]
 [ 0  1  0 15  0  0  0  0  0  0]
 [ 0  0  1  0  7  0  0  0  2  2]
 [ 0  3  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0 11  1  0  0]
 [ 0  3  1  3  0  0  0  3  0  0]
 [ 0  1  0  0  3  0  1  0  3  0]
 [ 0  0  0  0  1  0  0  0  0  5]]


In [16]:
print(metrics.accuracy_score(y_test,predictions))

0.6666666666666666


In [17]:
X1 = df['Enunciado']
y = df['Tópico']
vect= TfidfVectorizer(stop_words=nlp.Defaults.stop_words,tokenizer=LemmaTokenizer())
X=vect.fit_transform(X1)
cv_score = cross_val_score(pipeline, X, y, scoring='accuracy', cv=10)

  'stop_words.' % sorted(inconsistent))


In [18]:
print('Acc.: %.2f [+/-%.2f]' % (cv_score.mean(), cv_score.std()))

Acc.: 0.65 [+/-0.13]


In [19]:
y_pred=cross_val_predict(pipeline,X,y,cv=10)
print(confusion_matrix(y,y_pred))



[[ 8  2  1  1  1  0  0  2  1  0]
 [ 0 22  6  1  1  4  2  7  1  0]
 [ 0  6 46  0  1  0  1  1  0  0]
 [ 0  1  1 33  0  0  1  7  0  0]
 [ 0  2  3  1 33  0  3  0  3  7]
 [ 0  8  0  1  0  2  0  1  0  0]
 [ 0  1  2  1  0  1 30  0  1  0]
 [ 0  6  2  2  1  2  0 14  0  2]
 [ 2  1  1  2  4  0  2  0  5  3]
 [ 1  1  1  0  6  0  0  0  2 30]]


In [20]:
print(classification_report(y,y_pred))

                                     precision    recall  f1-score   support

                        Ambientação       0.73      0.50      0.59        16
     Estrutura condicional composta       0.44      0.50      0.47        44
    Estrutura condicional encadeada       0.73      0.84      0.78        55
Estrutura de repetição por condição       0.79      0.77      0.78        43
Estrutura de repetição por contagem       0.70      0.63      0.67        52
               Estrutura sequencial       0.22      0.17      0.19        12
                           Matrizes       0.77      0.83      0.80        36
            Operadores aritmétricos       0.44      0.48      0.46        29
                            Strings       0.38      0.25      0.30        20
                              Vetor       0.71      0.73      0.72        41

                           accuracy                           0.64       348
                          macro avg       0.59      0.57      0.58       3