In [27]:
import re

import numpy  as np
import pandas as pd
import spacy
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from spacy.tokens import Doc

In [28]:
# Load train.csv and test.csv from the current working directory.
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
data_origin = pd.read_csv(TRAIN_CSV)
data_test = pd.read_csv(TEST_CSV)

In [29]:
# Bag-of-words vocabulary learned from the `description` column: English stop
# words are removed and a term must appear in at least 5 documents to be kept.
vect = CountVectorizer(stop_words="english", min_df=5)
vect.fit(data_origin["description"])

In [30]:
# Encode each description as a row of term counts over the fitted vocabulary.
x_train = vect.transform(data_origin["description"])

In [12]:
# Show the matrix summary (its repr) rather than its contents.
x_train_summary = repr(x_train)
print("x_train:\n {}".format(x_train_summary))

x_train:
 <2931x1063 sparse matrix of type '<class 'numpy.int64'>'
	with 24089 stored elements in Compressed Sparse Row format>


In [13]:
# Inspect the vocabulary learned by `vect`.
# `get_feature_names()` no longer exists in current scikit-learn releases; its
# replacement `get_feature_names_out()` returns an array, so it is converted to
# a plain list of str to keep the printed output in the same list form.
if hasattr(vect, "get_feature_names_out"):
    feature_names = [str(name) for name in vect.get_feature_names_out()]
else:
    feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
# The label now matches the slice actually taken; the old label said
# "2000 to 2020", but this vocabulary has far fewer than 2000 entries.
print("Features 500 to 520:\n{}".format(feature_names[500:520]))
print("Every 200th features:\n{}".format(feature_names[::200]))

Number of features: 1063
First 20 features:
['ability', 'able', 'abreast', 'acceptance', 'access', 'accordance', 'according', 'account', 'accuracy', 'accurate', 'accurately', 'achieve', 'acquisition', 'act', 'action', 'actionable', 'actions', 'active', 'actively', 'activities']
Features 2000 to 2020:
['inspection', 'inspections', 'installation', 'integrate', 'integrated', 'integrates', 'integrating', 'integration', 'integrity', 'intelligence', 'interact', 'interaction', 'interactions', 'interface', 'interfaces', 'internal', 'interpret', 'investigate', 'investigations', 'issue']
Every 200th features:
['ability', 'consulting', 'fine', 'meets', 'regarding', 'translating']


In [20]:
# Unigram-only vocabulary over the descriptions (no stop-word or min_df filtering).
cv = CountVectorizer(ngram_range=(1,1)).fit(data_origin.description)
print("Vocabulary size:{}".format(len(cv.vocabulary_)))
# `get_feature_names()` no longer exists in current scikit-learn releases; use
# `get_feature_names_out()` when available and convert to a plain list of str
# so the printed form stays a list.
if hasattr(cv, "get_feature_names_out"):
    vocabulary_terms = [str(term) for term in cv.get_feature_names_out()]
else:
    vocabulary_terms = cv.get_feature_names()
print("Vocabulary:\n{}".format(vocabulary_terms))

Vocabulary size:4324
Vocabulary:
['000', '10', '11', '20', '2003', '2008', '2012', '2013', '2016', '24', '25', '260', '2d', '30', '32', '36', '365', '3d', '3rd', '40', '50', '500', '500m', '523', '5g', '64', '70', 'ab', 'abilities', 'ability', 'able', 'about', 'above', 'abreast', 'abroad', 'absent', 'abstracting', 'abstractions', 'academic', 'acc', 'accelerate', 'acceleration', 'accelerator', 'acceptable', 'acceptance', 'accepted', 'access', 'accessibility', 'accessible', 'accessories', 'accident', 'accommodating', 'accompany', 'accomplish', 'accomplished', 'accomplishes', 'accomplishing', 'accordance', 'according', 'accordingly', 'account', 'accountability', 'accountable', 'accountants', 'accounting', 'accounts', 'accuracy', 'accurate', 'accurately', 'achieve', 'achievement', 'achieving', 'acknowledgment', 'acoustic', 'acquire', 'acquires', 'acquisition', 'acquisitions', 'across', 'act', 'acting', 'action', 'actionable', 'actions', 'activates', 'activation', 'active', 'actively', 'act

In [21]:
# Bigram-only vocabulary over the descriptions (no stop-word or min_df filtering).
cv = CountVectorizer(ngram_range=(2,2)).fit(data_origin.description)
print("Vocabulary size:{}".format(len(cv.vocabulary_)))
# `get_feature_names()` no longer exists in current scikit-learn releases; use
# `get_feature_names_out()` when available and convert to a plain list of str
# so the printed form stays a list.
if hasattr(cv, "get_feature_names_out"):
    vocabulary_terms = [str(term) for term in cv.get_feature_names_out()]
else:
    vocabulary_terms = cv.get_feature_names()
print("Vocabulary:\n{}".format(vocabulary_terms))

Vocabulary size:22944
Vocabulary:
['000 computing', '10 cm', '10 of', '20 overnight', '20 staff', '2003 2013', '2008 2012', '2012 2016', '2013 office', '24 global', '25 backend', '25 middle', '25 pounds', '25 travel', '260 pci', '2d and', '30 of', '32 64', '32 bit', '36 523', '365 crm', '365 platforms', '3d engineering', '3d seismic', '3d visualizations', '3rd party', '40 50', '50 front', '50 travel', '500 000', '500m records', '523 cr', '5g fdd', '5g ue', '64 bit', '70 large', 'ab beckhoff', 'ab sew', 'ab siemens', 'abilities for', 'abilities of', 'ability and', 'ability in', 'ability of', 'ability to', 'ability tomonitor', 'ability tomulti', 'able to', 'about and', 'about customer', 'about how', 'about incoming', 'about machine', 'about new', 'about product', 'about progress', 'about the', 'about their', 'about workflows', 'above customers', 'above in', 'above studies', 'above to', 'abreast of', 'abstracting and', 'abstractions and', 'academic and', 'academic community', 'academic in

In [38]:
# Token pattern: runs of two or more word characters bounded by word breaks
# (the same pattern string CountVectorizer uses as its default token_pattern).
TOKEN_PATTERN = '(?u)\\b\\w\\w+\\b'
regexp = re.compile(TOKEN_PATTERN)

In [43]:
# Load the English spaCy pipeline. The 'en' shortcut is tried first so existing
# setups load the same model as before; spaCy 3 removed shortcut links and
# raises OSError for them, in which case the packaged small English model is
# loaded instead.
try:
    en_nlp = spacy.load('en')
except OSError:
    en_nlp = spacy.load('en_core_web_sm')

# Reference to spaCy's own tokenizer, kept so it can still be inspected or restored.
old_tokenizer = en_nlp.tokenizer


def _regexp_tokenizer(string):
    """Split `string` with the module-level `regexp` and wrap the tokens in a spaCy Doc.

    Replaces the deprecated `tokenizer.tokens_from_list(...)` call with the
    documented equivalent, `Doc(vocab, words=[...])`, so the rest of the
    pipeline runs on exactly the tokens the regular expression finds.
    """
    return Doc(en_nlp.vocab, words=regexp.findall(string))


# Make the pipeline tokenize with the regular expression instead of spaCy's rules.
en_nlp.tokenizer = _regexp_tokenizer

In [44]:
def custom_tokenizer(document):
    """Return the lemma of every token spaCy produces for `document`.

    The text is run through the module-level `en_nlp` pipeline and the
    `lemma_` attribute of each resulting token is collected, in order.
    """
    lemmas = []
    for tok in en_nlp(document):
        lemmas.append(tok.lemma_)
    return lemmas

In [57]:
# Count vectorizer that tokenizes through `custom_tokenizer` (spaCy lemmas)
# and counts 1- to 3-grams, with English stop words removed and min_df=5.
lemma_vect = CountVectorizer(
    tokenizer=custom_tokenizer,
    ngram_range=(1, 3),
    stop_words="english",
    min_df=5,
)

In [58]:
# Learn the lemma-based vocabulary and build the document-term matrix in one call.
x_train_lemma = lemma_vect.fit_transform(data_origin["description"])

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
# Report the dimensions of the lemma-based document-term matrix.
lemma_shape = x_train_lemma.shape
print("x_train_lemma.shape:{}".format(lemma_shape))

x_train_lemma.shape:(2931, 1382)


In [60]:
# Same n-gram / stop-word / min_df settings as the lemma vectorizer, but with
# the default tokenizer, so the two feature counts can be compared.
# NOTE: this rebinds `vect` and `x_train` from the earlier unigram cells.
vect = CountVectorizer(ngram_range=(1, 3), stop_words="english", min_df=5)
vect.fit(data_origin["description"])
x_train = vect.transform(data_origin["description"])
print("x_train.shape:{}".format(x_train.shape))

x_train.shape:(2931, 1431)
