In [65]:
import nltk
import re
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [66]:
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
text = """A major drawback of statistical methods is that they require elaborate feature engineering.
Since the early 2010s, the field has thus largely abandoned statistical methods and shifted to neural networks for machine learning.
Popular techniques include the use of word embeddings to capture semantic properties of words, and an increase in end-to-end learning of a higher-level task (e.g., question answering) instead of relying on a pipeline of separate intermediate tasks (e.g., part-of-speech tagging and dependency parsing).
In some areas, this shift has entailed substantial changes in how NLP systems are designed, such that deep neural network-based approaches may be viewed as a new paradigm distinct from statistical natural language processing.
For instance, the term neural machine translation (NMT) emphasizes the fact that deep learning-based approaches to machine translation directly learn sequence-to-sequence transformations, obviating the need for intermediate steps such as word alignment and language modeling that was used in statistical machine translation (SMT).
Latest works tend to use non-technical structure of a given task to build proper neural network
"""

In [72]:
sentences =sent_tokenize(text)
stop_words = set(stopwords.words('english'))

corpus = []
stemmer = PorterStemmer()


for token in sentences :
  review = re.sub('[^a-zA-Z]',' ',token)
  review = re.sub("\b[a-zA-Z]\b", " ", review)
  review = review.lower()
  review = word_tokenize(review)
  review = [token for token in review if token not in stop_words]
  review = [stemmer.stem(token) for token in review]
  review = " ".join(review)
  corpus.append(review)


['major drawback statist method requir elabor featur engin']
['major drawback statist method requir elabor featur engin', 'sinc earli field thu larg abandon statist method shift neural network machin learn']
['major drawback statist method requir elabor featur engin', 'sinc earli field thu larg abandon statist method shift neural network machin learn', 'popular techniqu includ use word embed captur semant properti word increas end end learn higher level task e g question answer instead reli pipelin separ intermedi task e g part speech tag depend pars']
['major drawback statist method requir elabor featur engin', 'sinc earli field thu larg abandon statist method shift neural network machin learn', 'popular techniqu includ use word embed captur semant properti word increas end end learn higher level task e g question answer instead reli pipelin separ intermedi task e g part speech tag depend pars', 'area shift entail substanti chang nlp system design deep neural network base approach may

vectorization

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow = cv.fit_transform(corpus).toarray()

In [74]:
bow

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
tfidf = tf.fit_transform(corpus).toarray()

In [76]:
tfidf

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.37730799,
        0.        , 0.37730799, 0.        , 0.        , 0.        ,
        0.37730799, 0.        , 0.        , 0.37730799, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.37730799, 0.        ,
        0.30939795, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37730799, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.22384142,
        0.        , 0.        , 0.        , 0.  