In [20]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

import string
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
paragraph = '''Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'''

documents = nltk.sent_tokenize(paragraph)
for doc in documents:
    print(f'{doc}\n')

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.

The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.

The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.



In [6]:
word_tokens = [nltk.word_tokenize(doc) for doc in documents] 
print(word_tokens)

[['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.'], ['The', 'result', 'is', 'a', 'computer', 'capable', 'of', '``', 'understanding', "''", 'the', 'contents', 'of', 'documents', ',', 'including', 'the', 'contextual', 'nuances', 'of', 'the', 'language', 'within', 'them', '.'], ['The', 'technology', 'can', 'then', 'accurately', 'extract', 'information', 'and', 'insights', 'contained', 'in', 'the', 'documents', 'as', 'well', 'as', 'categorize', 'and', 'organize', 'the', 'documents', 'themselves', '.']]


In [7]:
filtered_tokens = []
regex_str =  '^[\W_]+$'
stop_words = set(stopwords.words('english'))

for words in word_tokens:
    new_doc = [word for word in words if word not in stop_words and not re.match(regex_str, word)]
    print(f'{new_doc}\n')
    filtered_tokens.append(new_doc)

['Natural', 'language', 'processing', 'NLP', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'language', 'particular', 'program', 'computers', 'process', 'analyze', 'large', 'amounts', 'natural', 'language', 'data']

['The', 'result', 'computer', 'capable', 'understanding', 'contents', 'documents', 'including', 'contextual', 'nuances', 'language', 'within']

['The', 'technology', 'accurately', 'extract', 'information', 'insights', 'contained', 'documents', 'well', 'categorize', 'organize', 'documents']



In [8]:
stemmer = PorterStemmer()
print('Stemming result \n')

for doc in filtered_tokens:
    new_doc = [stemmer.stem(word) for word in doc]
    print(f'{new_doc}\n')

Stemming result 

['natur', 'languag', 'process', 'nlp', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'comput', 'human', 'languag', 'particular', 'program', 'comput', 'process', 'analyz', 'larg', 'amount', 'natur', 'languag', 'data']

['the', 'result', 'comput', 'capabl', 'understand', 'content', 'document', 'includ', 'contextu', 'nuanc', 'languag', 'within']

['the', 'technolog', 'accur', 'extract', 'inform', 'insight', 'contain', 'document', 'well', 'categor', 'organ', 'document']



In [9]:
lemmatizer = WordNetLemmatizer()
print('Lemmatizer result \n')

for doc in filtered_tokens:
    new_doc = [lemmatizer.lemmatize(word) for word in doc]
    print(f'{new_doc}\n')

Lemmatizer result 

['Natural', 'language', 'processing', 'NLP', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interaction', 'computer', 'human', 'language', 'particular', 'program', 'computer', 'process', 'analyze', 'large', 'amount', 'natural', 'language', 'data']

['The', 'result', 'computer', 'capable', 'understanding', 'content', 'document', 'including', 'contextual', 'nuance', 'language', 'within']

['The', 'technology', 'accurately', 'extract', 'information', 'insight', 'contained', 'document', 'well', 'categorize', 'organize', 'document']



In [10]:
corpus = []
print('transformed data.. \n')

for doc in documents:
    data = re.sub('[^a-zA-Z]', ' ', doc)
    data = data.lower()
    data = data.split()
    data = [lemmatizer.lemmatize(word) for word in data if word not in stop_words]
    
    new_doc = ' '.join(data)
    print(f'{new_doc} \n')
    corpus.append(new_doc)

transformed data.. 

natural language processing nlp subfield linguistics computer science artificial intelligence concerned interaction computer human language particular program computer process analyze large amount natural language data 

result computer capable understanding content document including contextual nuance language within 

technology accurately extract information insight contained document well categorize organize document 



In [11]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
X

array([[0, 1, 1, 1, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 3, 1,
        1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]])

In [12]:
print(cv.get_feature_names())

['accurately', 'amount', 'analyze', 'artificial', 'capable', 'categorize', 'computer', 'concerned', 'contained', 'content', 'contextual', 'data', 'document', 'extract', 'human', 'including', 'information', 'insight', 'intelligence', 'interaction', 'language', 'large', 'linguistics', 'natural', 'nlp', 'nuance', 'organize', 'particular', 'process', 'processing', 'program', 'result', 'science', 'subfield', 'technology', 'understanding', 'well', 'within']


In [18]:
cv_ngram = CountVectorizer(analyzer='word', ngram_range=(2, 3))
X = cv_ngram.fit_transform(corpus).toarray()
X

array([[0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1]])

In [19]:
print(cv_ngram.get_feature_names())

['accurately extract', 'accurately extract information', 'amount natural', 'amount natural language', 'analyze large', 'analyze large amount', 'artificial intelligence', 'artificial intelligence concerned', 'capable understanding', 'capable understanding content', 'categorize organize', 'categorize organize document', 'computer capable', 'computer capable understanding', 'computer human', 'computer human language', 'computer process', 'computer process analyze', 'computer science', 'computer science artificial', 'concerned interaction', 'concerned interaction computer', 'contained document', 'contained document well', 'content document', 'content document including', 'contextual nuance', 'contextual nuance language', 'document including', 'document including contextual', 'document well', 'document well categorize', 'extract information', 'extract information insight', 'human language', 'human language particular', 'including contextual', 'including contextual nuance', 'information insi

In [22]:
tfidfvec = TfidfVectorizer()
X = tfidfvec.fit_transform(corpus).toarray()
X

array([[0.        , 0.17842586, 0.17842586, 0.17842586, 0.        ,
        0.        , 0.4070924 , 0.17842586, 0.        , 0.        ,
        0.        , 0.17842586, 0.        , 0.        , 0.17842586,
        0.        , 0.        , 0.        , 0.17842586, 0.17842586,
        0.4070924 , 0.17842586, 0.17842586, 0.35685172, 0.17842586,
        0.        , 0.        , 0.17842586, 0.17842586, 0.17842586,
        0.17842586, 0.        , 0.17842586, 0.17842586, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.32049968,
        0.        , 0.24374827, 0.        , 0.        , 0.32049968,
        0.32049968, 0.        , 0.24374827, 0.        , 0.        ,
        0.32049968, 0.        , 0.        , 0.        , 0.        ,
        0.24374827, 0.        , 0.        , 0.        , 0.        ,
        0.32049968, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.32049968, 0.        , 0.        , 0.        ,
   

In [23]:
print(tfidfvec.get_feature_names())

['accurately', 'amount', 'analyze', 'artificial', 'capable', 'categorize', 'computer', 'concerned', 'contained', 'content', 'contextual', 'data', 'document', 'extract', 'human', 'including', 'information', 'insight', 'intelligence', 'interaction', 'language', 'large', 'linguistics', 'natural', 'nlp', 'nuance', 'organize', 'particular', 'process', 'processing', 'program', 'result', 'science', 'subfield', 'technology', 'understanding', 'well', 'within']
