In [59]:
import pandas as pd
import numpy as np

### Python and str

[check by yourself](https://www.programiz.com/python-programming/methods/string)
- slicing 
- listing
- \[::-1\]
- join
- split
- lower
- upper
- count
- startswith
- edswith
- formating 
- strip 
- encoding 
- u, r, b, f 

### Tokenization

In [11]:
import nltk
import re
text = "I want to study NLP's techniques, isn't it?"

In [12]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['I', 'want', 'to', 'study', "NLP's", 'techniques,', "isn't", 'it?']

In [13]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['I',
 'want',
 'to',
 'study',
 'NLP',
 "'s",
 'techniques',
 ',',
 'is',
 "n't",
 'it',
 '?']

In [15]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['I',
 'want',
 'to',
 'study',
 'NLP',
 "'",
 's',
 'techniques',
 ',',
 'isn',
 "'",
 't',
 'it',
 '?']

### Стемминг и Лемматизация

In [16]:
text = "feet wolves cats talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

In [17]:
stemmer = nltk.stem.PorterStemmer()
" ".join(stemmer.stem(token) for token in tokens)

'feet wolv cat talk'

In [18]:
stemmer = nltk.stem.WordNetLemmatizer()
" ".join(stemmer.lemmatize(token) for token in tokens)

'foot wolf cat talked'

### stopwords

In [58]:
nltk.download('stopwords')
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/serge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### BOW

Pipline:
- find unique words
- enumerate it, map with dictionary
- create zero vectors 
- fill the values inside loop

In [None]:
### code (example1)

### TF_IDF

TF термина а = (Количество раз, когда термин а встретился в тексте / количество всех слов в тексте)

IDF термина а = логарифм(Общее количество документов / Количество документов, в которых встречается термин а)

In [None]:
### code (example2)

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
   'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
 ]
vectorizer = TfidfVectorizer()

# extra params 

# ngram_range=(1,2) N_GRAMS

# min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
# min_df = 5 means "ignore terms that appear in less than 5 documents".


# max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
# max_df = 25 means "ignore terms that appear in more than 25 documents".

X = vectorizer.fit_transform(corpus)

In [63]:
pd.DataFrame(X.todense(), columns= vectorizer.get_feature_names())

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


### Hashing 

In [2]:
import hashlib
hash_object = hashlib.md5(b'Hello World')
print(hash_object.hexdigest())`

b10a8db164e0754105b7a99be72e3fe5


In [31]:
import hashlib # sha224, sha256, 384, 512, 1
hash_object = hashlib.sha1(b'Hello World')
hex_dig = hash_object.hexdigest()
print(hex_dig)
len(str(int(hex_dig, 32)))

0a4d55a8d778e5022fab701977c5d840bbc486d0


59

In [30]:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=5, input_type='string')
f = h.transform(['hello world', 'hell'])
f.toarray()

array([[ 0., -1.,  4.,  2.,  2.],
       [-1.,  0.,  2.,  0.,  1.]])

### Gensim

ДЗ: проделать все то же самое с gensim (кроме hashing)

[Инструкция](https://radimrehurek.com/gensim/tut1.html#from-strings-to-vectors)

### Optional

In [66]:
from pymystem3 import Mystem

text = "Красивая мама красиво мыла раму"
m = Mystem() # lemmatize, analyse
lemmas = m.lemmatize(text) 

Installing mystem to /home/serge/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [69]:
m.analyze(text)[0]

{'analysis': [{'lex': 'красивый', 'wt': 1, 'gr': 'A=им,ед,полн,жен'}],
 'text': 'Красивая'}