# How to tokenize text in Python

In [22]:
# Sample inputs shared by every tokenizer demo below.
# Authors of the two English quotes (text1 and text2, in that order).
author1, author2 = "jobs", "gates"

# Long English quote, written as adjacent literals so each line stays readable;
# Python joins them into one string with no extra characters.
text1 = (
    "Here’s to the crazy ones, the misfits, the rebels, the troublemakers, "
    "the round pegs in the square holes. "
    "The ones who see things differently — they’re not fond of rules. "
    "You can quote them, disagree with them, glorify or vilify them, "
    "but the only thing you can’t do is ignore them because they change things. "
    "They push the human race forward, and while some may see them as the crazy ones, "
    "we see genius, because the ones who are crazy enough to think that they can change the world, "
    "are the ones who do"
)

# Short English quote.
text2 = (
    "I choose a lazy person to do a hard job. "
    "Because a lazy person will find an easy way to do it."
)

# Spanish multi-line text; line breaks (and the trailing space on the first
# line) are spelled out explicitly so they survive editor whitespace trimming.
text_spanish = (
    "Por los locos. Los marginados. Los rebeldes. Los problematicos. \n"
    "Los inadaptados. Los que ven las cosas de una manera distinta. A los que no les gustan\n"
    "las reglas. Y a los que no respetan el “status quo”. Puedes citarlos, discrepar de ellos,\n"
    "ensalzarlos o vilipendiarlos. Pero lo que no puedes hacer es ignorarlos… Porque ellos\n"
    "cambian las cosas, empujan hacia adelante la raza humana y, aunque algunos puedan\n"
    "considerarlos locos, nosotros vemos en ellos a genios. Porque las personas que están\n"
    "lo bastante locas como para creer que pueden cambiar el mundo, son las que lo logran."
)

## The simplest way: split

In [6]:
# With no separator, str.split cuts on runs of whitespace only, so punctuation
# stays attached to the neighbouring word (e.g. 'ones,' and 'misfits,').
tokens = text1.split(sep=None)
tokens[:10]

['Here’s',
 'to',
 'the',
 'crazy',
 'ones,',
 'the',
 'misfits,',
 'the',
 'rebels,',
 'the']

##  NLTK

In [8]:
from nltk.tokenize import word_tokenize

# NLTK's word tokenizer separates punctuation into its own tokens; the language
# is spelled out here (it is the default) because text1 is English.
tokens = word_tokenize(text1, language="english")
tokens[:10]

['Here', '’', 's', 'to', 'the', 'crazy', 'ones', ',', 'the', 'misfits']

## sklearn

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Bag-of-words: learn the vocabulary of text1 and count each term's occurrences.
cv = CountVectorizer()
tokens = cv.fit_transform([text1])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported way to read the learned vocabulary.
pd.DataFrame.sparse.from_spmatrix(tokens, columns=cv.get_feature_names_out())

Unnamed: 0,and,are,as,because,but,can,change,crazy,differently,disagree,...,think,to,troublemakers,vilify,we,while,who,with,world,you
0,1,2,1,2,1,3,2,3,1,1,...,1,2,1,1,1,1,3,1,1,2


In [20]:
# TF-IDF weighting of the same single document. With only one document the
# weights come out proportional to the raw counts (compare the count table).
tfidf = TfidfVectorizer()
tf_tokens = tfidf.fit_transform([text1])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
pd.DataFrame.sparse.from_spmatrix(tf_tokens, columns=tfidf.get_feature_names_out())

Unnamed: 0,and,are,as,because,but,can,change,crazy,differently,disagree,...,think,to,troublemakers,vilify,we,while,who,with,world,you
0,0.054233,0.108465,0.054233,0.108465,0.054233,0.162698,0.108465,0.162698,0.054233,0.054233,...,0.054233,0.108465,0.054233,0.054233,0.054233,0.054233,0.162698,0.054233,0.054233,0.108465


In [21]:
# Two-document corpus: one row per author/quote pair.
df = pd.DataFrame({'author':[author1, author2], 'text':[text1, text2]})
# stop_words='english' drops common English function words from the vocabulary.
cv = CountVectorizer(stop_words='english')
tokens = cv.fit_transform(df['text'])
# Reuse df's index so each count row lines up with its source document.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
pd.DataFrame.sparse.from_spmatrix(tokens, index=df.index, columns=cv.get_feature_names_out())

Unnamed: 0,change,choose,crazy,differently,disagree,easy,fond,forward,genius,glorify,...,round,rules,square,thing,things,think,troublemakers,vilify,way,world
0,2,0,3,1,1,0,1,1,1,1,...,1,1,1,1,2,1,1,1,0,1
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## spaCy

In [26]:
# Optional one-time setup: uncomment to install spaCy if it is missing.
# Prefer the %pip magic over !pip so the package lands in the kernel's environment.
#%pip install spacy

In [29]:
from spacy.lang.es import Spanish

# Build a pipeline from the Spanish language class and run it over the
# Spanish sample text.
nlp = Spanish()
doc = nlp(text_spanish)

# Keep only the surface string of each token in the processed document.
tokens = [tok.text for tok in doc]
tokens[:10]

['Por', 'los', 'locos', '.', 'Los', 'marginados', '.', 'Los', 'rebeldes', '.']

## Gensim

In [30]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 6.6 MB/s eta 0:00:01:04
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [24]:
from gensim.utils import tokenize

# tokenize() returns an iterable of tokens (hence the list() call); preview only
# the first ten, like the other sections, instead of dumping the whole list.
list(tokenize(text1))[:10]

ModuleNotFoundError: No module named 'gensim'