### Stemming

In [1]:
import nltk
from nltk.stem import PorterStemmer

stemmerPorter=PorterStemmer()
stemmerPorter.stem('Lying')

'ly'

In [2]:
from nltk.stem import LancasterStemmer
stemmerLancaster=LancasterStemmer()
stemmerLancaster.stem('watering')

'wat'

In [3]:
from nltk.stem import RegexpStemmer
stemmerRegex=RegexpStemmer('ing')
stemmerRegex.stem('singing')

's'

In [4]:
from nltk.stem import SnowballStemmer
englishSnow=SnowballStemmer('english')
len(SnowballStemmer.languages)

16

In [5]:
frenchStemmer=SnowballStemmer('french')
frenchStemmer.stem('manges')

'mang'

In [6]:
germanStemmer=SnowballStemmer('german')
germanStemmer.stem('sitzt') # actual stem => sitz

'sitzt'

### Performance of various stemmers

Comparing the performance of various stemmers using a ambigous word-list

In [7]:
comp_words=['association','country','organizing','generous','publisher','importer','namely']

In [8]:
[stemmerPorter.stem(i) for i in comp_words] # 3 out of 8

['associ', 'countri', 'organ', 'gener', 'publish', 'import', 'name']

In [9]:
[stemmerLancaster.stem(i) for i in comp_words] # 2 out of 8

['assocy', 'country', 'org', 'gen', 'publ', 'import', 'nam']

In [10]:
[englishSnow.stem(i) for i in comp_words] # 3 out of 8

['associ', 'countri', 'organ', 'generous', 'publish', 'import', 'name']

### Lemmatization

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [12]:
# pos=a for adjective
print("Good -> {}".format(lemmatizer.lemmatize("better",pos='a'))) 
      
# pos=v for verb
print("Solving -> {}".format(lemmatizer.lemmatize("solving",pos='v')))

# pos=n for noun
print("Cacti -> {}".format(lemmatizer.lemmatize("cacti",pos='n')))

Good -> good
Solving -> solve
Cacti -> cactus


### Compare Stemming and Lemmatization

In [13]:
example=""""Jordan in 2011 ,Jordan worked as a child model for several companies 
and brands, including Modell's sporting goods and Toys 'R' Us, before deciding 
to embark on a career as an actor. He launched his career as a professional 
actor in 1999, when he appeared briefly in single episodes of the television 
series Cosby and The Sopranos. His first principal film role followed in 2001 
when he was featured in Hardball, which starred Keanu Reeves."""

In [14]:
tokens=example.split(" ")

In [15]:
stemmedWords=[stemmerPorter.stem(token) for token in tokens]  # list comprehension for stemming words

In [16]:
lemmatizedWords=[lemmatizer.lemmatize(token) for token in tokens]  # list comprehension for lemmatizing words

In [17]:
difference=list(set(lemmatizedWords)-set(stemmedWords))

In [18]:
len(difference) # difference between the stemmed words and lemmatized words

33

## Vectorization


### CountVectoriser

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cVect=CountVectorizer(stop_words="english")

In [20]:
example=example.split(".")[:2]

In [21]:
words=cVect.fit_transform(example)

In [22]:
words.shape # rows and columns of the matrix

(2, 27)

In [23]:
len(cVect.get_feature_names()) # features in the text

27

In [24]:
cVect.vocabulary_ # the list of features and their number

{'jordan': 15,
 '2011': 1,
 'worked': 26,
 'child': 7,
 'model': 17,
 'companies': 8,
 'brands': 4,
 'including': 14,
 'modell': 18,
 'sporting': 23,
 'goods': 13,
 'toys': 25,
 'deciding': 10,
 'embark': 11,
 'career': 6,
 'actor': 2,
 'launched': 16,
 'professional': 19,
 '1999': 0,
 'appeared': 3,
 'briefly': 5,
 'single': 21,
 'episodes': 12,
 'television': 24,
 'series': 20,
 'cosby': 9,
 'sopranos': 22}

### TFIDF Vectorizer

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfVect=TfidfVectorizer(smooth_idf=False,analyzer='word')

In [26]:
words=tfVect.fit_transform(example)

In [27]:
tfVect.vocabulary_

{'jordan': 23,
 'in': 21,
 '2011': 1,
 'worked': 41,
 'as': 6,
 'child': 11,
 'model': 25,
 'for': 17,
 'several': 31,
 'companies': 12,
 'and': 4,
 'brands': 8,
 'including': 22,
 'modell': 26,
 'sporting': 34,
 'goods': 18,
 'toys': 38,
 'us': 39,
 'before': 7,
 'deciding': 14,
 'to': 37,
 'embark': 15,
 'on': 28,
 'career': 10,
 'an': 3,
 'actor': 2,
 'he': 19,
 'launched': 24,
 'his': 20,
 'professional': 29,
 '1999': 0,
 'when': 40,
 'appeared': 5,
 'briefly': 9,
 'single': 32,
 'episodes': 16,
 'of': 27,
 'the': 36,
 'television': 35,
 'series': 30,
 'cosby': 13,
 'sopranos': 33}

In [28]:
print(dict(zip(tfVect.get_feature_names(),tfVect.idf_)))

{'1999': 1.6931471805599454, '2011': 1.6931471805599454, 'actor': 1.0, 'an': 1.6931471805599454, 'and': 1.0, 'appeared': 1.6931471805599454, 'as': 1.0, 'before': 1.6931471805599454, 'brands': 1.6931471805599454, 'briefly': 1.6931471805599454, 'career': 1.0, 'child': 1.6931471805599454, 'companies': 1.6931471805599454, 'cosby': 1.6931471805599454, 'deciding': 1.6931471805599454, 'embark': 1.6931471805599454, 'episodes': 1.6931471805599454, 'for': 1.6931471805599454, 'goods': 1.6931471805599454, 'he': 1.6931471805599454, 'his': 1.6931471805599454, 'in': 1.0, 'including': 1.6931471805599454, 'jordan': 1.6931471805599454, 'launched': 1.6931471805599454, 'model': 1.6931471805599454, 'modell': 1.6931471805599454, 'of': 1.6931471805599454, 'on': 1.6931471805599454, 'professional': 1.6931471805599454, 'series': 1.6931471805599454, 'several': 1.6931471805599454, 'single': 1.6931471805599454, 'sopranos': 1.6931471805599454, 'sporting': 1.6931471805599454, 'television': 1.6931471805599454, 'the':

The words which occur more number of times have more IDF value