# Experiment 03

In [None]:
!pip install nltk



### Text

In [None]:
# Sample passage about stop lists; it is tokenised and filtered in later cells.
text = (
    'The general trend in IR systems over time has been from standard use '
    'of quite large stop lists (200-300 terms) to very small stop lists '
    '(7-12 terms) to no stop list whatsoever. '
    'Web search engines generally do not use stop lists.'
)

In [None]:
# Bare expression: echo the passage as this cell's output.
text

'The general trend in IR systems over time has been from standard use of quite large stop lists (200-300 terms) to very small stop lists (7-12 terms) to no stop list whatsoever. Web search engines generally do not use stop lists.'

### Stopwords

In [None]:
from nltk.corpus import stopwords

In [None]:
import nltk

# Fetch the NLTK data packages this notebook relies on:
#   stopwords -> English stop-word list
#   punkt     -> tokenizer models read by older NLTK releases
#   punkt_tab -> tokenizer tables read by recent NLTK releases; without it
#                word_tokenize raises LookupError on a fresh install
#   wordnet   -> dictionary used by WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# Kept last so the cell still ends on this call's return value.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# English stop words from the NLTK stopwords corpus (downloaded in an earlier cell).
stop_words = stopwords.words('english')

In [None]:
from nltk.tokenize import word_tokenize
# Tokenise the passage; per the recorded output, punctuation such as '(' and '.'
# becomes its own token.
words = word_tokenize(text)

#### Applying stop words

In [None]:
# Build the lookup set once, outside the loop, so each membership test is a
# constant-time hash lookup and the set is not rebuilt for every token.
stop_set = set(stop_words)

# Keep every token that is not a stop word, preserving the original order.
# NOTE: the match is exact and case-sensitive, so a capitalised token such as
# 'The' is kept, and punctuation tokens are kept as well.
holder = []
for w in words:
    if w not in stop_set:
        holder.append(w)

In [None]:
# Bare expression: display the tokens that survived stop-word filtering.
holder

['The',
 'general',
 'trend',
 'IR',
 'systems',
 'time',
 'standard',
 'use',
 'quite',
 'large',
 'stop',
 'lists',
 '(',
 '200-300',
 'terms',
 ')',
 'small',
 'stop',
 'lists',
 '(',
 '7-12',
 'terms',
 ')',
 'stop',
 'list',
 'whatsoever',
 '.',
 'Web',
 'search',
 'engines',
 'generally',
 'use',
 'stop',
 'lists',
 '.']

#### List Comprehension for stop words

In [None]:
# Same filter as the loop version, written as a list comprehension.
# The set is built once up front, not once per token inside the condition.
stop_set = set(stop_words)
holder = [w for w in words if w not in stop_set]
print(holder)

['The', 'general', 'trend', 'IR', 'systems', 'time', 'standard', 'use', 'quite', 'large', 'stop', 'lists', '(', '200-300', 'terms', ')', 'small', 'stop', 'lists', '(', '7-12', 'terms', ')', 'stop', 'list', 'whatsoever', '.', 'Web', 'search', 'engines', 'generally', 'use', 'stop', 'lists', '.']


### Stemming

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [None]:
# One instance of each stemmer so their results can be compared in the cells below.
porter = PorterStemmer()
snow = SnowballStemmer(language = 'english')
lancaster = LancasterStemmer()

In [None]:
# Inflected forms of "play" used to compare the stemmers and the lemmatizer.
# Note: this rebinds `words`, which previously held the tokenised passage.
words = [
    'play',
    'plays',
    'played',
    'playing',
    'player',
]

#### Porter Stemmer

In [None]:
# Explicit-loop version: stem each word with the Porter stemmer, in order.
porter_stemmed = []
for word in words:
    porter_stemmed.append(porter.stem(word))

In [None]:
# Bare expression: display the Porter stems.
porter_stemmed

['play', 'play', 'play', 'play', 'player']

#### Porter Stemmer List Comprehension

In [None]:
# Comprehension version of the Porter loop; produces the same list.
porter_stemmed = [porter.stem(word) for word in words]
print(porter_stemmed)

['play', 'play', 'play', 'play', 'player']


#### Snowball Stemmer

In [None]:
# Explicit-loop version: stem each word with the Snowball stemmer, in order.
snow_stemmed = []
for word in words:
    snow_stemmed.append(snow.stem(word))

In [None]:
# Bare expression: display the Snowball stems.
snow_stemmed

['play', 'play', 'play', 'play', 'player']

#### Snowball Stemmer List Comprehension

In [None]:
# Comprehension version of the Snowball loop; produces the same list.
snow_stemmed = [snow.stem(word) for word in words]
print(snow_stemmed)

['play', 'play', 'play', 'play', 'player']


#### Lancaster Stemmer

In [None]:
# Explicit-loop version: stem each word with the Lancaster stemmer, in order.
lancaster_stemmed = []
for word in words:
    lancaster_stemmed.append(lancaster.stem(word))

In [None]:
# Bare expression: display the Lancaster stems. In the recorded output
# 'player' is reduced to 'play', unlike the Porter and Snowball results.
lancaster_stemmed

['play', 'play', 'play', 'play', 'play']

#### Lancaster Stemmer List Comprehension

In [None]:
# Comprehension version of the Lancaster loop; produces the same list.
lancaster_stemmed = [lancaster.stem(word) for word in words]
print(lancaster_stemmed)

['play', 'play', 'play', 'play', 'play']


### Lemmatization: dictionary-based, so it draws on a more expansive vocabulary than rule-based stemming

In [None]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance; relies on the 'wordnet' data downloaded in an earlier cell.
wordnet = WordNetLemmatizer()

In [None]:
# Lemmatize each word with default settings.
# NOTE(review): the recorded output leaves 'played' and 'playing' unchanged,
# presumably because lemmatize() treats words as nouns unless a part-of-speech
# is passed (e.g. pos='v') - confirm against the NLTK docs.
lemmatized = [wordnet.lemmatize(word) for word in words]

In [None]:
# Bare expression: display the lemmas for comparison with the stemmer outputs.
lemmatized

['play', 'play', 'played', 'playing', 'player']