# NLP: Text Preprocessing with NLTK

In [55]:
# Sample input for the walkthrough. Note that "Don’t" is written with a typographic
# apostrophe (U+2019) rather than ASCII "'", which matters for tokenization later.
text = "You can do this. Don’t tell people your plans. Show them your results. No pressure, no diamonds. Try again. Fail again. Fail better. Win strong."

In [56]:
# Normalise case up front and rebind `text`; every later cell works on this
# lower-cased version. The bare name on the last line displays the result.
text = text.lower()
text

'you can do this. don’t tell people your plans. show them your results. no pressure, no diamonds. try again. fail again. fail better. win strong.'

## 1. Imports

In [57]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

## 2. Tokenize

In [58]:
# sent_tokenize / word_tokenize rely on NLTK's Punkt models, which no other cell
# in this notebook downloads. Fetch them here so the notebook also runs on a
# fresh environment. Newer NLTK releases look for 'punkt_tab', older ones for
# 'punkt'; requesting both is harmless. quiet=True keeps the cell output clean.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Split the lower-cased text into a list of sentence strings.
sent_tokens = sent_tokenize(text)
sent_tokens

['you can do this.',
 'don’t tell people your plans.',
 'show them your results.',
 'no pressure, no diamonds.',
 'try again.',
 'fail again.',
 'fail better.',
 'win strong.']

In [59]:
# Word-level tokenization of the whole text; punctuation marks become their own tokens.
# As the output shows, the typographic apostrophe in "don’t" is split out as a
# separate token, yielding 'don', '’', 't'.
word_tokens = word_tokenize(text)
word_tokens

['you',
 'can',
 'do',
 'this',
 '.',
 'don',
 '’',
 't',
 'tell',
 'people',
 'your',
 'plans',
 '.',
 'show',
 'them',
 'your',
 'results',
 '.',
 'no',
 'pressure',
 ',',
 'no',
 'diamonds',
 '.',
 'try',
 'again',
 '.',
 'fail',
 'again',
 '.',
 'fail',
 'better',
 '.',
 'win',
 'strong',
 '.']

## 3. Filtering Stopwords

In [60]:
# Download the NLTK 'stopwords' package needed by the stop-word filter; returns True on success.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Dip-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
from nltk.corpus import stopwords

In [62]:
# Hold the English stop-word list as a set so the membership tests in the
# filtering cells are constant-time lookups.
stop_words = set(stopwords.words('english'))

In [63]:
# Inspect the stop-word set (this displays the entire set, which is long).
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [67]:
# Keep only content words from the word-level tokens:
#   * drop English stop words (compared case-insensitively via casefold), and
#   * drop any token with no letter or digit at all, i.e. pure punctuation.
# The alphanumeric test is used instead of a hand-written punctuation list so
# that marks such as '!', '?' or quote characters are removed as well, not just
# the ones that happen to occur in this sample text.
filtered_word_token = [
    word
    for word in word_tokens
    if word.casefold() not in stop_words and any(ch.isalnum() for ch in word)
]
filtered_word_token

['tell',
 'people',
 'plans',
 'show',
 'results',
 'pressure',
 'diamonds',
 'try',
 'fail',
 'fail',
 'better',
 'win',
 'strong']

In [68]:
# Per-sentence version of the stop-word filter: tokenize each sentence, then keep
# tokens that are not English stop words and that contain at least one letter or
# digit (pure-punctuation tokens are dropped, whatever the punctuation mark is).
# A sentence consisting only of stop words and punctuation yields an empty list.
filtered_sent_token = [
    [
        word
        for word in word_tokenize(sentence)
        if word.casefold() not in stop_words and any(ch.isalnum() for ch in word)
    ]
    for sentence in sent_tokens
]
filtered_sent_token

[[],
 ['tell', 'people', 'plans'],
 ['show', 'results'],
 ['pressure', 'diamonds'],
 ['try'],
 ['fail'],
 ['fail', 'better'],
 ['win', 'strong']]

## 4. Stemming & Lemmatization

In [69]:
# Download the 'wordnet' and 'omw-1.4' NLTK packages ahead of the lemmatization
# cells below, which use WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to C:\Users\Dip-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Dip-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [71]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [72]:
# One instance of each normaliser, reused by the stemming and lemmatization cells below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [76]:
# Porter-stem every stop-word-filtered token. Stems are truncated forms and need
# not be dictionary words (e.g. 'people' -> 'peopl').
stemmed_word_token = list(map(stemmer.stem, filtered_word_token))
stemmed_word_token

['tell',
 'peopl',
 'plan',
 'show',
 'result',
 'pressur',
 'diamond',
 'tri',
 'fail',
 'fail',
 'better',
 'win',
 'strong']

In [80]:
# Lemmatize every stop-word-filtered token using the lemmatizer's default settings.
lemmatized_word_token = list(map(lemmatizer.lemmatize, filtered_word_token))
lemmatized_word_token

['tell',
 'people',
 'plan',
 'show',
 'result',
 'pressure',
 'diamond',
 'try',
 'fail',
 'fail',
 'better',
 'win',
 'strong']

In [81]:
# Lemmatize sentence by sentence, keeping the nested list-of-lists layout of
# filtered_sent_token (one inner list per sentence, possibly empty).
lemmatize_sent_tokens = [
    list(map(lemmatizer.lemmatize, sentence_words))
    for sentence_words in filtered_sent_token
]
lemmatize_sent_tokens

[[],
 ['tell', 'people', 'plan'],
 ['show', 'result'],
 ['pressure', 'diamond'],
 ['try'],
 ['fail'],
 ['fail', 'better'],
 ['win', 'strong']]