<big><big><big><b>NLP Lecture

In [43]:
import nltk
import pandas as pd

# Text Preprocessing

In [44]:
text = '999 Football is my passion. I love to play with footballs!! Who else loves football?'
text

'999 Football is my passion. I love to play with footballs!! Who else loves football?'

## Lowercase

In [45]:
text = text.lower() 
text

'999 football is my passion. i love to play with footballs!! who else loves football?'

## Numbers

In [46]:
text = ''.join([letter for letter in text if not letter.isdigit()])

In [47]:
text

' football is my passion. i love to play with footballs!! who else loves football?'

## Punctuation

In [48]:
import string 

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [49]:
for punctuation in string.punctuation:
    text = text.replace(punctuation, '') 

text

' football is my passion i love to play with footballs who else loves football'

## Tokenize

In [50]:
text

' football is my passion i love to play with footballs who else loves football'

In [51]:
from nltk.tokenize import word_tokenize

word_tokens = word_tokenize(text) 

word_tokens

['football',
 'is',
 'my',
 'passion',
 'i',
 'love',
 'to',
 'play',
 'with',
 'footballs',
 'who',
 'else',
 'loves',
 'football']

## Stop-Words

In [52]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 
stop_words


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [53]:
text = [w for w in word_tokens if not w in stop_words] 
  
text

['football',
 'passion',
 'love',
 'play',
 'footballs',
 'else',
 'loves',
 'football']

## Stemming & Lemmatizing

### Setemming

In [54]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

stemmed = [stemmer.stem(word) for word in text]

stemmed

['footbal', 'passion', 'love', 'play', 'footbal', 'els', 'love', 'footbal']

### Lemmatizing

In [55]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(word) for word in text]

text = lemmatized

lemmatized

['football', 'passion', 'love', 'play', 'football', 'else', 'love', 'football']

In [56]:
lemmatizer.lemmatize("hanging","v")

'hang'

# Vectorizing

In [57]:
texts = ['i love football',
         'football is a game i love',
        'football football football']

## Bag of words

Use _sklearn.feature_extraction.text.<mark><b>CountVectorizer</b></mark>_

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

vect_bow = CountVectorizer()

X = vect_bow.fit_transform(texts).toarray() # CountVectorize take in input a list of texts 


In [63]:
vect_bow.get_feature_names()

['football', 'game', 'is', 'love']

In [64]:
import pandas as pd
pd.DataFrame(X,columns=vect_bow.get_feature_names())

Unnamed: 0,football,game,is,love
0,1,0,0,1
1,1,1,1,1
2,3,0,0,0


## Tf-Idf representation

In [65]:
texts

['i love football', 'football is a game i love', 'football football football']

Use _sklearn.feature_extraction.text.<mark><b>TfidfVectorizer</b></mark>_

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ['i love football',
         'football is a game i love',
        'football football football']

tf_idf_vectorizer = TfidfVectorizer()

X = tf_idf_vectorizer.fit_transform(texts) # 

X

<3x4 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

To Vizualize The data we can use `.toarray`. But as input to model it is better to keep the data <mark><b>sparsed</b></mark>

In [71]:
pd.DataFrame(X.toarray(),columns=tf_idf_vectorizer.get_feature_names())

Unnamed: 0,football,game,is,love
0,0.613356,0.0,0.0,0.789807
1,0.345205,0.584483,0.584483,0.444514
2,1.0,0.0,0.0,0.0


### `max_df`

Used to exclude <mark>"corpus specific stopwords"</mark>, words that are `very frequent` in the dataset

In [72]:
texts

['i love football', 'football is a game i love', 'football football football']

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(max_df = 0.8)

X = tf_idf_vectorizer.fit_transform(texts)

X.toarray()

pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,game,is,love
0,0.0,0.0,1.0
1,0.622766,0.622766,0.47363
2,0.0,0.0,0.0


<big>👉 Particularly useful to remove words that are so frequent they have little predictive power.

Example: When classifying texts into topics Basketball or football, the word "ball" will appear often, but won't be useful to predict one or the other.

### `min_df`

Used to <mark>exclude words that are very infrequent</mark> in the dataset. 

In [74]:
texts

['i love football', 'football is a game i love', 'football football football']

In [75]:
tf_idf_vectorizer = TfidfVectorizer(min_df = 0.5)

X = tf_idf_vectorizer.fit_transform(texts)

X.toarray()

pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,football,love
0,0.613356,0.789807
1,0.613356,0.789807
2,1.0,0.0


<big>👉 Particularly useful to remove typos or text anomalies missed during preprocessing.

### `max_features`

Used to specify the <mark>number of features to keep</mark> when vectorizing. It will retain the top features according to count or tf-idf score.

In [76]:
texts

['i love football', 'football is a game i love', 'football football football']

In [77]:
tf_idf_vectorizer = TfidfVectorizer(max_df = 0.8)

X = tf_idf_vectorizer.fit_transform(texts)

X.toarray()

pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,game,is,love
0,0.0,0.0,1.0
1,0.622766,0.622766,0.47363
2,0.0,0.0,0.0


<big>👉 Particularly useful to reduce the dimension of the data.

## N-Gram representation

In [78]:
texts =  ['i do not love football',
         'i love football not basketball']

In [79]:
tf_idf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

X = tf_idf_vectorizer.fit_transform(texts)

X.toarray()

pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,do not,football not,love football,not basketball,not love
0,0.631667,0.0,0.449436,0.0,0.631667
1,0.0,0.631667,0.449436,0.631667,0.0


## Feature engineering

### Vocabulary Richness

In [80]:
data = pd.DataFrame(['i do not love football',
                     'i love football not basketball',
                    'Football football FootBall'],columns=['text'])
data

Unnamed: 0,text
0,i do not love football
1,i love football not basketball
2,Football football FootBall


In [81]:
def vocab_richness(text):
    text = text.lower()
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    return unique_word_length/total_length

#data['vocab richness'] = data.text.apply(vocab_richness)

data

Unnamed: 0,text
0,i do not love football
1,i love football not basketball
2,Football football FootBall


# (Multinomial) Naive Bayes Algorithm

In [82]:
data = pd.read_csv("emails.csv")
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Modelling Implementation

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(data.text)

y = data.spam

nb_model = MultinomialNB()

nb_model.fit(X,y)

nb_model.score(X,y)

0.9071229050279329

## Tuning vectorizer and model simultanously

In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1),}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data.text,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1),
                         'tfidf__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [85]:
grid_search.best_params_

{'nb__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}

In [86]:
grid_search.best_score_

0.9881289771904556

## Combining vectorizer output and engineered features

In [87]:
data['vocab_richness'] = data.text.apply(vocab_richness)

data.head()

Unnamed: 0,text,spam,vocab_richness
0,Subject: naturally irresistible your corporate...,1,0.427692
1,Subject: the stock trading gunslinger fanny i...,1,0.733333
2,Subject: unbelievable new homes made easy im ...,1,0.806818
3,Subject: 4 color printing special request add...,1,0.59596
4,"Subject: do not have money , get software cds ...",1,0.792453


In [88]:
from sklearn.compose import ColumnTransformer

column_trans = ColumnTransformer([('vec', CountVectorizer(), 'text')]
                                 , remainder='passthrough')

X_combined = column_trans.fit_transform(data[['text','vocab_richness']])

# Latent Dirichlet Allocation

In [89]:
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer().fit(data['text'])

data_vectorized = vectorizer.transform(data['text'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('95', 12.792048567001473), ('save', 9.245107638642253), ('andmanyother', 8.629491354823395), ('localized', 6.996871103699615), ('worldwide', 6.624492528316986), ('over', 6.606029188783462), ('oniine', 6.345210356476206), ('total', 6.193714127809999), ('nice', 5.805118320372363), ('mx', 5.7851711549598095)]
Topic 1:
[('the', 520.4368076760244), ('to', 466.72234261522254), ('and', 319.4884528513079), ('you', 291.7948850733383), ('of', 276.17145586075304), ('ect', 240.6488094654753), ('for', 227.45001056459006), ('in', 227.30945650197177), ('enron', 218.06561602113126), ('your', 188.58332206421363)]


In [90]:
example = ["rice var congratulations save upenn"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.17059808968716306
topic 1 : 0.8294019103128369
