# Natural Language Processing
## Data preprocessing

In [8]:
import string

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
str1 = "I love bubble tea! OMG so #tasty @channel XOXO @$ ^_^ "
for p in string.punctuation:
    str1 = str1.replace(p, '') 
str1

'I love bubble tea OMG so tasty channel XOXO   '

In [12]:
def basic_cleaning(sentence):
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') 
    
    sentence = sentence.strip()
    
    return sentence

In [13]:
sentences = ["   I LOVE Pizza 999 @^_^", 
             "  Le Wagon is amazing, take care - 666"]

In [14]:
cleaned_sentences = [basic_cleaning(sentence) for sentence in sentences]
cleaned_sentences

['i love pizza', 'le wagon is amazing take care']

In [15]:
import re

text = """<head><body>Hello Le Wagon!</body></head>"""
cleaned_text = re.sub('<[^<]+?>','', text)

print (cleaned_text)

Hello Le Wagon!


In [16]:
txt = 'This is a random text, authored by darkvador@gmail.com and batman@outlook.com, WOW!'

re.findall('[\w.+-]+@[\w-]+\.[\w.-]+', txt)

['darkvador@gmail.com', 'batman@outlook.com']

## Cleaning with NLTK

In [23]:
text = "It is during our darkest moments that we must focus to see the light"
text

'It is during our darkest moments that we must focus to see the light'

In [24]:
# ! pip install -U nltk

In [25]:
from nltk.tokenize import word_tokenize

word_tokens = word_tokenize(text)
word_tokens

['It',
 'is',
 'during',
 'our',
 'darkest',
 'moments',
 'that',
 'we',
 'must',
 'focus',
 'to',
 'see',
 'the',
 'light']

In [26]:
from nltk.corpus import stopwords 

stop_words = set(stopwords.words('english')) # you can also choose other languages

In [27]:
tokens = ["i", "am", "going", "to", "go", "to", "the", "club", "and","party", "all", "night", "long"]

In [28]:
stopwords_removed = [w for w in tokens if w in stop_words] 
stopwords_removed

['i', 'am', 'to', 'to', 'the', 'and', 'all']

In [29]:
tokens_cleaned = [w for w in tokens if not w in stop_words] 
tokens_cleaned

['going', 'go', 'club', 'party', 'night', 'long']

In [30]:
sentence = 'He was RUNNING and EATING at the same time =[. He has a bad habit of swimming after playing 3 hours in the Sun =/'

In [31]:
cleaned_sentence = basic_cleaning(sentence)
cleaned_sentence

'he was running and eating at the same time  he has a bad habit of swimming after playing  hours in the sun'

In [32]:
tokenized_sentence = word_tokenize(cleaned_sentence)
tokenized_sentence

['he',
 'was',
 'running',
 'and',
 'eating',
 'at',
 'the',
 'same',
 'time',
 'he',
 'has',
 'a',
 'bad',
 'habit',
 'of',
 'swimming',
 'after',
 'playing',
 'hours',
 'in',
 'the',
 'sun']

In [33]:
tokenized_sentence_no_stopword = [w for w in tokenized_sentence if not w in stop_words] 
tokenized_sentence_no_stopword

['running',
 'eating',
 'time',
 'bad',
 'habit',
 'swimming',
 'playing',
 'hours',
 'sun']

In [38]:
from nltk.stem.wordnet import WordNetLemmatizer

In [40]:
verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_no_stopword]

# 2 - Lemmatizing the nouns
noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in verb_lemmatized]

In [41]:
original_vs_lemmatized.style.hide_index()

NameError: name 'original_vs_lemmatized' is not defined

In [42]:
texts = ['the young dog is running with the cat',
         'running is good for your health',
         'your cat is young',
         'young young young young young cat cat cat']

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(texts) # X is a sparse matrix
X.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
       [3, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0]])

In [44]:
count_vectorizer.get_feature_names_out()

array(['cat', 'dog', 'for', 'good', 'health', 'is', 'running', 'the',
       'with', 'young', 'your'], dtype=object)

In [46]:
import pandas as pd

vectorized_texts = pd.DataFrame(
    X.toarray(), 
    columns = count_vectorizer.get_feature_names_out(),
    index = texts
    )

vectorized_texts

Unnamed: 0,cat,dog,for,good,health,is,running,the,with,young,your
the young dog is running with the cat,1,1,0,0,0,1,1,2,1,1,0
running is good for your health,0,0,1,1,1,1,1,0,0,0,1
your cat is young,1,0,0,0,0,1,0,0,0,1,1
young young young young young cat cat cat,3,0,0,0,0,0,0,0,0,5,0


In [47]:
texts

['the young dog is running with the cat',
 'running is good for your health',
 'your cat is young',
 'young young young young young cat cat cat']

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()

# Training it on the texts
weighted_words = pd.DataFrame(
    tf_idf_vectorizer.fit_transform(texts).toarray(),
    columns = tf_idf_vectorizer.get_feature_names_out(),
    index=texts)

weighted_words

Unnamed: 0,cat,dog,for,good,health,is,running,the,with,young,your
the young dog is running with the cat,0.227904,0.357056,0.0,0.0,0.0,0.227904,0.281507,0.714112,0.357056,0.227904,0.0
running is good for your health,0.0,0.0,0.463709,0.463709,0.463709,0.29598,0.365594,0.0,0.0,0.0,0.365594
your cat is young,0.470063,0.0,0.0,0.0,0.0,0.470063,0.0,0.0,0.0,0.470063,0.580622
young young young young young cat cat cat,0.514496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.0


In [52]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(min_df=0.5)

# Training it on the texts
weighted_words = pd.DataFrame(
    tf_idf_vectorizer.fit_transform(texts).toarray(),
    columns = tf_idf_vectorizer.get_feature_names_out(),
    index=texts)

weighted_words

Unnamed: 0,cat,is,running,young,your
the young dog is running with the cat,0.470063,0.470063,0.580622,0.470063,0.0
running is good for your health,0.0,0.496816,0.613667,0.0,0.613667
your cat is young,0.470063,0.470063,0.0,0.470063,0.580622
young young young young young cat cat cat,0.514496,0.0,0.0,0.857493,0.0


In [55]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(max_df=0.5)

# Training it on the texts
weighted_words = pd.DataFrame(
    tf_idf_vectorizer.fit_transform(texts).toarray(),
    columns = tf_idf_vectorizer.get_feature_names_out(),
    index=texts)

weighted_words

Unnamed: 0,dog,for,good,health,running,the,with,your
the young dog is running with the cat,0.388614,0.0,0.0,0.0,0.306388,0.777229,0.388614,0.0
running is good for your health,0.0,0.485461,0.485461,0.485461,0.382743,0.0,0.0,0.382743
your cat is young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
young young young young young cat cat cat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# CountVectorizer with the 3 most frequent words
count_vectorizer = CountVectorizer(max_features = 3)

X = count_vectorizer.fit_transform(texts)
X = pd.DataFrame(
    X.toarray(),
     columns = count_vectorizer.get_feature_names_out(),
     index = texts
)

X

Unnamed: 0,cat,is,young
the young dog is running with the cat,1,1,1
running is good for your health,0,1,0
your cat is young,1,1,1
young young young young young cat cat cat,3,0,5


## `N-gram`

In [57]:
actors_movie = ["I like the movie but NOT the actors",
                "I like the actors but NOT the movie"]

In [59]:
# Vectorize the sentences
count_vectorizer = CountVectorizer()
actors_movie_vectorized = count_vectorizer.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized = pd.DataFrame(
    actors_movie_vectorized.toarray(),
    columns = count_vectorizer.get_feature_names_out(),
    index = actors_movie)

# Show the vectorized movies
actors_movie_vectorized

Unnamed: 0,actors,but,like,movie,not,the
I like the movie but NOT the actors,1,1,1,1,1,2
I like the actors but NOT the movie,1,1,1,1,1,2


In [60]:
# Vectorize the sentences
count_vectorizer_n_gram = CountVectorizer(ngram_range = (2,2)) # BI-GRAMS
actors_movie_vectorized_n_gram = count_vectorizer_n_gram.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized_n_gram = pd.DataFrame(
    actors_movie_vectorized_n_gram.toarray(),
    columns = count_vectorizer_n_gram.get_feature_names_out(),
    index = actors_movie
    )

# Show the vectorized movies with bigrams
actors_movie_vectorized_n_gram

Unnamed: 0,actors but,but not,like the,movie but,not the,the actors,the movie
I like the movie but NOT the actors,0,1,1,1,1,1,1
I like the actors but NOT the movie,1,1,1,0,1,1,1


In [None]:
import pandas as pd

data = pd.read_csv("data/emails.csv")
data.head()

In [62]:
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score

In [None]:
# Feature/Target
X = data["text"]
y = data["spam"]

In [None]:
# Pipeline vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
    )

In [None]:
# Cross-validation
cv_results = cross_validate(pipeline_naive_bayes, X, y, cv = 5, scoring = ["recall"])
average_recall = cv_results["test_recall"].mean()
np.round(average_recall,2)

In [64]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define the grid of parameters
parameters = {
    'tfidfvectorizer__ngram_range': ((1,1), (2,2)),
    'multinomialnb__alpha': (0.1,1),
    }

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    scoring = "recall",
    cv = 5,
    n_jobs=-1,
    verbose=1
    )

grid_search.fit(data.text,data.spam)

In [None]:
# Best score
print(f"Best Score = {grid_search.best_score_}")
# Best params
print(f"Best params = {grid_search.best_params_}")

In [67]:
documents = pd.DataFrame(['I like mangos and oranges', 'Frogs and turtles live in ponds', 'Kittens and puppies are fluffy', 'I had a spinach and kiwi smoothie', 'My kitten loves strawberries'], columns=['documents'])
documents

Unnamed: 0,documents
0,I like mangos and oranges
1,Frogs and turtles live in ponds
2,Kittens and puppies are fluffy
3,I had a spinach and kiwi smoothie
4,My kitten loves strawberries


In [68]:
def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [69]:
cleaned_documents = documents["documents"].apply(cleaning)
cleaned_documents.head()

0         like mangos oranges
1      frog turtle live ponds
2       kitten puppies fluffy
3       spinach kiwi smoothie
4    kitten love strawberries
Name: documents, dtype: object

In [70]:
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(cleaned_documents)
vectorized_documents = pd.DataFrame(
    vectorized_documents.toarray(), 
    columns = vectorizer.get_feature_names_out())

vectorized_documents

Unnamed: 0,fluffy,frog,kitten,kiwi,like,live,love,mangos,oranges,ponds,puppies,smoothie,spinach,strawberries,turtle
0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
2,0.614189,0.0,0.495524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.614189,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0
4,0.0,0.0,0.495524,0.0,0.0,0.0,0.614189,0.0,0.0,0.0,0.0,0.0,0.0,0.614189,0.0


In [85]:
from sklearn.decomposition import LatentDirichletAllocation

# Instantiate the LDA 
n_components = 2
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 250)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_documents)

In [86]:
document_topic_mixture = pd.DataFrame(lda_model.transform(vectorized_documents), columns=['top_0', 'top_1'])
document_topic_mixture['original_text'] = documents
document_topic_mixture

Unnamed: 0,top_0,top_1,original_text
0,0.799577,0.200423,I like mangos and oranges
1,0.179884,0.820116,Frogs and turtles live in ponds
2,0.80221,0.19779,Kittens and puppies are fluffy
3,0.19564,0.80436,I had a spinach and kiwi smoothie
4,0.80221,0.19779,My kitten loves strawberries


In [87]:
topic_word_mixture = pd.DataFrame(
    lda_model.components_, 
    columns = vectorizer.get_feature_names_out())
topic_word_mixture

Unnamed: 0,fluffy,frog,kitten,kiwi,like,live,love,mangos,oranges,ponds,puppies,smoothie,spinach,strawberries,turtle
0,1.098877,0.50989,1.474902,0.511485,1.061559,0.50989,1.098879,1.061559,1.061559,0.50989,1.098877,0.511485,0.511485,1.098879,0.50989
1,0.515312,0.99011,0.516145,1.065865,0.515791,0.99011,0.51531,0.515791,0.515791,0.99011,0.515312,1.065865,1.065865,0.51531,0.99011


In [88]:
def print_topics(lda_model, vectorizer, top_words):
    # 1. TOPIC MIXTURE OF WORDS FOR EACH TOPIC
    topic_mixture = pd.DataFrame(lda_model.components_,
                                 columns = vectorizer.get_feature_names_out())
    
    # 2. FINDING THE TOP WORDS FOR EACH TOPIC
    ## Number of topics
    n_components = topic_mixture.shape[0]
    ## Top words for each topic
    for topic in range(n_components):
        print("-"*10)
        print(f"For topic {topic}, here are the the top {top_words} words with weights:")
        topic_df = topic_mixture.iloc[topic]\
                             .sort_values(ascending = True).head(top_words)
        
        print(round(topic_df,3))

In [89]:
print_topics(lda_model, vectorizer, 5)

----------
For topic 0, here are the the top 5 words with weights:
frog      0.510
live      0.510
ponds     0.510
turtle    0.510
kiwi      0.511
Name: 0, dtype: float64
----------
For topic 1, here are the the top 5 words with weights:
love            0.515
strawberries    0.515
fluffy          0.515
puppies         0.515
like            0.516
Name: 1, dtype: float64
