In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import LineTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import string
import itertools
import re
from collections import Counter

In [25]:
# https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing

In [29]:
documents = [
    'I love marketing',
    "She doesn't do anything with marketing",
    "SEO is not something we use in marketing",
    "Why are you going away from Paris!",
    "Give me a break please",
    "What is your email? Mines is fabulous@gmail.com"
]

Remove punctuation from the text. However, we should not remove "@" which can undo emai adresses

In [30]:
def normalize_text(text):
    text = text.lower().strip()
    return text


result = list(map(lambda x: normalize_text(x), documents))
print(result)


['i love marketing', "she doesn't do anything with marketing", 'seo is not something we use in marketing', 'why are you going away from paris!', 'give me a break please', 'what is your email? mines is fabulous@gmail.com']


In [31]:
def remove_punctuation(text):
    punctuation = string.punctuation.replace('@', '')
    return text.translate(str.maketrans('', '', punctuation))

result = list(map(lambda x: remove_punctuation(x), result))
print(result)




['i love marketing', 'she doesnt do anything with marketing', 'seo is not something we use in marketing', 'why are you going away from paris', 'give me a break please', 'what is your email mines is fabulous@gmailcom']


Remove all the stop words from the documents

In [34]:
with open('../kryptone/data/stop_words_english.txt', mode='r', encoding='utf-8') as f:
    text = f.read()
    tokenizer = TfidfVectorizer().build_tokenizer()
    stop_words = [tokenizer(word) for word in text.split('\n')]
    # tokenizer = LineTokenizer()
    # stop_words = tokenizer.tokenize(text)
    stop_words = list(itertools.chain(*stop_words))


In [35]:
def remove_stop_words(text):
    tokens = text.split(' ')
    words = [token for token in tokens if token not in stop_words]
    return ' '.join(words)

result = list(map(lambda x: remove_stop_words(x), result))
print(result)

['i love marketing', 'doesnt marketing', 'seo marketing', 'going away paris', 'give a break', 'email mines fabulous@gmailcom']


In [8]:
# stemmer = SnowballStemmer('french')

# For english
stemmer = PorterStemmer()
# For other languages
stemmer = SnowballStemmer('english')

def text_preprocessor(text):
    # Remove special carachters
    text = re.sub('\W', ' ', text)

    # Use the stem of each words
    words = re.split('\s+', text)
    stemmed_words = [stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

Additionnally we can remove most common and rare words from the documents

In [36]:
def remove_most_common_words(text):
    pass

In [37]:
def remove_rare_words(text):
    tokens = text.split(' ')
    counter = Counter(tokens)
    rare_words = counter.most_common()[:-5:-1]
    print(rare_words)
    rare_words = set([rare_word[0] for rare_word in rare_words])
    return ' '.join([token for token in tokens if token not in rare_words])


Create the stems for each words. For example, __walks__ and __walking__ would provide __walk__

In [38]:
def tokenize_and_stem(text):
    tokens = []
    for sentence in sent_tokenize(text):
        for token in word_tokenize(sentence):
            tokens.append(token)

    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    # exclude stopwords from stemmed words
    stems = []
    for token in filtered_tokens:
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems


result = list(map(lambda x: tokenize_and_stem(x), result))
print(result)


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\pendenquej/nltk_data'
    - 'c:\\Users\\pendenquej\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\pendenquej\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\pendenquej\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\pendenquej\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [10]:
vectorizer = TfidfVectorizer(
    # stop_words=stop_words,
    tokenizer=tokenize_and_stem,
    max_features=10,
    max_df=0.8,
    preprocessor=text_preprocessor
)

In [281]:
matrix = vectorizer.fit_transform(result)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Pende/nltk_data'
    - 'c:\\Users\\Pende\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\Pende\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\Pende\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\Pende\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [259]:
matrix.vocabulary_

AttributeError: vocabulary_ not found