<a href="https://colab.research.google.com/github/nam4dev/nlp_demonstration/blob/master/tokenization_and_lemmatization_demonstration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Import spaCy library
import spacy

In [0]:
# Load the English model package en_core_web_sm and keep the pipeline as `nlp`
nlp = spacy.load(name='en_core_web_sm')

In [0]:
# Sample sentence used for the tokenization and lemmatization demo
test_string = "Hi, I'm giving some attention to the NLP course :)"

# Run the pipeline on the sentence to obtain a Doc instance
doc = nlp(text=test_string)

In [0]:
# Collect the verbatim text of every token in the Doc
tokens = list(tok.text for tok in doc)

print(tokens)

In [0]:
# Collect the lemma of every token in the Doc
lemmas = list(tok.lemma_ for tok in doc)

print(lemmas)

In [0]:
# Import data manipulation library: pandas
import pandas as pd
# Import the libraries needed for scraping Twitter
import bs4
import json
import requests

In [0]:
# Fetch the profile page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors instead
# of silently parsing an error page.
response = requests.get('https://twitter.com/tweeter?lang=fr', timeout=30)
response.raise_for_status()
html_doc = response.text
soup = bs4.BeautifulSoup(html_doc, 'html.parser')

# Keep every non-blank, whitespace-stripped line of text found inside
# <p class="tweet-text"> elements. If the page contains no such element,
# `lines` stays empty and the DataFrame below has zero rows.
lines = [
    stripped
    for tweet in soup.find_all('p', class_='tweet-text')
    for stripped in (line.strip() for line in tweet.text.split('\n'))
    if stripped
]

# ensure_ascii=False prints accented characters as-is rather than as
# \uXXXX escapes, which keeps non-English tweets readable.
print('Fetched lines', json.dumps(lines, indent=4, ensure_ascii=False))
print()

df = pd.DataFrame(data={"tweets": lines})

print(df.head())
print()

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line.
df.info()

In [0]:
# Import the English stop-word set explicitly. Reaching it through
# `spacy.lang.en.stop_words` attribute access only works once that
# sub-package has already been imported somewhere else, which makes the
# cell depend on hidden state.
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# Function to preprocess text
def preprocess(text):
    """Return `text` reduced to its alphabetic, non-stop-word lemmas.

    The text is run through the module-level `nlp` pipeline with the 'ner'
    and 'parser' components disabled, every token is replaced by its lemma,
    and lemmas that are not purely alphabetic or that match an entry of the
    module-level `stopwords` collection are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no lemma survives the filters).
    """
    # Create Doc object; only the lemmas are read afterwards, so the 'ner'
    # and 'parser' components are switched off for this call.
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters. The stop-word lookup is
    # done on the lowercased lemma: the stop-word entries are lowercase, so a
    # capitalised lemma such as "I" would otherwise slip through the filter.
    a_lemmas = [lemma for lemma in lemmas
                if lemma.isalpha() and lemma.lower() not in stopwords]

    return ' '.join(a_lemmas)
  
# Run preprocess over every entry of df['tweets'] and store the cleaned
# text in a new df['transcript'] column, then show that column
cleaned_tweets = df['tweets'].apply(preprocess)
df['transcript'] = cleaned_tweets
print(df['transcript'])