In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Read my article on NLP and use this code for better understanding


In [None]:
# In any machine learning task, cleaning or preprocessing the data is as important as model building if not more. And when it comes to unstructured data like text, this process is even more important.

# The objective of this code is to illustrate the various text preprocessing steps with examples.

# Some of the common text preprocessing / cleaning steps are:

# * Lower casing
# * Removal of Punctuations
# * Removal of Stopwords
# * Removal of Frequent words
# * Removal of Rare words
# * Stemming
# * Lemmatization

# These are the different text preprocessing steps we can apply to text data, but we need not apply all of them every time. We need to choose the preprocessing steps carefully based on our use case, since that choice also plays an important role.

# For example, in a sentiment analysis use case, we need not remove emojis or emoticons, as they convey important information about the sentiment. Similarly, we need to decide which steps to apply based on our own use case.

In [None]:
import pandas as pd
import re
import nltk
import spacy
import string
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides pandas and deprecation warnings that could point at real problems.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw tweets; `full_df` keeps every column of the dataset for reference.
full_df = pd.read_csv("/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv")
# Work on an explicit copy of the text column: assigning new columns to a bare
# selection of `full_df` is the pattern pandas flags with SettingWithCopyWarning.
df = full_df[["text"]].copy()
# Force every entry to str so the text helpers always receive strings.
df["text"] = df["text"].astype(str)
full_df.head()

In [None]:
# From here on we only preprocess the tweet text column of the dataset.
df.head()

In [None]:
# 1. Lowercase conversion
# Lowercasing makes later steps treat e.g. "Vaccine" and "vaccine" as the same token.
df["text_lower"] = df["text"].str.lower()
df.head()

In [None]:
# Re-display the full dataset (presumably to show it is unaffected by the column added to `df` — confirm intent).
full_df.head()

In [None]:
# 2. Remove stopwords
# Import stopwords from nltk
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be available locally — confirm for this environment.
from nltk.corpus import stopwords

# Show the English stopword list as one comma-separated string
", ".join(stopwords.words("english"))

In [None]:
# English stopwords held in a set so the membership test in remove_stopwords is a hash lookup.
Stopwords = set(stopwords.words("english"))

def remove_stopwords(text, stop_words=None):
    """Return `text` with stopwords removed.

    Args:
        text: Value to clean; it is coerced with ``str()`` and split on
            whitespace.
        stop_words: Optional container of words to drop. When omitted, the
            notebook-level ``Stopwords`` set is used, which preserves the
            original single-argument behaviour.

    Returns:
        The remaining tokens joined by single spaces. Matching is exact and
        case-sensitive.
    """
    if stop_words is None:
        # Resolved at call time so the default follows the notebook-level set.
        stop_words = Stopwords
    return " ".join(word for word in str(text).split() if word not in stop_words)

# Strip stopwords from the lowercased tweets; the helper is passed directly to apply.
df["text_without_stopwords"] = df["text_lower"].apply(remove_stopwords)
df.head()

In [None]:
# 3. Remove Punctuations

# Characters treated as punctuation: the standard library's ASCII punctuation
# string, so non-ASCII marks (e.g. curly quotes) are not part of this set.
Punctuations = string.punctuation
print(Punctuations)

In [None]:
def remove_punctuations(text, punctuations=None):
    """Return `text` with every punctuation character deleted.

    Args:
        text: Value to clean; coerced with ``str()`` first, matching what
            remove_stopwords and remove_freqwords do with their input.
        punctuations: Optional string of characters to delete. When omitted,
            the notebook-level ``Punctuations`` string is used, which keeps
            the original single-argument behaviour.

    Returns:
        The text with those characters removed. Nothing is inserted in their
        place, so "it's" becomes "its".
    """
    if punctuations is None:
        # Resolved at call time so the default follows the notebook-level string.
        punctuations = Punctuations
    # A translation table with only a delete-set drops each listed character.
    return str(text).translate(str.maketrans("", "", punctuations))

# Delete punctuation from the stopword-free tweets; the helper is passed directly to apply.
df["text_without_punctuations"] = df["text_without_stopwords"].apply(remove_punctuations)
df.head()

In [None]:
# Count word frequencies (used to pick the frequent words to remove)
from collections import Counter

# Tally every whitespace-separated token across the punctuation-free tweets.
Count = Counter()
for text in df["text_without_punctuations"].values:
    # Counter.update adds one count per token in the iterable.
    Count.update(text.split())

# Ten most frequent tokens with their counts.
Count.most_common(10)

In [None]:
# Removal of frequent words

# The 10 most common tokens in the corpus, kept as a set for fast membership tests.
Frequent_Words = {word for word, _ in Count.most_common(10)}
def remove_freqwords(text, frequent_words=None):
    """Return `text` with the most frequent corpus words removed.

    Args:
        text: Value to clean; it is coerced with ``str()`` and split on
            whitespace.
        frequent_words: Optional container of words to drop. When omitted,
            the notebook-level ``Frequent_Words`` set is used, which keeps
            the original single-argument behaviour.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if frequent_words is None:
        # Resolved at call time so the default follows the notebook-level set.
        frequent_words = Frequent_Words
    return " ".join(word for word in str(text).split() if word not in frequent_words)

# Drop the most frequent tokens from the punctuation-free tweets; the helper is passed directly to apply.
df["text_without_stopfreq"] = df["text_without_punctuations"].apply(remove_freqwords)
df.head()

In [None]:
# Stemming with the Snowball stemmer
from nltk.stem.snowball import SnowballStemmer

# NOTE(review): the disabled drop below names columns ("text_wo_stopfreq",
# "text_wo_stopfreqrare") that no visible cell creates; it looks like a leftover — confirm and delete.
#df.drop(["text_wo_stopfreq", "text_wo_stopfreqrare"], axis=1, inplace=True) 
# Single English stemmer instance, reused by stem_words for every row.
stemmer = SnowballStemmer("english")
def stem_words(text):
    """Return `text` with each whitespace-separated token replaced by its stem.

    Tokens are stemmed with the notebook-level `stemmer` and re-joined with
    single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# Stem the tweets that already had stopwords, punctuation and frequent words removed.
df["text_stemmed"] = df["text_without_stopfreq"].apply(stem_words)
df.head()

In [None]:
#  Lemmatization

from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, reused by lemmatize_words for every row.
# NOTE(review): WordNet-based, so it presumably needs the NLTK "wordnet" corpus to be available — confirm.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Return `text` with each whitespace-separated token lemmatized.

    Tokens are passed one at a time to the notebook-level `lemmatizer` and
    re-joined with single spaces.

    NOTE(review): no part-of-speech tag is supplied, so the lemmatizer
    presumably falls back to its default (noun) handling — confirm whether
    verbs and adjectives need POS-aware lemmatization here.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Lemmatize the punctuation-free tweets; the helper is passed directly to apply.
# NOTE(review): this reads "text_without_punctuations" while stemming reads
# "text_without_stopfreq" — confirm the different input column is intentional.
df["text_lemmatized"] = df["text_without_punctuations"].apply(lemmatize_words)
df.head()