<a href="https://colab.research.google.com/github/sahanyafernando/My_NLP_Learning/blob/main/NLP_Learning/Lowecasing_StopwordRemoval_Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Demonstration: Lowercasing, Stopword Removal, Tokenization and One-Hot Encoding

###Lowercasing, Stopword Removal and Tokenization

In [None]:
import pandas as pd
# Load dataset
df = pd.read_csv("text_classification.csv")
print("Original Dataset:")
print(df.head())

Original Dataset:
                                        text  category
0  The stock market rose by 300 points today   finance
1   Manchester United won the football match    sports
2  New research shows benefits of meditation    health
3            The government passed a new law  politics
4      Scientists discovered a new exoplanet   science


In [None]:
import nltk
import spacy
nlp = spacy.load("en_core_web_sm")

### Lowercasing

In [None]:
# This outputs one colunm with its lower case form
df["Lowercase_Pandas"] = df["text"].str.lower()
print(df[["Lowercase_Pandas"]].head())


                            Lowercase_Pandas
0  the stock market rose by 300 points today
1   manchester united won the football match
2  new research shows benefits of meditation
3            the government passed a new law
4      scientists discovered a new exoplanet


In [None]:
import pandas as pd
# if we want to get the 2 colunms with their lowercase form then
df["text_lower"] = df["text"].str.lower()
df["category_lower"] = df["category"].str.lower()
print(df[["text_lower", "category_lower"]].head())

                                  text_lower category_lower
0  the stock market rose by 300 points today        finance
1   manchester united won the football match         sports
2  new research shows benefits of meditation         health
3            the government passed a new law       politics
4      scientists discovered a new exoplanet        science


In [None]:
# Using spaCy
df = pd.read_csv("faq_dataset.csv")

def lowercase_spacy(text):
  tokens = [token.text.lower() for token in nlp(text)]
  return " ".join(tokens)    # if we want we can add separate character like " "
df["Lowercase_spaCy"] = df["question"].apply(lowercase_spacy)
print(df[["Lowercase_spaCy"]].head())

                 Lowercase_spaCy
0                  what is nlp ?
1         who developed python ?
2     what is a neural network ?
3   what is sentiment analysis ?
4  what is tokenization in nlp ?


### Stop Word Removal

In [None]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_word_nltk = set(stopwords.words("english"))
stop_words_spacy = nlp.Defaults.stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Load CSV
df = pd.read_csv("text_classification.csv")

# Convert 'text' column to lowercase
df["text_lower"] = df["text"].str.lower()

# Using NLTK
def remove_stopwords_nltk(text):
  words = word_tokenize(text)
  return " ".join([word for word in words if word not in stop_word_nltk])
df["No_Stopwords_NLTK"] = df["text_lower"].apply(remove_stopwords_nltk)
print(df[["No_Stopwords_NLTK"]].head())


                        No_Stopwords_NLTK
0      stock market rose 300 points today
1        manchester united football match
2  new research shows benefits meditation
3               government passed new law
4     scientists discovered new exoplanet


### One Hot Encoding

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Using NLTK  tokenize
from nltk.tokenize import word_tokenize
df["Tokenized_NLTK"] = df["text_lower"].apply(word_tokenize)
print(df["Tokenized_NLTK"].head())




0    [the, stock, market, rose, by, 300, points, to...
1      [manchester, united, won, the, football, match]
2     [new, research, shows, benefits, of, meditation]
3               [the, government, passed, a, new, law]
4          [scientists, discovered, a, new, exoplanet]
Name: Tokenized_NLTK, dtype: object


In [None]:
mlb = MultiLabelBinarizer()
df_onehot_nltk = pd.DataFrame(mlb.fit_transform(df["Tokenized_NLTK"]), columns = mlb.classes_)
print("- Using NLTK:")
print(df_onehot_nltk.head())


- Using NLTK:
   300  a  benefits  by  discovered  exoplanet  football  government  law  \
0    1  0         0   1           0          0         0           0    0   
1    0  0         0   0           0          0         1           0    0   
2    0  0         1   0           0          0         0           0    0   
3    0  1         0   0           0          0         0           1    1   
4    0  1         0   0           1          1         0           0    0   

   manchester  ...  points  research  rose  scientists  shows  stock  the  \
0           0  ...       1         0     1           0      0      1    1   
1           1  ...       0         0     0           0      0      0    1   
2           0  ...       0         1     0           0      1      0    0   
3           0  ...       0         0     0           0      0      0    1   
4           0  ...       0         0     0           1      0      0    0   

   today  united  won  
0      1       0    0  
1      0    

In [None]:
# Using Scikit-learn
vectorizer = CountVectorizer()
df_onehot_sklearn = pd.DataFrame(vectorizer.fit_transform(df["No_Stopwords_NLTK"]).toarray(), columns = vectorizer.get_feature_names_out())
print("- Using Scikit-learn:")
print(df_onehot_sklearn.head())

- Using Scikit-learn:
   300  benefits  discovered  exoplanet  football  government  law  \
0    1         0           0          0         0           0    0   
1    0         0           0          0         1           0    0   
2    0         1           0          0         0           0    0   
3    0         0           0          0         0           1    1   
4    0         0           1          1         0           0    0   

   manchester  market  match  ...  new  passed  points  research  rose  \
0           0       1      0  ...    0       0       1         0     1   
1           1       0      1  ...    0       0       0         0     0   
2           0       0      0  ...    1       0       0         1     0   
3           0       0      0  ...    1       1       0         0     0   
4           0       0      0  ...    1       0       0         0     0   

   scientists  shows  stock  today  united  
0           0      0      1      1       0  
1           0      0  

In [None]:
df.to_csv("processed_dataset.csv", index = False)
print("Preprocessing complete. Saved as processed_dataset.csv.")

Preprocessing complete. Saved as processed_dataset.csv.


# Demonstration : Word Tokenization

In [1]:
import pandas as pd # Import pandas for handling CSV data
import nltk # Import NLTK for natural language processing
from nltk.tokenize import word_tokenize, sent_tokenize # Import tokenization functions from nltk
from transformers import BertTokenizer # Import BERT tokenizer for subword tokenization

# Download necessary resources for NLTK tokenization
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Load dataset from CSV file
file_path = "faq_dataset.csv"
df = pd.read_csv(file_path) # Read CSV file into a DataFrame

In [3]:
# Ensure 'faq_Review' column exists in the dataset
if 'faq_Review' not in df.columns:
    raise ValueError("Error: 'faq_Review' column not found in the dataset.")


ValueError: Error: 'faq_Review' column not found in the dataset.

In [4]:
# Display first few rows of the dataset to understand its structure
print("Original Dataset:")
print(df.head())

Original Dataset:
                       question  \
0                  What is NLP?   
1         Who developed Python?   
2     What is a neural network?   
3   What is sentiment analysis?   
4  What is tokenization in NLP?   

                                              answer  
0  Natural Language Processing is a field of AI t...  
1          Python was developed by Guido van Rossum.  
2  A neural network is a set of algorithms modele...  
3  Sentiment analysis is the process of determini...  
4  Tokenization is splitting text into words, phr...  


In [6]:
# WORD TOKENIZATION
print("\nWord Tokenization:")
df['Word_Tokens'] = df['question'].apply(word_tokenize) # Apply word tokenization
print(df[['question', 'Word_Tokens']].head())


Word Tokenization:
                       question                           Word_Tokens
0                  What is NLP?                    [What, is, NLP, ?]
1         Who developed Python?           [Who, developed, Python, ?]
2     What is a neural network?     [What, is, a, neural, network, ?]
3   What is sentiment analysis?    [What, is, sentiment, analysis, ?]
4  What is tokenization in NLP?  [What, is, tokenization, in, NLP, ?]


In [7]:
# SUBWORD TOKENIZATION (BERT Tokenizer)
print("\nSubword Tokenization (BERT):")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Load pre-trained BERT tokenizer
df['Subword_Tokens'] = df['question'].apply(lambda x: tokenizer.tokenize(x)) # Apply subword tokenization
print(df[['question', 'Subword_Tokens']].head())


Subword Tokenization (BERT):


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

                       question                                Subword_Tokens
0                  What is NLP?                        [what, is, nl, ##p, ?]
1         Who developed Python?                   [who, developed, python, ?]
2     What is a neural network?             [what, is, a, neural, network, ?]
3   What is sentiment analysis?            [what, is, sentiment, analysis, ?]
4  What is tokenization in NLP?  [what, is, token, ##ization, in, nl, ##p, ?]


In [8]:
# SENTENCE TOKENIZATION
print("\nSentence Tokenization:")
df['sent_tokenize'] = df['question'].apply(sent_tokenize) # Apply sentence tokenization
print(df[['question', 'sent_tokenize']].head())


Sentence Tokenization:
                       question                   sent_tokenize
0                  What is NLP?                  [What is NLP?]
1         Who developed Python?         [Who developed Python?]
2     What is a neural network?     [What is a neural network?]
3   What is sentiment analysis?   [What is sentiment analysis?]
4  What is tokenization in NLP?  [What is tokenization in NLP?]


In [11]:
# CHARACTER TOKENIZATION
print("\nCharacter Tokenization:")
df['character_tokens'] = df['question'].apply(list)
print(df[['question', 'character_tokens']].head())


Character Tokenization:
                       question  \
0                  What is NLP?   
1         Who developed Python?   
2     What is a neural network?   
3   What is sentiment analysis?   
4  What is tokenization in NLP?   

                                    character_tokens  
0               [W, h, a, t,  , i, s,  , N, L, P, ?]  
1  [W, h, o,  , d, e, v, e, l, o, p, e, d,  , P, ...  
2  [W, h, a, t,  , i, s,  , a,  , n, e, u, r, a, ...  
3  [W, h, a, t,  , i, s,  , s, e, n, t, i, m, e, ...  
4  [W, h, a, t,  , i, s,  , t, o, k, e, n, i, z, ...  
