In [None]:
# Import libraries here that you need for different processing steps
import nltk
import csv
import spacy
import pandas as pd

## Text Representation

### Word-to-vector

You can implement one or several of these approaches and compare them to find out which one produces the best results.

In [None]:
# Load the COVID tweet dataset that the text-representation examples below draw from.
data_df = pd.read_csv("Dataset/covid.csv")

# Report the number of rows, then render the frame with the notebook's rich display.
print("Data set: ", data_df.shape[0])

display(data_df)

### One-Hot Encoding

In [None]:
# One-hot encoding illustrated on the words of a single tweet from the dataset.
# Every distinct word gets its own column, so the result has one row per word and
# one column per vocabulary entry and is almost entirely zeros -- the sparsity that
# makes this representation a poor design for very large datasets.

from sklearn.preprocessing import OneHotEncoder
import itertools
import numpy as np

# Whitespace-split one tweet; despite the name, each entry of `docs` is a single word.
docs = data_df.iloc[6]["OriginalTweet"].split()
print(len(docs), docs, "\n")

# Wrap every word in its own one-element list: OneHotEncoder expects 2-D input of
# shape (n_samples, n_features), so each word becomes a sample with a single feature.
tokens_docs = [[token] for token in docs]

# Flatten back to one token list in order to build the vocabulary.
# For large-scale input, prefer the lazy form:
# all_tokens = itertools.chain.from_iterable(tokens_docs)
all_tokens = [token for doc in tokens_docs for token in doc]

# Sort the vocabulary before numbering it. Iterating a bare set of strings follows
# hash order, which can change between kernel restarts, so the ids (and therefore
# the columns of X) would not be reproducible from run to run.
word_to_id = {token: idx for idx, token in enumerate(sorted(set(all_tokens)))}
print(len(word_to_id), word_to_id, "\n")

# Replace each token by its integer id, keeping the one-id-per-row layout.
token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

# One-hot encode the ids; column j corresponds to the word whose id is j.
vec = OneHotEncoder(categories="auto")
X = vec.fit_transform(token_ids)
# fit_transform returns a sparse matrix; convert it to a dense array for display.
X = X.toarray()
X

### Bag of Words (BoW) - CountVectorizer

In [None]:
# Bag-of-words demo: CountVectorizer applied to a small slice of tweets for exploration.

from sklearn.feature_extraction.text import CountVectorizer

# Toy alternative to experiment with: text = ["i love nlp. nlp is so cool"]
text = data_df.iloc[1:4]["OriginalTweet"]
print(text, "\n")

# fit() tokenizes the tweets, learns the vocabulary, and returns the vectorizer itself.
vectorizer = CountVectorizer().fit(text)
print(vectorizer.vocabulary_, "\n")

# Encode the same tweets as a document-term count matrix.
vector = vectorizer.transform(text)

# Summarize the encoding: its shape first, then the dense counts.
print(vector.shape, "\n")
print(vector.toarray())

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
# TF-IDF demo on a small slice of tweets, for exploration only.

from sklearn.feature_extraction.text import TfidfVectorizer

# A simpler toy corpus to experiment with instead:
# text1 = ['i love nlp', 'nlp is so cool',
# 'nlp is all about helping machines process language',
# 'this tutorial is on basic nlp technique']
# print(text1)

text1 = data_df.iloc[1:6]["OriginalTweet"].tolist()
print(text1)

# fit() returns the vectorizer itself, so `txt_fitted` and `tf` name the same object.
tf = TfidfVectorizer()
txt_fitted = tf.fit(text1)
txt_transformed = txt_fitted.transform(text1)

# Pair each vocabulary term with its learned inverse-document-frequency weight.
idf = txt_fitted.idf_
print(dict(zip(tf.get_feature_names_out(), idf)))

### UniGrams, BiGrams, TriGrams

In [None]:
from nltk.util import ngrams
def ngram_convertor(sentence, n=3):
    """Print each n-gram of ``sentence``, one tuple per line.

    The sentence is split on whitespace and the resulting words are handed to
    ``ngrams`` to produce runs of ``n`` consecutive words.

    Args:
        sentence: Text to break into n-grams.
        n: Number of consecutive words per gram; defaults to 3 (trigrams).

    Returns:
        None. The n-grams are written to standard output.
    """
    words = sentence.split()
    for gram in ngrams(words, n):
        print(gram)

In [None]:
# Experiment with different tweets and values of N to watch the n-grams being built.
N = 3
sentence = data_df.iloc[2]["OriginalTweet"]
print(sentence)
ngram_convertor(sentence, n=N)

## Discussion Questions

- Between one-hot encoding, count-based bag-of-words, and TF-IDF, which representation would you expect to work better for document classification, and why?
    - Do you think any of these representations is always superior to the others across different NLP tasks and datasets? Why or why not?
- TF-IDF relies on statistics from the training corpus. What potential issues might arise when applying the model to unseen documents? How could these issues be mitigated?