In [134]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [154]:
# Load the tweets CSV; the demo loop further down reads its "text" column.
# NOTE(review): absolute path — looks like a hosted-session location; adjust when running elsewhere.
data = pd.read_csv("/content/twcs.csv", sep=",", encoding="utf-8")

In [155]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [166]:
def one_hot_encoding(text):
  """One-hot encode the whitespace-separated tokens of ``text``.

  Each token is first mapped to an integer label, and every label is then
  expanded into a one-hot row: one row per token occurrence, one column per
  distinct token.

  Returns the matrix produced by ``OneHotEncoder.fit_transform`` (the demo
  loop in this notebook densifies it with ``.toarray()``).
  """
  tokens = text.split()
  token_ids = LabelEncoder().fit_transform(tokens)
  # OneHotEncoder wants 2-D input: one sample per row, a single feature column.
  id_column = token_ids.reshape(-1, 1)
  return OneHotEncoder().fit_transform(id_column)

In [76]:
def bow_encoding(text):
  """Bag-of-words (unigram count) encoding of ``text``.

  ``text`` is split on ``'. '`` and each resulting piece is treated as one
  document; the vocabulary is learned from those pieces alone.

  Returns the document-term count matrix from ``CountVectorizer.fit_transform``.
  """
  sentences = text.split('. ')
  return CountVectorizer().fit_transform(sentences)

In [83]:
def bigram_bow_encoding(text):
  """Bag-of-words encoding of ``text`` that counts bigrams only.

  ``text`` is split on ``'. '`` and each piece is treated as one document.
  ``ngram_range=(2, 2)`` limits the features to two-word sequences.

  Returns the document-term count matrix from ``CountVectorizer.fit_transform``.
  """
  sentences = text.split('. ')
  vectorizer = CountVectorizer(ngram_range=(2,2))
  return vectorizer.fit_transform(sentences)

In [85]:
def trigram_bow_encoding(text):
  """Bag-of-words encoding of ``text`` that counts trigrams only.

  ``text`` is split on ``'. '`` and each piece is treated as one document.
  ``ngram_range=(3, 3)`` limits the features to three-word sequences.

  Returns the document-term count matrix from ``CountVectorizer.fit_transform``.
  """
  sentences = text.split('. ')
  vectorizer = CountVectorizer(ngram_range=(3,3))
  return vectorizer.fit_transform(sentences)

In [106]:
def tfidf_encoding(text):
  """TF-IDF encoding of ``text``.

  ``text`` is split on ``'. '`` and each piece is treated as one document,
  so the inverse-document-frequency weights are computed across those
  pieces only.

  Returns the document-term matrix from ``TfidfVectorizer.fit_transform``.
  """
  sentences = text.split('. ')
  vectorizer = TfidfVectorizer()
  return vectorizer.fit_transform(sentences)

In [171]:
# Maps each encoder function to the heading printed above its sample output.
# Built from ordered pairs, so iteration order matches the order listed here.
text_encoding_functions = dict([
    (one_hot_encoding, "One Hot Encoding"),
    (bow_encoding, "Bag-of-words Encoding"),
    (bigram_bow_encoding, "Bag-of-words Encoding(Bigram)"),
    (trigram_bow_encoding, "Bag-of-words Encoding(Trigram)"),
    (tfidf_encoding, "TF-IDF Encoding"),
])

In [173]:
# Run every encoder over the first three rows and print the encoding of the first row.
for text_encoding_function, encoding_name in text_encoding_functions.items():
  # .copy() yields an independent frame, so overwriting "text" below cannot
  # touch `data` and does not rely on assigning into a slice of it.
  data_sample = data.head(3).copy()
  data_sample["text"] = data_sample["text"].astype(str).apply(text_encoding_function)
  print(encoding_name)
  print("________________________________")
  # .iloc[0] selects the first row by position; plain [0] is a label lookup
  # that only works when the index happens to contain the label 0.
  print(data_sample["text"].iloc[0].toarray())
  print("\n")



One Hot Encoding
________________________________
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 



**One hot encoding**

Advantages-


*   One-hot encoding is used in NLP to encode categorical features, such as words or part-of-speech tags, as binary vectors.
*   This approach is helpful because machine learning algorithms generally operate on numerical data, so representing text data as numerical vectors is required for these algorithms to work.
*   In a sentiment analysis task, for example, we might represent each word in a sentence as a one-hot encoded vector and then use these vectors as input to a neural network to predict the sentiment of the sentence.

Disadvantages-



*   One of the major disadvantages of one-hot encoding in NLP is that it produces high-dimensional sparse vectors that can be extremely costly to process
*   Furthermore, because one-hot encoding does not capture the semantic relationships between words, machine-learning models that use these vectors as input may perform poorly.









**Bag of words(BOW)**

Advantages-


*   BoW is easy to understand and implement, making it a great starting point for text analysis.


*   It can be used in various NLP tasks like sentiment analysis, document classification, and information retrieval.
*   BoW treats each word independently, making it scalable and efficient for large datasets

Disadvantages-


*   It ignores the order and context of words, losing valuable information about sentence structure and semantics.


*   For larger vocabularies or extensive text datasets, the resulting vectors can become extremely high-dimensional and sparse, impacting computational efficiency.
*   Words not present in the vocabulary are disregarded, leading to information loss.








**N-Grams**

Advantages-


*   The concept of n-grams is simple and easy to use yet powerful. Hence, it can be used to build a variety of applications in NLP, like language models, spelling correctors, etc.

Disadvantages-


*   N-grams cannot deal with out-of-vocabulary (OOV) words. They work well with words present in the training set, but when an OOV word appears, an n-gram model has no way to handle it.
*   Another serious concern about n-grams is sparsity: the number of possible n-grams grows rapidly with n, so most of them occur rarely or never in the data.





**TF-IDF**

Advantages-


*   TF-IDF emphasizes the significance of terms that are crucial to individual documents but not necessarily common across the corpus.
*   It’s adaptable to various NLP tasks such as information retrieval, text summarization, and keyword extraction


*   TF-IDF is straightforward to implement and can quickly provide insights into the importance of words in a document.


Disadvantages-


*   In large datasets with numerous unique terms, the TF-IDF matrix can become extremely sparse, consuming more memory.


*   It doesn’t consider the sequence or position of words within a document, potentially losing some contextual information.
*   Depending on how stop words are handled, they might affect the TF-IDF scores disproportionately.




