In [37]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

## ENCODING

- Frequency Based Encoding techniques
1. OHE - One Hot Encoding
2. BOW - Bag Of Words
3. TF-IDF
4. n-gram

### Loading the clean dataset post text pre-processing

In [39]:
# Read the pre-processed IMDB reviews from disk and preview the first five rows.
df_clean_reviews = pd.read_csv(filepath_or_buffer='clean_imdb_reviews.csv')
df_clean_reviews.head(n=5)

Unnamed: 0,clean_review,sentiment
0,one reviewer mention watch oz episode youll ho...,positive
1,wonderful little production br br film techniq...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive


In [40]:
# Keep only the leading `no_of_rows` reviews so the encoders below work on a small frame.
no_of_rows = 100
df_review_subset = df_clean_reviews.iloc[:no_of_rows]
df_review_subset

Unnamed: 0,clean_review,sentiment
0,one reviewer mention watch oz episode youll ho...,positive
1,wonderful little production br br film techniq...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
95,daniel daylewis versatile actor alive english ...,positive
96,guess would originally go least two part thus ...,negative
97,well like watch bad horror bmovies cause think...,negative
98,bad movie ever see well worst probably ever se...,negative


### 1. OHE - One Hot Encoding

In [41]:
# One-hot encode the sentiment label into one indicator column per class,
# named `sentiment_<label>`. dtype=int pins the indicators to 0/1 integers;
# left unset, the dtype follows the installed pandas version (uint8 before
# 2.0, bool from 2.0 on), so the displayed values would differ between runs.
df_ohe = pd.get_dummies(data=df_review_subset['sentiment'], prefix='sentiment', dtype=int)

In [42]:
# Place the one-hot indicator columns next to the original columns (rows matched on index).
df_encoded = pd.concat(objs=[df_review_subset, df_ohe], axis='columns')

In [43]:
# Remove the raw label column; its information now lives in the indicator columns.
# Rebinding the name (instead of dropping in place) and tolerating an
# already-removed column keeps this cell safe to re-run: the in-place version
# raised KeyError on a second execution.
df_encoded = df_encoded.drop(columns='sentiment', errors='ignore')

In [44]:
# Preview the first five rows of the one-hot encoded frame.
df_encoded.head(n=5)

Unnamed: 0,clean_review,sentiment_negative,sentiment_positive
0,one reviewer mention watch oz episode youll ho...,0,1
1,wonderful little production br br film techniq...,0,1
2,think wonderful way spend time hot summer week...,0,1
3,basically there family little boy jake think t...,1,0
4,petter matteis love time money visually stunni...,0,1


### 2. BOW - Bag Of Words

In [45]:
# creating function to get BOW

def perform_bow_encoding(df, column_name, n_gram=1):
    """Replace a text column with its Bag-of-Words (n-gram count) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is not modified.
    column_name : str
        Name of the column in ``df`` whose text is vectorized.
    n_gram : int, default 1
        Size of the word n-grams to count (1 = unigram, 2 = bigram, ...).
        Passed to ``CountVectorizer`` as ``ngram_range=(n_gram, n_gram)``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` other than ``column_name``, followed by one count
        column per extracted n-gram, on the same row index as ``df``.

    Notes
    -----
    The extracted feature names are printed as a side effect.
    NOTE(review): an n-gram spelled exactly like one of the remaining column
    names (e.g. the word "sentiment") would give two columns with the same
    label -- confirm whether callers need a prefix on the count columns.
    """
    BOW = CountVectorizer(ngram_range=(n_gram, n_gram))
    document_matrix = BOW.fit_transform(df[column_name])
    features = BOW.get_feature_names_out()
    print('Features - \n', features)

    # Build the count frame on df's own index. With a fresh 0..n-1 index the
    # column-wise concat below would align on labels and, for any df that is
    # filtered, shuffled or otherwise not 0..n-1 indexed, emit misaligned rows
    # padded with NaN.
    word_freq_df = pd.DataFrame(
        document_matrix.toarray(), columns=features, index=df.index
    )

    # Drop the raw text column before concatenating so that only the original
    # column is removed, never a count column that happens to share its name.
    remaining_columns = df.drop(columns=column_name)

    return pd.concat([remaining_columns, word_freq_df], axis=1)
    
    

In [46]:
# Unigram Bag-of-Words: one count column per distinct word in the reviews.
df_bow = perform_bow_encoding(df=df_review_subset, column_name='clean_review')
df_bow.head(n=1)

Features - 
 ['abbot' 'abbreviated' 'abet' ... 'zoo' 'zoom' 'zwick']


### bi-gram

In [48]:
# Bigram Bag-of-Words: count every pair of consecutive words.
df_bow_bigram = perform_bow_encoding(df=df_review_subset, column_name='clean_review', n_gram=2)
df_bow_bigram.head(n=2)

Features - 
 ['abbot impersonate' 'abbreviated storyline' 'abet superior' ...
 'zoo nothing' 'zoom grainy' 'zwick shame']


Unnamed: 0,sentiment,abbot impersonate,abbreviated storyline,abet superior,abetted mother,abide citizen,abide inoffensive,ability shoot,able make,able stand,...,zellweger help,zerog monkeymidgetcrocodile,zeus follower,zeus hera,zombie closet,zombiebr br,zone gun,zoo nothing,zoom grainy,zwick shame
0,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### tri-gram

In [49]:
# Trigram Bag-of-Words: count every run of three consecutive words.
df_bow_trigram = perform_bow_encoding(df=df_review_subset, column_name='clean_review', n_gram=3)
df_bow_trigram.head(n=2)

Features - 
 ['abbot impersonate disappointed' 'abbreviated storyline never'
 'abet superior script' ... 'zoo nothing sit' 'zoom grainy black'
 'zwick shame however']


Unnamed: 0,sentiment,abbot impersonate disappointed,abbreviated storyline never,abet superior script,abetted mother brady,abide citizen know,abide inoffensive insurance,ability shoot gun,able make fun,able stand wrong,...,zellweger help ada,zerog monkeymidgetcrocodile bloodshedbr,zeus follower believe,zeus hera war,zombie closet parent,zombiebr br ok,zone gun rifle,zoo nothing sit,zoom grainy black,zwick shame however
0,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### n-gram (example with n = 4)

In [50]:
# Same encoder with a larger window: count every run of four consecutive words.
df_bow_4_gram = perform_bow_encoding(df=df_review_subset, column_name='clean_review', n_gram=4)
df_bow_4_gram.head(n=2)

Features - 
 ['abbot impersonate disappointed teach'
 'abbreviated storyline never completely' 'abet superior script john' ...
 'zoo nothing sit water' 'zoom grainy black white'
 'zwick shame however cold']


Unnamed: 0,sentiment,abbot impersonate disappointed teach,abbreviated storyline never completely,abet superior script john,abetted mother brady arrange,abide citizen know director,abide inoffensive insurance adjusterbr,ability shoot gun jimmy,able make fun plain,able stand wrong first,...,zellweger help ada put,zerog monkeymidgetcrocodile bloodshedbr br,zeus follower believe movie,zeus hera war male,zombie closet parent fight,zombiebr br ok first,zone gun rifle hand,zoo nothing sit water,zoom grainy black white,zwick shame however cold
0,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,positive,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [51]:
# function to perform tf-idf

def perform_tfidf_encoding(df, column):
    """Replace a text column with its TF-IDF weight columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is not modified.
    column : str
        Name of the column in ``df`` whose text is vectorized with a
        default-configured ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` other than ``column``, followed by one TF-IDF
        weight column per vocabulary term, on the same row index as ``df``.
        (Previously the computed matrix was discarded and nothing was
        returned; callers that ignore the return value are unaffected.)

    Notes
    -----
    The vocabulary and the fitted per-term IDF values are printed as a side
    effect.
    """
    tfidf = TfidfVectorizer()

    # Keep the fitted matrix instead of throwing it away after densifying it.
    tfidf_matrix = tfidf.fit_transform(df[column])
    features = tfidf.get_feature_names_out()
    print(features)

    print(tfidf.idf_)

    # Reuse df's index so the column-wise concat lines rows up by label even
    # when df does not carry a default 0..n-1 index.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=features, index=df.index)

    return pd.concat([df.drop(columns=column), tfidf_df], axis=1)

In [52]:
# Fit TF-IDF on the review text; the vocabulary and IDF values are printed by the function.
perform_tfidf_encoding(df=df_review_subset, column='clean_review')

['abbot' 'abbreviated' 'abet' ... 'zoo' 'zoom' 'zwick']
[4.92197334 4.92197334 4.92197334 ... 4.92197334 4.92197334 4.92197334]


## OHE

### Advantages
<b>1. Preservation of Information</b>: One-hot encoding preserves all the information present in the categorical variable. Each category is represented by a binary vector in which exactly one element is '1' (hot), marking the category that is present, and the rest are '0' (cold), marking the categories that are absent.

<b>2. No Assumption of Ordinality</b>: One-hot encoding treats each category as distinct and unrelated to each other. This is particularly useful when there is no inherent order or ranking among the categories. For example, when encoding colors or product categories.

<b>3. Compatibility with Machine Learning Algorithms</b>: Many machine learning algorithms, especially those based on numerical computations like linear regression, logistic regression, and neural networks, require numerical input. One-hot encoding transforms categorical variables into a numerical format that these algorithms can handle.

<b>4. Handling of Missing Values</b>: One-hot encoding can naturally handle missing values in categorical variables. If a categorical value is missing, all elements of the corresponding one-hot encoded vector will be '0', indicating the absence of that category.

<b>5. Interpretability</b>: One-hot encoding makes the data more interpretable, especially when inspecting the feature importance or coefficients of a model. Each binary feature represents the presence or absence of a specific category, making it easier to understand the contribution of each category to the prediction.

<b>6. Avoidance of Bias</b>: One-hot encoding avoids introducing bias by not assuming any ordinal relationship between categories. This ensures that the model treats all categories equally and does not inadvertently assign higher weights to certain categories based on their encoded values.

### Disadvantages

<b>1. Dimensionality Expansion</b>: One-hot encoding increases the dimensionality of the dataset significantly, especially when dealing with categorical variables with a large number of unique categories. This can lead to the curse of dimensionality, making the dataset more sparse and increasing computational complexity.

<b>2. Memory and Storage Requirements</b>: The increase in dimensionality due to one-hot encoding results in larger memory and storage requirements, as each binary feature adds to the overall size of the dataset. This can become impractical for datasets with a large number of categorical variables or categories.

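<b>3. Collinearity Issues</b>: One-hot encoding can introduce collinearity among the encoded features: because exactly one indicator is '1' in every row, the indicator columns of a variable always sum to 1, so any one of them can be predicted perfectly from the others (the "dummy variable trap"). This can negatively impact the performance of some machine learning algorithms, such as unregularized linear models, which assume the features are not linearly dependent. Dropping one indicator column per variable avoids the problem.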

<b>4. Loss of Semantic Information</b>: One-hot encoding treats each category as independent and unrelated, disregarding any inherent semantic relationships or similarities between categories. As a result, it may not capture meaningful associations between related categories, potentially leading to loss of information.

<b>5. Sparse Representation</b>: One-hot encoding produces sparse matrices with mostly '0' values, except for the '1' values corresponding to the presence of specific categories. Sparse representations can be less memory-efficient and may require specialized algorithms for processing.

<b>6. Curse of Dimensionality</b>: The curse of dimensionality refers to the phenomenon where the volume of the feature space increases exponentially with the number of dimensions. One-hot encoding exacerbates this issue by expanding the feature space, potentially leading to overfitting and increased model complexity.

<b>7. Difficulty in Handling New Categories</b>: One-hot encoding requires predefined category mappings, which means that it may encounter difficulties when dealing with new categories not present in the training data. In such cases, additional preprocessing steps are needed to handle unseen categories.

## Bag Of Words

### Advantages:

<b>1. Simplicity</b>: The BoW model is straightforward to understand and implement. It represents text data as a matrix where each row corresponds to a document and each column corresponds to a unique word in the corpus.

<b>2. Flexibility</b>: The BoW model can be used with various text preprocessing techniques, such as tokenization, stemming, and stop word removal. This flexibility allows researchers to customize the model according to their specific needs.

<b>3. Efficiency</b>: BoW representations are efficient to compute and store, especially for sparse matrices where most elements are zeros. This makes BoW suitable for large text datasets and scalable to high-dimensional feature spaces.

<b>4. Interpretability</b>: BoW representations are interpretable, as each feature corresponds to a specific word or term in the vocabulary. This makes it easier to understand the importance of individual words in the text data and interpret the model's predictions.

<b>5. Versatility</b>: BoW representations can be used with a wide range of machine learning algorithms, including linear models, decision trees, and ensemble methods. This versatility makes BoW applicable to various NLP tasks, such as sentiment analysis, document classification, and information retrieval.

### Disadvantages:

<b>1. Loss of Sequence Information</b>: The BoW model disregards the order and context of words in the text data, treating each document as an unordered collection of words. This results in a loss of sequential information, which may be important for tasks like text generation and language modeling.

<b>2. Sparsity</b>: BoW representations often result in high-dimensional and sparse feature vectors, especially for large vocabularies and datasets. This can lead to increased computational complexity and memory requirements, as well as challenges with overfitting.

<b>3. Vocabulary Size</b>: The size of the vocabulary in the BoW model can grow significantly with the size of the corpus, resulting in large feature spaces and potential difficulties with handling out-of-vocabulary words.

<b>4. Semantic Ambiguity</b>: BoW representations may struggle to capture the semantic meaning of words and phrases, especially in cases of polysemy (multiple meanings) and synonymy (multiple words with similar meanings). This can lead to ambiguities and inaccuracies in the model's predictions.

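<b>5. Ignoring Word Importance</b>: The BoW model records only raw counts, so every word is treated as equally important regardless of how common or rare it is across the corpus; frequent but uninformative words can therefore dominate the representation. This can lead to suboptimal representations, especially for tasks where the relative importance of words is relevant, such as keyword extraction and topic modeling. TF-IDF addresses this by re-weighting the counts.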

## TF-IDF

### Advantages:

<b>1. Term Importance</b>: TF-IDF measures the importance of a term in a document relative to its importance in the entire corpus. Terms that appear frequently in a document but infrequently in the corpus are assigned higher weights, indicating their significance in representing the document's content.

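<b>2. Normalization</b>: TF-IDF rescales term frequencies by the inverse document frequency, which helps mitigate the influence of terms that occur commonly across documents, while the logarithm in the IDF term keeps the weights from growing too quickly with corpus size. In addition, implementations such as scikit-learn's <code>TfidfVectorizer</code> L2-normalize each document vector by default, which makes the representation robust to variations in document length.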

<b>3. Discriminative Power</b>: TF-IDF emphasizes terms that are discriminative or characteristic of specific documents, making it well-suited for tasks like document classification, clustering, and information retrieval. It helps distinguish between documents based on their unique content and topics.

<b>4. Sparse Representation</b>: TF-IDF produces sparse feature vectors where most elements are zeros, similar to the Bag of Words (BoW) model. This sparsity makes the representation memory-efficient and computationally tractable, especially for large text datasets with extensive vocabularies.

<b>5. Interpretability</b>: TF-IDF representations are interpretable, as each feature corresponds to a specific term in the vocabulary. This makes it easier to understand the importance of individual terms in the text data and interpret the model's predictions.

<b>6. Versatility</b>: TF-IDF can be combined with various text preprocessing techniques, such as tokenization, stemming, and stop word removal, to customize the representation according to specific requirements. It can also be used with a wide range of machine learning algorithms, making it applicable to diverse NLP tasks.

### Disadvantages:

<b>1. Lack of Context</b>: Like the Bag of Words (BoW) model, TF-IDF disregards the order and context of terms in the text data, treating each document as an unordered collection of terms. This may result in a loss of sequential information, especially for tasks that require modeling word sequences, such as text generation and language modeling.

<b>2. Vocabulary Size</b>: The size of the vocabulary in the TF-IDF model can grow significantly with the size of the corpus, resulting in large feature spaces and potential difficulties with handling out-of-vocabulary terms. This can increase computational complexity and memory requirements, especially for datasets with extensive vocabularies.

<b>3. Sensitivity to Stop Words</b>: TF-IDF may be sensitive to the presence of stop words, which are common words that often appear in documents but may not carry significant semantic meaning (e.g., "the," "and," "of"). Depending on the task and dataset, it may be necessary to customize the stop word list or apply additional preprocessing steps to address this sensitivity.

<b>4. Semantic Ambiguity</b>: TF-IDF may struggle to capture the semantic meaning of terms, especially in cases of polysemy (multiple meanings) and synonymy (multiple terms with similar meanings). This can lead to ambiguities and inaccuracies in the model's predictions, especially for tasks that require understanding semantic relationships between terms.

<b>5. Difficulty with Rare Terms</b>: TF-IDF assigns high IDF weights to terms that appear in only a few documents. Rare terms that are merely typos, noise tokens or incidental names can therefore be over-emphasized, while filtering them out (for example with a minimum document frequency) risks discarding rare terms that are genuinely informative for a specific task. Either way the model's representation can be distorted, potentially affecting performance on tasks like document similarity and clustering.