## Dependencies

In [1]:
import re
import nltk
import spacy
import emoji
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
from sklearn.svm import SVC
from textblob import TextBlob
from scipy.sparse import hstack
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
from scipy.sparse import spmatrix
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Silence every warning category for the rest of the session (not only RuntimeWarning)
warnings.filterwarnings('ignore')


## Load the dataset

In [2]:
# Read the review / sentiment CSV, keeping pandas' default integer row index
imdb_ratings_df = pd.read_csv('../data/IMDB_Dataset.csv', index_col=None)

# Preview the first ten rows
imdb_ratings_df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## Text Preprocessing

In [3]:
class TextPreprocessor:
    """
    A class for preprocessing text data through cleaning, tokenization, and normalization
    
    Attributes:
    -----------
        lemmatizer : WordNetLemmatizer instance for word lemmatization
        
        stop_words : Set of stopwords to be removed from text
    """
    # Compiled once and shared by all instances; raw strings avoid the invalid '\s' escape warning
    _HTML_TAG_PATTERN   = re.compile(r'<[^>]*>')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        """
        Initialize the TextPreprocessor with required NLTK resources
        
        Raises:
        -------
            LookupError : If the stopword corpus is still unavailable after the download attempt
        """
        # Download required NLTK data (quiet=True suppresses the progress output)
        for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
            nltk.download(resource, quiet=True)

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    @classmethod
    def _normalize(cls, text: str) -> str:
        """
        Strip markup and non-letter characters from text and lowercase it
        
        Arguments:
        ----------
            text { str } : Raw input text
            
        Returns:
        --------
              { str }    : Lowercased text containing only ASCII letters and whitespace
        """
        # Replace tags with a space (not ''), otherwise "end.<br /><br />The" collapses into "endthe"
        text = cls._HTML_TAG_PATTERN.sub(' ', text)

        # Drop digits, punctuation and every other non-letter character
        text = cls._NON_LETTER_PATTERN.sub('', text)

        return text.lower()

    def clean_text(self, text:str) -> str:
        """
        Clean and normalize input text by removing HTML tags, special characters,
        and applying text normalization techniques
        
        Arguments:
        ----------
            text { str } : Input text to be cleaned
            
        Raises:
        -------
            ValueError   : If input text is None, empty, or not a string
            
        Returns:
        --------
              { str }    : Space-joined lemmatized tokens with stopwords removed
        """
        if ((not text) or (not isinstance(text, str))):
            raise ValueError("Input text must be a non-empty string")

        # Tokenization of the normalized text
        tokens = word_tokenize(self._normalize(text))

        # Remove stopwords and lemmatize the remaining tokens
        kept   = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]

        return ' '.join(kept)

In [4]:
# Build the preprocessor once; its constructor fetches the NLTK resources it needs
preprocessor                  = TextPreprocessor()

# Clean every review and store the result in a new 'clean_text' column of the original DataFrame
# (clean_text raises ValueError for an empty or non-string review)
imdb_ratings_df['clean_text'] = imdb_ratings_df['review'].apply(preprocessor.clean_text)

In [5]:
# Preview the first ten rows, now including the derived 'clean_text' column
imdb_ratings_df.head(10)


Unnamed: 0,review,sentiment,clean_text
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
5,"Probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,negative,encouraged positive comment film looking forwa...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


## Exploratory Data Analysis (EDA)

In [6]:
class SentimentEDA:
    """
    A class for comprehensive Exploratory Data Analysis of sentiment-based text data
    
    Every analysis draws a matplotlib figure, writes it to output_dir when one is
    configured, and then closes the figure
    
    Attributes:
    -----------
        df            { DataFrame } : DataFrame containing text and sentiment data (held by reference, not copied)
        
        text_column      { str }    : Name of column containing cleaned text
        
        sentiment_column { str }    : Name of column containing sentiment labels
        
        output_dir       { str }    : Directory that receives the PNG plots (None disables saving)
    """
    
    def __init__(self, df: pd.DataFrame, text_column:str = 'clean_text', sentiment_column:str = 'sentiment',
                 output_dir:str = None) -> None:
        """
        Initialize the SentimentEDA class
        
        Arguments:
        ----------
            df           { DataFrame } : Input DataFrame
            
            text_column      { str }   : Name of text column
            
            sentiment_column { str }   : Name of sentiment column
            
            output_dir       { str }   : Directory to save plots (created if it does not exist)
            
        Raises:
        -------
            ValueError                 : If required columns are not in DataFrame
        """
        if ((text_column not in df.columns) or (sentiment_column not in df.columns)):
            raise ValueError(f"DataFrame must contain columns: {text_column} and {sentiment_column}")
        
        # Initialize the Attributes
        self.df               = df
        self.text_column      = text_column
        self.sentiment_column = sentiment_column
        self.output_dir       = output_dir
        
        if output_dir:
            Path(output_dir).mkdir(parents  = True, 
                                   exist_ok = True)
            
            
    def save_plot(self, plt, plot_name: str) -> None:
        """
        Helper method to save plots if output directory is specified
        
        Arguments:
        ----------
            plt                : Object exposing savefig(); callers in this class pass matplotlib.pyplot.
                                 The parameter keeps its historical name and therefore shadows the
                                 module-level plt inside this method
            
            plot_name { str }  : File name without extension; the plot is written as <plot_name>.png
        """
        if not self.output_dir:
            return
        
        plt.savefig(fname       = Path(self.output_dir) / f"{plot_name}.png", 
                    bbox_inches = 'tight', 
                    dpi         = 300)
        
        # Print the success statement on screen
        print(f"Saved plot: {plot_name}")
    
    
    def _texts_by_sentiment(self) -> dict:
        """
        Concatenate all texts of each sentiment class into one space-separated string
        
        Returns:
        --------
            { dict } : Mapping of sentiment label to the joined text of that class
        """
        # Missing texts are dropped: str.join would raise TypeError on a NaN entry
        return {sentiment: ' '.join(group[self.text_column].dropna().astype(str)) 
                for sentiment, group in self.df.groupby(self.sentiment_column)}
    
    
    def text_length_analysis(self) -> None:
        """
        Analyze and visualize text length distributions
        
        - Character length distribution
        
        - Word length distribution
        
        - Sentence length distribution (number of '.'-separated segments)
        
        Side effect: adds the columns 'char_length', 'word_length' and 'sentence_length' to self.df
        
        NOTE: when text_column holds text from which '.' has already been removed,
        'sentence_length' is 1 for every row
        """
        texts                      = self.df[self.text_column]
        
        # Calculate lengths (written onto the shared DataFrame)
        self.df['char_length']     = texts.str.len()
        self.df['word_length']     = texts.str.split().str.len()
        self.df['sentence_length'] = texts.str.split('.').str.len()
        
        # (column, title, y-axis label) for each of the three stacked panels
        panels                     = [('char_length', 'Character Length Distribution by Sentiment', 'Number of Characters'),
                                      ('word_length', 'Word Length Distribution by Sentiment', 'Number of Words'),
                                      ('sentence_length', 'Sentence Length Distribution by Sentiment', 'Number of Sentences')]
        
        fig, axes                  = plt.subplots(nrows   = 3, 
                                                  ncols   = 1, 
                                                  figsize = (12, 15))
        
        for ax, (column, title, ylabel) in zip(axes, panels):
            sns.boxplot(x    = self.sentiment_column, 
                        y    = column, 
                        data = self.df, 
                        ax   = ax)
            
            ax.set_title(title)
            ax.set_ylabel(ylabel)
        
        plt.tight_layout()
        self.save_plot(plt, 'length_distributions')
        plt.close(fig)
        
        # Print summary statistics
        stats = self.df.groupby(self.sentiment_column)[[column for column, _, _ in panels]].describe()
        
        print("\nLength Statistics by Sentiment:")
        print(f"\n{stats}")
            
            
    def word_frequency_analysis(self, top_n: int = 20) -> None:
        """
        Analyze and visualize word frequencies by sentiment
        
        Arguments:
        ----------
            top_n { int } : Number of top words to display
        """
        sentiment_texts = self._texts_by_sentiment()
        
        # squeeze=False always yields a 2-D axes array, so a single class needs no special case
        fig, axes       = plt.subplots(nrows   = len(sentiment_texts), 
                                       ncols   = 1, 
                                       figsize = (12, 5*len(sentiment_texts)),
                                       squeeze = False)
        
        for ax, (sentiment, text) in zip(axes[:, 0], sentiment_texts.items()):
            word_freq = Counter(text.split()).most_common(top_n)
            
            # A class without any words has nothing to unpack or draw; leave its panel empty
            if word_freq:
                words, freqs = zip(*word_freq)
                
                sns.barplot(x  = list(freqs), 
                            y  = list(words), 
                            ax = ax)
            
            ax.set_title(f'Top {top_n} Words for {sentiment} Sentiment')
            ax.set_xlabel('Frequency')
            
        plt.tight_layout()
        self.save_plot(plt, 'word_frequencies')
        plt.close(fig)
            
            
    def generate_wordclouds(self) -> None:
        """
        Generate one wordcloud per sentiment category, side by side
        """
        sentiment_texts = self._texts_by_sentiment()
        
        fig, axes       = plt.subplots(1, len(sentiment_texts), figsize=(15, 8), squeeze=False)
        
        for ax, (sentiment, text) in zip(axes[0], sentiment_texts.items()):
            # Skip drawing for a class with no words instead of handing empty text to WordCloud
            if text.strip():
                wordcloud = WordCloud(width            = 800, 
                                      height           = 400,
                                      background_color = 'white',
                                      max_words        = 150).generate(text)
                
                ax.imshow(wordcloud, interpolation='bilinear')
            
            ax.axis('off')
            ax.set_title(f'{sentiment} Sentiment WordCloud')
            
        plt.tight_layout()
        self.save_plot(plt, 'wordclouds')
        plt.close(fig)
            
            
    def sentiment_intensity_analysis(self) -> None:
        """
        Analyze sentiment intensity using TextBlob's polarity scores
        
        Side effect: adds a 'polarity' column to self.df
        """
        # Calculate polarity scores (written onto the shared DataFrame)
        self.df['polarity'] = self.df[self.text_column].apply(lambda x: TextBlob(x).sentiment.polarity)
        
        # Create distribution plot
        fig, ax             = plt.subplots(figsize = (10, 6))
        
        sns.kdeplot(data = self.df, 
                    x    = 'polarity', 
                    hue  = self.sentiment_column,
                    ax   = ax)
        
        ax.set_title('Sentiment Polarity Distribution')
        ax.set_xlabel('Polarity Score')
        ax.set_ylabel('Density')

        self.save_plot(plt, 'sentiment_intensity')
        plt.close(fig)
        
        # Print summary statistics
        stats = self.df.groupby(self.sentiment_column)['polarity'].describe()
        print("\nPolarity Statistics by Sentiment:")
        print(f"\n{stats}")
        
        
    @staticmethod
    def _assemble_pos_frame(pos_counts: pd.Series, labels: pd.Series, label_column: str) -> pd.DataFrame:
        """
        Combine per-document POS counters and their sentiment labels into one frame
        
        Arguments:
        ----------
            pos_counts   { Series } : One Counter of POS tags per document
            
            labels       { Series } : Sentiment label of each document, in the same row order as pos_counts
            
            label_column   { str }  : Name of the label column in the result
            
        Returns:
        --------
            { DataFrame }           : One column per POS tag (0 where a tag is absent) plus label_column
        """
        # Keep the documents' own index; a tag missing from a document is a count of 0, not NaN
        pos_df               = pd.DataFrame(pos_counts.tolist(), index=pos_counts.index).fillna(0)
        
        # Assign by position: index-aligned assignment onto a fresh RangeIndex is what mislabelled rows before
        pos_df[label_column] = labels.to_numpy()
        
        return pos_df
        
        
    def pos_distribution_analysis(self) -> None:
        """
        Analyze distribution of Parts of Speech across sentiments
        
        Uses spaCy's 'en_core_web_sm' pipeline on a random sample of at most 1000 texts
        """
        nlp = spacy.load('en_core_web_sm')
        
        def get_pos_counts(text: str):
            return Counter(token.pos_ for token in nlp(text))
        
        # Get POS counts for sample of texts (for efficiency)
        sample_size = min(1000, len(self.df))
        sample_df   = self.df.sample(sample_size, random_state=42)
        
        pos_df      = self._assemble_pos_frame(pos_counts   = sample_df[self.text_column].apply(get_pos_counts),
                                               labels       = sample_df[self.sentiment_column],
                                               label_column = self.sentiment_column)
        
        # Long format: one row per (document, POS tag) pair
        pos_melted  = pos_df.melt(id_vars    = [self.sentiment_column],
                                  var_name   = 'POS',
                                  value_name = 'Count')
        
        fig, ax     = plt.subplots(figsize = (12, 6))
        sns.boxplot(x    = 'POS', 
                    y    = 'Count', 
                    hue  = self.sentiment_column, 
                    data = pos_melted,
                    ax   = ax)
        
        plt.xticks(rotation = 45)
        ax.set_title('Distribution of Parts of Speech by Sentiment')

        self.save_plot(plt, 'pos_distribution')
        plt.close(fig)
            
            
    @staticmethod
    def _readability_metrics(text: str) -> dict:
        """
        Compute rough readability metrics for one text
        
        Arguments:
        ----------
            text { str } : Text to score
            
        Returns:
        --------
            { dict }     : 'avg_word_length', 'avg_sentence_length' and 'flesch_reading_ease';
                           all three are NaN when the text contains no words
        """
        words     = text.split()
        
        # Without words the averages are undefined (and the Flesch term would divide by zero)
        if not words:
            return {'avg_word_length': np.nan,
                    'avg_sentence_length': np.nan,
                    'flesch_reading_ease': np.nan
                   }
        
        sentences           = text.split('.')
        
        # Rough approximation: one syllable per three characters
        syllables           = sum(len(word) / 3 for word in words)
        
        # Calculate metrics
        avg_word_length     = np.mean([len(word) for word in words])
        avg_sentence_length = len(words) / len(sentences)
        flesch_reading_ease = 206.835 - 1.015 * avg_sentence_length - 84.6 * (syllables / len(words))
        
        return {'avg_word_length': avg_word_length,
                'avg_sentence_length': avg_sentence_length,
                'flesch_reading_ease': flesch_reading_ease
               }
            
            
    def analyze_readability(self) -> None:
        """
        Analyze text readability metrics
        
        Draws one box plot per metric (average word length, average sentence length,
        approximate Flesch reading ease), grouped by sentiment
        """
        metric_names   = ['avg_word_length', 'avg_sentence_length', 'flesch_reading_ease']
        
        # Calculate readability metrics, keeping rows tied to self.df's own index
        scores         = self.df[self.text_column].apply(self._readability_metrics)
        readability_df = pd.DataFrame(scores.tolist(), index=self.df.index)
        
        # Positional assignment: correct even when self.df does not carry a default RangeIndex
        readability_df[self.sentiment_column] = self.df[self.sentiment_column].to_numpy()
        
        # Create visualizations
        fig, axes      = plt.subplots(nrows   = 3, 
                                      ncols   = 1, 
                                      figsize = (10, 15))
        
        for ax, metric in zip(axes, metric_names):
            sns.boxplot(x    = self.sentiment_column,
                        y    = metric,
                        data = readability_df,
                        ax   = ax)
            
            ax.set_title(f'{metric.replace("_", " ").title()} by Sentiment')
            
        plt.tight_layout()
        self.save_plot(plt, 'readability_metrics')
        plt.close(fig)
         
        
    def run_full_eda(self) -> None:
        """
        Run all EDA analyses in a fixed order, announcing each one before it starts
        """
        analyses = [self.text_length_analysis,
                    self.word_frequency_analysis,
                    self.generate_wordclouds,
                    self.sentiment_intensity_analysis,
                    self.pos_distribution_analysis,
                    self.analyze_readability
                   ]
        
        for analysis in analyses:
            print(f"Running {analysis.__name__}...")
            analysis()
            
        print("EDA pipeline completed successfully")
        

### Initialize the EDA class

In [7]:
# Analyse the cleaned reviews; the constructor creates the output directory if it is missing
# and every analysis below writes its figure there as a PNG
eda = SentimentEDA(df               = imdb_ratings_df,
                   text_column      = 'clean_text',
                   sentiment_column = 'sentiment',
                   output_dir       = '../results/EDA_Results/')

# Alternative to the individual cells below: run every analysis in one call
#eda.run_full_eda()

### Text Length Analysis

In [None]:
# Box plots of character, word and '.'-separated segment counts per sentiment;
# also adds the three length columns to imdb_ratings_df
eda.text_length_analysis()


### Word Frequency Analysis

In [None]:
# Bar chart of the 25 most frequent tokens in each sentiment class
eda.word_frequency_analysis(top_n = 25)


### WordClouds

In [None]:
# One word cloud (at most 150 words) per sentiment class
eda.generate_wordclouds()


### Sentiment Intensity Analysis

In [None]:
# Density plot of TextBlob polarity per sentiment; also adds a 'polarity' column to imdb_ratings_df
eda.sentiment_intensity_analysis()


### POS Distribution Analysis

In [None]:
# spaCy POS-tag counts on a random sample of at most 1000 reviews (random_state=42)
eda.pos_distribution_analysis()


### Readability Analysis

In [None]:
# Box plots of average word length, average sentence length and an approximate Flesch reading-ease score
eda.analyze_readability()


## Feature Engineering

In [None]:
class TextFeatureEngineering:
    """
    A class for implementing various text feature engineering techniques
    
    Every create_* method returns a (fitted vectorizer / transformer, sparse document-term matrix) tuple
    
    Attributes:
    -----------
        texts        { list }  : List of preprocessed text documents
        
        max_features  { int }  : Maximum number of features to create
        
        ngram_range  { tuple } : Range of n-grams to consider
    """
    
    def __init__(self, texts: list, max_features: int = None, ngram_range: tuple = (1, 3)) -> None:
        """
        Initialize TextFeatureEngineering with texts and parameters
        
        Arguments:
        ----------
            texts        : List of preprocessed text documents
            
            max_features : Maximum number of features (None for no limit)
            
            ngram_range  : Range of n-grams to consider (min_n, max_n)
            
        Raises:
        -------
            ValueError   : If texts is None or empty (the other parameters are not validated here)
        """
        # len() rather than truthiness: `not texts` is ambiguous for array-like inputs
        if texts is None or len(texts) == 0:
            raise ValueError("Input texts cannot be empty")
            
        self.texts        = texts
        self.max_features = max_features
        self.ngram_range  = ngram_range
        
        
    def _fit_and_report(self, vectorizer, documents, label: str) -> tuple:
        """
        Fit vectorizer on documents, print the resulting feature count and return both
        
        Arguments:
        ----------
            vectorizer       : Object exposing fit_transform()
            
            documents        : Documents handed to fit_transform()
            
            label   { str }  : Feature-type name used in the "Created N <label> features" message
            
        Returns:
        --------
            { tuple }        : (fitted vectorizer, document-term matrix)
        """
        features = vectorizer.fit_transform(documents)
        print(f"Created {features.shape[1]} {label} features")
        
        return vectorizer, features
        
        
    def create_binary_bow(self) -> tuple:
        """
        Create binary bag-of-words features (presence/absence)
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted CountVectorizer
                                          - Binary document-term matrix
        """
        print("Creating binary bag-of-words features...")
        vectorizer = CountVectorizer(binary       = True,
                                     max_features = self.max_features,
                                     ngram_range  = self.ngram_range)
        
        return self._fit_and_report(vectorizer, self.texts, "binary")
            
            
    def create_count_bow(self) -> tuple:
        """
        Create count-based bag-of-words features
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted CountVectorizer
                                          - Count document-term matrix
        """
        print("Creating count-based bag-of-words features...")
        vectorizer = CountVectorizer(max_features = self.max_features,
                                     ngram_range  = self.ngram_range)
        
        return self._fit_and_report(vectorizer, self.texts, "count-based")
            
            
    def create_frequency_bow(self) -> tuple:
        """
        Create frequency-based bag-of-words features (term frequency, IDF weighting disabled)
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted TfidfVectorizer
                                          - Term frequency document-term matrix
        """
        print("Creating frequency-based bag-of-words features...")
        vectorizer = TfidfVectorizer(use_idf      = False,
                                     max_features = self.max_features,
                                     ngram_range  = self.ngram_range)
        
        return self._fit_and_report(vectorizer, self.texts, "frequency-based")
            
            
    def create_tfidf(self) -> tuple:
        """
        Create TF-IDF features
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted TfidfVectorizer
                                          - TF-IDF document-term matrix
        """
        print("Creating TF-IDF features...")
        vectorizer = TfidfVectorizer(max_features = self.max_features,
                                     ngram_range  = self.ngram_range)
        
        return self._fit_and_report(vectorizer, self.texts, "TF-IDF")
            
            
    def create_standardized_tfidf(self) -> tuple:
        """
        Create Standardized TF-IDF features
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted TfidfVectorizer (the fitted scaler is not returned)
                                          - Standardized TF-IDF document-term matrix
        """
        print("Creating Standardized TF-IDF features...")
        vectorizer          = TfidfVectorizer(max_features = self.max_features, 
                                              ngram_range  = self.ngram_range)
        
        tfidf_matrix        = vectorizer.fit_transform(self.texts)
        
        # with_mean=False: scale to unit variance only, without centering the sparse matrix
        standardized_matrix = StandardScaler(with_mean = False).fit_transform(tfidf_matrix)
        
        print(f"Created {standardized_matrix.shape[1]} standardized TF-IDF features")
        return vectorizer, standardized_matrix
            
            
    @staticmethod
    def _bm25_scores(variant: str, tf, idf, doc_lengths, avg_doc_length: float,
                     k1: float = 1.5, b: float = 0.75, delta: float = 1.0):
        """
        Score term occurrences with one of the supported BM25-style formulas (vectorized)
        
        NOTE(review): the "BM25F", "BM25L", "BM25+" and "BM25T" branches are the simplified
        formulas this notebook has always used; they do not obviously match the published
        definitions of those variants -- confirm before relying on the names
        
        Arguments:
        ----------
            variant        : One of "BM25", "BM25F", "BM25L", "BM25+", "BM25T"
            
            tf             : Term count of each occurrence
            
            idf            : IDF weight of the term of each occurrence (same length as tf)
            
            doc_lengths    : Token count of the document of each occurrence (same length as tf)
            
            avg_doc_length : Average document length of the fitted corpus
            
            k1, b, delta   : Formula parameters (b and delta are ignored by variants that do not use them)
            
        Raises:
        -------
            ValueError     : If variant is not one of the supported names
            
        Returns:
        --------
            { ndarray }    : One score per occurrence
        """
        tf          = np.asarray(tf, dtype=float)
        idf         = np.asarray(idf, dtype=float)
        doc_lengths = np.asarray(doc_lengths, dtype=float)
        
        if (variant == "BM25"):
            numerator   = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * doc_lengths / avg_doc_length)
            return idf * numerator / denominator
        
        if (variant == "BM25F"):
            return idf * (tf / (k1 + tf))
        
        if (variant == "BM25L"):
            numerator   = tf + delta
            denominator = tf + delta + k1 * (1 - b + b * doc_lengths / avg_doc_length)
            return idf * numerator / denominator
        
        if (variant == "BM25+"):
            return idf * (tf + delta) / (tf + k1)
        
        if (variant == "BM25T"):
            return idf * (tf * np.log(1 + tf))
        
        raise ValueError(f"Unknown variant: {variant}")
            
            
    def _create_bm25_variant(self, variant: str, k1: float = 1.5, b: float = 0.75, delta: float = 1.0) -> tuple:
        """
        Unified private method to create BM25 variant features.
        
        The term counts come from a default CountVectorizer, so max_features and
        ngram_range of this instance are not applied to the BM25 variants

        Arguments:
        ----------
            variant : Specify the BM25 variant ("BM25", "BM25F", "BM25L", "BM25+", "BM25T")
            k1      : Term frequency saturation parameter (default: 1.5)
            b       : Length normalization parameter (default: 0.75)
            delta   : Free parameter for certain variants (default: 1.0)

        Raises:
        -------
            ValueError : If variant is not a supported name (raised while scoring)

        Returns:
        --------
            { tuple } : Tuple containing:
                        - Custom transformer for the specified BM25 variant
                        - BM25 variant document-term matrix
        """
        print(f"Creating {variant} features...")
        
        # Captured by the local transformer class below
        score_terms = TextFeatureEngineering._bm25_scores

        class BM25VariantTransformer(BaseEstimator, TransformerMixin):
            def __init__(self, k1=1.5, b=0.75, delta=1.0, variant="BM25"):
                self.k1               = k1
                self.b                = b
                self.delta            = delta
                self.variant          = variant
                self.count_vectorizer = CountVectorizer()

            def fit(self, texts, y=None):
                # Learn the vocabulary, the average document length and the per-term IDF
                X                   = self.count_vectorizer.fit_transform(texts)
                n_docs              = X.shape[0]
                self.avg_doc_length = X.sum(axis=1).mean()
                
                # Document frequency: number of documents in which each term occurs
                doc_freq            = np.bincount(X.indices, minlength=X.shape[1])
                self.idf            = np.log((n_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1)
                return self

            def transform(self, texts):
                # COO exposes (row, col, count) triples, so all occurrences are scored in one
                # vectorized call instead of one sparse lookup per non-zero entry
                X           = self.count_vectorizer.transform(texts).tocoo()
                doc_lengths = np.asarray(X.sum(axis=1)).ravel()
                
                scores      = score_terms(variant        = self.variant,
                                          tf             = X.data,
                                          idf            = self.idf[X.col],
                                          doc_lengths    = doc_lengths[X.row],
                                          avg_doc_length = self.avg_doc_length,
                                          k1             = self.k1,
                                          b              = self.b,
                                          delta          = self.delta)

                return csr_matrix((scores, (X.row, X.col)), shape=X.shape)

        transformer = BM25VariantTransformer(k1      = k1, 
                                             b       = b, 
                                             delta   = delta, 
                                             variant = variant)
        
        return self._fit_and_report(transformer, self.texts, variant)

    def create_bm25(self, k1: float = 1.5, b: float = 0.75) -> tuple:
        """
        Create BM25 features.
        """
        return self._create_bm25_variant(variant="BM25", k1=k1, b=b)


    def create_bm25f(self, k1: float = 1.5) -> tuple:
        """
        Create BM25F features.
        """
        return self._create_bm25_variant(variant="BM25F", k1=k1)


    def create_bm25l(self, k1: float = 1.5, b: float = 0.75, delta: float = 1.0) -> tuple:
        """
        Create BM25L features.
        """
        return self._create_bm25_variant(variant="BM25L", k1=k1, b=b, delta=delta)


    def create_bm25_plus(self, k1: float = 1.5, delta: float = 1.0) -> tuple:
        """
        Create BM25+ features.
        """
        return self._create_bm25_variant(variant="BM25+", k1=k1, delta=delta)


    def create_bm25t(self, k1: float = 1.5) -> tuple:
        """
        Create BM25T features.
        """
        return self._create_bm25_variant(variant="BM25T", k1=k1)


    @staticmethod
    def _skipgram_text(text: str, k: int) -> str:
        """
        Rewrite text as space-separated "<word>_<word k+1 positions later>" tokens
        
        Returns an empty string when the text has fewer than k + 2 words
        """
        words = text.split()
        
        return ' '.join(f"{first}_{second}" for first, second in zip(words, words[k + 1:])) if k >= 0 \
               else ' '.join(f"{words[i]}_{words[i + k + 1]}" for i in range(len(words) - k - 1))


    def create_skipgrams(self, k: int = 2) -> tuple:
        """
        Create skipgram features
        
        Arguments:
        ----------
            k { int } : Skip distance (number of words skipped between the two paired words)
            
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted CountVectorizer for skipgrams
                                          - Skipgram document-term matrix
        """
        print("Creating skipgram features...")
        
        processed_texts = [self._skipgram_text(text, k) for text in self.texts]
        vectorizer      = CountVectorizer(max_features = self.max_features)
        
        return self._fit_and_report(vectorizer, processed_texts, "skipgram")
            
            
    @staticmethod
    def _positional_ngram_text(text: str, ngram_range: tuple) -> str:
        """
        Rewrite text as space-separated "pos<i>_<ngram>" tokens, where i is the index of
        the n-gram's first word and the n-gram's words are joined with '_'
        """
        words      = text.split()
        min_n      = ngram_range[0]
        pos_ngrams = list()
        
        for start in range(len(words)):
            # Never let an n-gram run past the end of the text
            stop_n = min(ngram_range[1] + 1, len(words) - start + 1)
            
            for n in range(min_n, stop_n):
                pos_ngrams.append(f"pos{start}_{'_'.join(words[start:start + n])}")
                
        return ' '.join(pos_ngrams)


    def create_positional_ngrams(self) -> tuple:
        """
        Create positional n-gram features
        
        Returns:
        --------
            { tuple } : Tuple containing: - Fitted CountVectorizer for positional n-grams
                                          - Positional n-gram document-term matrix
        """
        print("Creating positional n-gram features...")
        
        processed_texts = [self._positional_ngram_text(text, self.ngram_range) for text in self.texts]
        vectorizer      = CountVectorizer(max_features = self.max_features)
        
        return self._fit_and_report(vectorizer, processed_texts, "positional n-gram")
            
            
    def create_all_features(self) -> dict:
        """
        Create the core feature types: binary, count and frequency bag-of-words, TF-IDF,
        BM25, skipgrams and positional n-grams (standardized TF-IDF and the other BM25
        variants are not included)
        
        Returns:
        --------
            { dict } : Dictionary mapping feature names to their (vectorizer, feature matrix) tuple
        """
        print("Creating all feature types...")
        
        # Insertion order of this mapping is the order in which the features are built
        builders = {'binary_bow'        : self.create_binary_bow,
                    'count_bow'         : self.create_count_bow,
                    'frequency_bow'     : self.create_frequency_bow,
                    'tfidf'             : self.create_tfidf,
                    'bm25'              : self.create_bm25,
                    'skipgrams'         : self.create_skipgrams,
                    'positional_ngrams' : self.create_positional_ngrams
                   }
        
        features = {name: build() for name, build in builders.items()}
        
        print("Created all feature types successfully")
        return features

### Initialize the feature engineering class

In [None]:
# Feature builder over the cleaned reviews: no vocabulary cap, uni-grams up to tri-grams
feature_eng = TextFeatureEngineering(texts        = imdb_ratings_df['clean_text'].tolist(),
                                     max_features = None,
                                     ngram_range  = (1, 3)
                                    )

### Create specific feature types

In [None]:
# Generate the feature blocks used by the model. Each builder call is unpacked into
# (vectorizer / transformer, feature matrix).
# Alternative builders that can be swapped in here (verify they exist on TextFeatureEngineering):
# create_count_bow, create_frequency_bow, create_binary_bow, create_tfidf, create_bm25,
# create_bm25l, create_bm25t
std_tfidf_vectorizer, std_tfidf_features  = feature_eng.create_standardized_tfidf()
bm25f_transformer, bm25f_features         = feature_eng.create_bm25f()
bm25_plus_transformer, bm25_plus_features = feature_eng.create_bm25_plus()
skipgrams_vectorizer, skipgram_features   = feature_eng.create_skipgrams()
pos_ngram_vectorizer, pos_ngram_features  = feature_eng.create_positional_ngrams()

# Combine feature matrices column-wise.
# CSR is requested explicitly: later cells slice columns (combined_features[:, idx]) and scipy's
# hstack may otherwise hand back a COO matrix, which cannot be indexed
combined_features                         = hstack([std_tfidf_features,
                                                    bm25f_features,
                                                    bm25_plus_features,
                                                    skipgram_features,
                                                    pos_ngram_features],
                                                   format = 'csr')

# Combine feature names, in exactly the same block order as the hstack above.
# The BM25 transformers expose their vocabulary through the wrapped count_vectorizer.
# Names are not prefixed per block, so the same token can appear once per block
feature_names                             = (list(std_tfidf_vectorizer.get_feature_names_out()) +
                                             list(bm25f_transformer.count_vectorizer.get_feature_names_out()) +
                                             list(bm25_plus_transformer.count_vectorizer.get_feature_names_out()) +
                                             list(skipgrams_vectorizer.get_feature_names_out()) +
                                             list(pos_ngram_vectorizer.get_feature_names_out())
                                            )

# Every column must have a name, otherwise the selected indices cannot be mapped back to features
assert combined_features.shape[1] == len(feature_names), "feature_names is out of sync with combined_features"


In [None]:
# Or create all feature types at once
# all_features = feature_eng.create_all_features()


## Feature Selection

In [None]:
class TextFeatureSelector:
    """
    A class for implementing various feature selection techniques for text data
    
    Attributes:
    -----------
        X             { spmatrix } : Feature matrix (sparse input is stored in CSR format)
        
        y             { ndarray }  : Target labels

        feature_names { list }     : Names of features
        
        n_features    { int }      : Number of features to select
    """
    
    def __init__(self, X: spmatrix, y: np.ndarray, feature_names: list, n_features: int = None) -> None:
        """
        Initialize TextFeatureSelector with feature matrix and labels
        
        Arguments:
        ----------
            X             : Sparse feature matrix
            
            y             : Target labels
            
            feature_names : List of feature names
            
            n_features    : Number of features to select (default: 10% of features, but at least 1)
            
        Raises:
        -------
            ValueError    : If inputs are invalid or incompatible
        """
        if (X.shape[0] != len(y)):
            raise ValueError("Number of samples in X and y must match")
            
        if (X.shape[1] != len(feature_names)):
            raise ValueError("Number of features must match length of feature_names")
        
        # Several selectors slice columns with self.X[:, idx]. COO matrices (a possible result of
        # scipy's hstack) cannot be indexed, so sparse input is normalised to CSR once here.
        # tocsr() hands back the same object when X is already CSR
        self.X             = X.tocsr() if hasattr(X, 'tocsr') else X
        self.y             = y
        self.feature_names = feature_names
        
        # A falsy n_features (None or 0) means "use the default". The default is 10% of the columns,
        # floored at one feature so that matrices with fewer than 10 columns do not end up with k = 0
        default_n_features = min(X.shape[1], max(1, int(0.1 * X.shape[1])))
        self.n_features    = n_features or default_n_features
        
        
    def _top_k_by_score(self, scores: np.ndarray) -> np.ndarray:
        """
        Rank features by score and keep the best n_features of them
        
        Arguments:
        ----------
            scores { ndarray } : One score per feature (higher is better); may contain NaN
            
        Returns:
        --------
            { ndarray }        : Feature indices ordered from highest to lowest score
        """
        # np.argsort places NaN last in ascending order, so simply reversing it would rank
        # NaN-scored features FIRST. Treat NaN as the worst possible score instead
        sortable_scores = np.where(np.isnan(scores), -np.inf, scores)
        
        return np.argsort(sortable_scores)[::-1][:self.n_features]
        
        
    def chi_square_selection(self) -> tuple:
        """
        Perform chi-square feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices (best first)
                                          - Chi-square scores for all features
        """
        print("Performing chi-square feature selection...")
        
        # Scale features to non-negative for chi-square.
        # NOTE: the matrix is densified before scaling, which materialises
        # n_samples x n_features values in memory
        scaler            = MinMaxScaler()
        X_scaled          = scaler.fit_transform(self.X.toarray())
        
        # Apply chi-square scoring
        selector          = SelectKBest(score_func = chi2, 
                                        k          = self.n_features)
        
        selector.fit(X_scaled, self.y)
        
        # Rank features by score (NaN-safe) and keep the top n_features
        scores            = selector.scores_
        selected_features = self._top_k_by_score(scores)
        
        print(f"Selected {len(selected_features)} features using chi-square")
        
        return selected_features, scores
        
        
    def information_gain_selection(self) -> tuple:
        """
        Perform information gain feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices (best first)
                                          - Information gain scores for all features
        """
        print("Performing information gain selection...")
        
        # Calculate mutual information scores
        selector          = SelectKBest(score_func = mutual_info_classif, 
                                        k          = self.n_features)
        selector.fit(self.X, self.y)
        
        # Rank features by score (NaN-safe) and keep the top n_features
        scores            = selector.scores_
        selected_features = self._top_k_by_score(scores)
        
        print(f"Selected {len(selected_features)} features using information gain")
        
        return selected_features, scores
        
        
    def correlation_based_selection(self, threshold: float = 0.8) -> np.ndarray:
        """
        Perform correlation-based feature selection
        
        For every pair of features whose absolute correlation exceeds the threshold, the feature with
        the lower mutual information with the target is dropped. If more than n_features survive, the
        survivors are ranked by mutual information and only the top n_features are kept
        
        Arguments:
        ----------
            threshold { float } : Correlation threshold for feature removal
            
        Returns:
        --------
               { ndarray }      :  Selected feature indices
        """
        print("Performing correlation-based selection...")
        
        # Convert sparse matrix to dense for correlation calculation.
        # NOTE: this also builds an (n_features x n_features) correlation matrix in memory
        X_dense         = self.X.toarray()
        
        # atleast_2d keeps the single-feature case (where corrcoef returns a scalar) well-formed
        corr_matrix     = np.atleast_2d(np.corrcoef(X_dense.T))
        
        # The matrix is symmetric and its diagonal is self-correlation, so the strict upper
        # triangle already contains every distinct pair exactly once
        high_corr_pairs = np.where(np.triu(np.abs(corr_matrix) > threshold, k = 1))
        
        # Mutual information with the target, computed at most once per feature
        relevance_cache = dict()
        
        def relevance(column: int) -> float:
            if (column not in relevance_cache):
                relevance_cache[column] = mutual_info_score(X_dense[:, column], self.y)
                
            return relevance_cache[column]
        
        # Keep track of features to remove
        to_remove       = set()
        
        for i, j in zip(*high_corr_pairs):
            # A pair is only relevant while both of its members are still kept
            if ((i in to_remove) or (j in to_remove)):
                continue
            
            # Remove the feature that tells us less about the target
            if (relevance(i) < relevance(j)):
                to_remove.add(i)
                
            else:
                to_remove.add(j)
        
        # Surviving features, in ascending index order
        keep_mask = np.ones(self.X.shape[1], dtype = bool)
        
        if to_remove:
            keep_mask[list(to_remove)] = False
            
        selected_features = np.flatnonzero(keep_mask)
        
        # Select top k features if more than n_features remain
        if (len(selected_features) > self.n_features):
            mi_scores         = mutual_info_classif(self.X[:, selected_features], self.y)
            selected_features = selected_features[self._top_k_by_score(mi_scores)]
        
        print(f"Selected {len(selected_features)} features using correlation-based selection")
        
        return selected_features
        
        
    def recursive_feature_elimination(self, estimator = None, cv: int = 5) -> tuple:
        """
        Perform Recursive Feature Elimination with cross-validation
        
        Arguments:
        ----------
            estimator  : Classifier to use (default: LogisticRegression)
            
            cv         : Number of cross-validation folds
            
        Returns:
        --------
            { tuple }  : Tuple containing: - Selected feature indices
                                           - Feature importance rankings
        """
        print("Performing recursive feature elimination...")
        
        # Use logistic regression if no estimator provided
        if (estimator is None):
            estimator = LogisticRegression(max_iter=1000)
        
        # n_features acts as the lower bound; RFECV may keep more features than that
        selector = RFECV(estimator              = estimator,
                         min_features_to_select = self.n_features,
                         cv                     = cv,
                         n_jobs                 = -1)
        
        selector.fit(self.X, self.y)
        
        # Get selected features and rankings
        selected_features = np.where(selector.support_)[0]
        rankings          = selector.ranking_
        
        print(f"Selected {len(selected_features)} features using RFE")
        
        return selected_features, rankings
        
        
    def forward_selection(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform forward feature selection
        
        Starting from an empty set, each round adds the single remaining feature that gives the best
        mean cross-validated accuracy, until n_features have been chosen
        
        Arguments:
        ----------
            estimator : Classifier to use (default: LogisticRegression)
            
            cv        : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices, in the order they were added
        """
        print("Performing forward selection...")
        
        if (estimator is None):
            estimator = LogisticRegression(max_iter=1000)
        
        selected_features  = list()
        remaining_features = list(range(self.X.shape[1]))
        
        for _ in tqdm(range(self.n_features)):
            best_score   = -np.inf
            best_feature = None
            
            # Try adding each remaining feature
            for feature in remaining_features:
                current_features = selected_features + [feature]
                X_subset         = self.X[:, current_features]
                
                # Calculate cross-validation score
                scores = cross_val_score(estimator, 
                                         X_subset, 
                                         self.y,
                                         cv      = cv, 
                                         scoring = 'accuracy')
                
                avg_score = np.mean(scores)
                
                if (avg_score > best_score):
                    best_score   = avg_score
                    best_feature = feature
            
            # Nothing left to add (candidates exhausted or none could be scored): further rounds
            # cannot change the result, so stop instead of repeating the search
            if (best_feature is None):
                break
                
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            
        print(f"Selected {len(selected_features)} features using forward selection")
        
        return np.array(selected_features)
        
        
    def backward_elimination(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform backward feature elimination
        
        Starting from all features, each round drops the single feature whose removal gives the best
        mean cross-validated accuracy, until only n_features remain
        
        Arguments:
        ----------
            estimator    : Classifier to use (default: LogisticRegression)
            
            cv           : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices
            
        Raises:
        -------
            RuntimeError : If no candidate subset could be scored in a round
        """
        print("Performing backward elimination...")
        
        if (estimator is None):
            estimator = LogisticRegression(max_iter=1000)
        
        remaining_features = list(range(self.X.shape[1]))
        
        while len(remaining_features) > self.n_features:
            best_score    = -np.inf
            worst_feature = None
            
            # Try removing each feature
            for feature in remaining_features:
                current_features = [f for f in remaining_features if f != feature]
                X_subset         = self.X[:, current_features]
                
                # Calculate cross-validation score
                scores = cross_val_score(estimator, 
                                         X_subset, 
                                         self.y,
                                         cv      = cv, 
                                         scoring = 'accuracy')
                
                avg_score = np.mean(scores)
                
                # The feature whose removal scores best is the least useful one
                if (avg_score > best_score):
                    best_score    = avg_score
                    worst_feature = feature
            
            # Without a removable feature the set would never shrink and this loop would spin forever
            if (worst_feature is None):
                raise RuntimeError("Backward elimination could not score any candidate feature subset")
                
            remaining_features.remove(worst_feature)
        
        print(f"Selected {len(remaining_features)} features using backward elimination")
        return np.array(remaining_features)
            

### Initialize the feature selector

In [None]:
# Wrap the combined feature matrix and the sentiment labels in a selector.
# n_features = None makes the selector fall back to its default of 10% of the columns
selector = TextFeatureSelector(X             = combined_features,
                               y             = imdb_ratings_df['sentiment'].values,
                               feature_names = feature_names,
                               n_features    = None,
                              )


### Perform Feature Selection

In [None]:
# Only chi-square selection is active in this cell; the other strategies are kept as
# commented-out alternatives. The active call returns the selected column indices
# and the chi-square score of every feature

# Chi-Square Selection
chi_square_features, chi_square_scores = selector.chi_square_selection()

# Information Gain Selection
#ig_features, ig_scores                 = selector.information_gain_selection()

# Correlation-Based Selection
#corr_features                          = selector.correlation_based_selection()

# Recursive Feature Elimination
#rfe_features, rfe_rankings             = selector.recursive_feature_elimination()

# Forward Selection
#forward_features                       = selector.forward_selection()

# Backward Elimination
#backward_features                      = selector.backward_elimination()


### Get selected features matrix

In [None]:
# Keep only the columns picked by the active selector. chi_square_features is ordered by
# descending chi-square score, so the columns of the result follow that ranking.
# The commented lines are the equivalents for the other (currently disabled) selectors
selected_combined_features  = combined_features[:, chi_square_features]

#selected_combined_features = combined_features[:, ig_features]

#selected_combined_features = combined_features[:, corr_features]

#selected_combined_features = combined_features[:, rfe_features]

#selected_combined_features = combined_features[:, forward_features]

#selected_combined_features = combined_features[:, backward_features]

In [None]:
# Show the repr of the selected feature matrix as a quick sanity check
selected_combined_features

## Sentiment Analysis Model Fitting

In [None]:
class SentimentAnalyzer:
    """
    A class for training and evaluating sentiment analysis models, including testing on unseen data
    """

    def __init__(self, X, y, feature_eng, vectorizers, selected_feature_indices, test_size=0.2, random_state=42):
        """
        Initialize the SentimentAnalyzer by splitting the data.

        Arguments:
        ----------
            X                        : Feature matrix (sparse matrix or ndarray)
            
            y                        : Target labels (array-like)
            
            feature_eng              : Instance of TextFeatureEngineering (stored, not used by this class)
            
            vectorizers              : Sequence of fitted vectorizers / transformers, in the same order as
                                       the feature blocks were stacked before feature selection
            
            selected_feature_indices : Indices of selected features after feature selection
            
            test_size                : Proportion of data to use for testing (default: 0.2)
            
            random_state             : Random seed for reproducibility
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, 
                                                                                y, 
                                                                                test_size    = test_size, 
                                                                                random_state = random_state)
        
        self.feature_eng                                     = feature_eng
        self.vectorizers                                     = vectorizers
        self.selected_feature_indices                        = selected_feature_indices

        
    def train_model(self, model_type:str = "logistic_regression", kernel=None, **kwargs):
        """
        Train a sentiment analysis model.

        Arguments:
        ----------
            model_type { str } : Type of model to train: "logistic_regression", "svm", "random_forest",
                                 "naive_bayes", "lightgbm" or "logistic_model_tree"
            
            kernel     { str } : Kernel type for SVM (e.g., "linear", "poly", "rbf", "sigmoid");
                                 defaults to "rbf" and is ignored by every other model type
            
            kwargs             : Additional arguments for the model initialization

        Returns:
        --------
            Trained model
            
        Raises:
        -------
            ValueError         : If model_type is not one of the supported names
        """
        # Lazy builders: only the requested estimator is instantiated
        model_builders = {# max_iter defaults to 1000 but can be overridden through kwargs
                          "logistic_regression" : lambda: LogisticRegression(**{"max_iter": 1000, **kwargs}),
                          "svm"                 : lambda: SVC(kernel = ("rbf" if kernel is None else kernel), **kwargs),
                          "random_forest"       : lambda: RandomForestClassifier(**kwargs),
                          "naive_bayes"         : lambda: MultinomialNB(**kwargs),
                          "lightgbm"            : lambda: LGBMClassifier(**kwargs),
                          # NOTE: despite its name this option builds a plain DecisionTreeClassifier
                          "logistic_model_tree" : lambda: DecisionTreeClassifier(**kwargs),
                         }
        
        if (model_type not in model_builders):
            supported = ", ".join(f"'{name}'" for name in model_builders)
            raise ValueError(f"Unsupported model_type. Choose from: {supported}")
        
        model = model_builders[model_type]()

        print(f"Training {model_type}...")
        model.fit(self.X_train, self.y_train)
        
        return model

    
    def evaluate_model(self, model):
        """
        Evaluate a trained model on the test set

        Arguments:
        ----------
            model : Trained model

        Returns:
        --------
            Dictionary containing evaluation metrics: "accuracy", "classification_report" (text) 
            and "confusion_matrix"
        """
        print("Evaluating model...")
        y_pred   = model.predict(self.X_test)

        accuracy = accuracy_score(self.y_test, y_pred)
        report   = classification_report(self.y_test, y_pred)
        cm       = confusion_matrix(self.y_test, y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
        print("Confusion Matrix:")
        print(cm)

        return {"accuracy"              : accuracy,
                "classification_report" : report,
                "confusion_matrix"      : cm,
               }

    
    def test_on_unseen_data(self, model, unseen_texts):
        """
        Test the model on unseen data

        Every vectorizer in self.vectorizers transforms the texts, the resulting blocks are stacked
        column-wise in that order, and the columns in self.selected_feature_indices are fed to the model.
        
        NOTE(review): the texts are passed to the vectorizers exactly as given. Any cleaning or n-gram
        pre-processing applied to the training texts before fitting is NOT repeated here - confirm that
        callers pass texts in the same form the vectorizers were fitted on

        Arguments:
        ----------
            model         : Trained model
            
            unseen_texts  : List of unseen text data (a single string is treated as one text)

        Returns:
        --------
            Predictions for the unseen data (an empty array when no texts are given)
            
        Raises:
        -------
            IndexError    : If a selected feature index lies outside the stacked feature matrix
        """
        print("Processing unseen data...")
        
        # A bare string would otherwise be rejected by the vectorizers; materialising the
        # iterable also lets it be traversed again when printing the results
        if isinstance(unseen_texts, str):
            unseen_texts = [unseen_texts]
            
        else:
            unseen_texts = list(unseen_texts)
            
        if (len(unseen_texts) == 0):
            print("No unseen texts supplied; nothing to predict")
            return np.array([])

        # Transform with every vectorizer, in the order used when the training matrix was built
        feature_blocks           = [vectorizer.transform(unseen_texts) for vectorizer in self.vectorizers]

        # Combine features; CSR is required for the column selection below
        unseen_combined_features = hstack(feature_blocks, format = 'csr')

        # Fail with a readable message when the vectorizers do not reproduce the training layout
        selected_indices         = np.asarray(self.selected_feature_indices)
        n_columns                = unseen_combined_features.shape[1]
        
        if (np.issubdtype(selected_indices.dtype, np.integer) and selected_indices.size and (selected_indices.max() >= n_columns)):
            raise IndexError(f"Selected feature index {selected_indices.max()} is out of range for the "
                             f"{n_columns} columns produced by the vectorizers")

        # Select features using the indices chosen during feature selection
        unseen_selected_features = unseen_combined_features[:, selected_indices]

        # Predict sentiments
        predictions              = model.predict(unseen_selected_features)

        # Print predictions
        print("Predictions on Unseen Data:")
        for text, pred in zip(unseen_texts, predictions):
            print(f"Text: {text}\nPredicted Sentiment: {pred}\n")

        return predictions


### Initialize the analyzer

In [None]:
# Train / evaluate on the chi-square-selected columns of the combined feature matrix.
# `vectorizers` lists the fitted transformers in the same order as their blocks were stacked into
# combined_features, and `selected_feature_indices` are the columns chosen by the chi-square selection
analyzer = SentimentAnalyzer(X                        = selected_combined_features, 
                             y                        = imdb_ratings_df["sentiment"].values,
                             feature_eng              = feature_eng,
                             vectorizers              = (std_tfidf_vectorizer,
                                                         bm25f_transformer,
                                                         bm25_plus_transformer,
                                                         skipgrams_vectorizer,
                                                         pos_ngram_vectorizer),
                             selected_feature_indices = chi_square_features)


In [10]:
# Hand-written reviews with their expected sentiment, kept as (text, label) pairs so that the
# two lists derived below can never drift out of alignment.
# NOTE: the original label list carried one extra trailing "positive" that had no matching
# review (49 texts vs 50 labels); that orphan label has been dropped
labeled_test_reviews = [("This movie is speedy enough with plot twists, but hard to understand the connection between plots.", "negative"),
                        ("Seriously, this is the best movie I've ever watched! Everything was flawless!", "positive"),
                        ("The storyline was okay, but the acting was just not up to the mark.", "positive"),
                        ("A complete disaster of a movie. Don't waste your time.", "negative"),
                        ("I can't believe how amazing this was. Totally worth it!", "positive"),
                        ("The movie had its moments, but overall, it felt like something was missing.", "positive"),
                        ("I absolutely loved the cinematography, but the acting was subpar.", "positive"),
                        ("The film is an excellent example of how not to make a movie.", "negative"),
                        ("It's hard to imagine how anyone could dislike this masterpiece!", "positive"),
                        ("The trailer was better than the actual movie. Felt cheated.", "negative"),
                        ("A rollercoaster of emotions! Highly recommend watching this.", "positive"),
                        ("An average movie with nothing new to offer.", "positive"),
                        ("The pacing was terrible, and the climax was predictable.", "negative"),
                        ("Wow, just wow. This is how a movie should be made!", "positive"),
                        ("A decent watch for a lazy weekend. Not groundbreaking, but enjoyable.", "positive"),
                        ("The director has outdone themselves; what a phenomenal movie!", "positive"),
                        ("More hype than substance. A complete letdown.", "negative"),
                        ("Good visuals, decent music, but lacked a solid script.", "negative"),
                        ("A masterpiece in every sense. This will stay with me forever.", "positive"),
                        ("Mediocre at best. Not worth the ticket price.", "negative"),
                        ("A fresh take on a tired genre. Highly recommend it!", "positive"),
                        ("Overrated and boring. Nothing special about it.", "negative"),
                        ("This is one of those movies you'll regret missing. A must-watch!", "positive"),
                        ("Predictable plot, but the performances were top-notch.", "positive"),
                        ("It's a bad movie if you're looking for entertainment.", "negative"),
                        ("Can't believe I sat through the entire thing. A waste of time.", "negative"),
                        ("Finally, a movie that gets it right. Loved every minute of it!", "positive"),
                        ("A forgettable movie with no real impact.", "negative"),
                        ("An extraordinary journey that left me speechless. Bravo!", "positive"),
                        ("The humor was forced, and the dialogue was cringeworthy.", "negative"),
                        ("A solid movie with a gripping narrative. Well done!", "positive"),
                        ("The music was fantastic, but the rest of the movie was average.", "positive"),
                        ("Ironic how they managed to make something so beautiful look so bland.", "negative"),
                        ("An epic conclusion to a fantastic series. Couldn’t have been better!", "positive"),
                        ("The movie tries too hard to be funny and fails miserably.", "negative"),
                        ("A fresh and engaging story with relatable characters.", "positive"),
                        ("All style, no substance. Disappointing.", "negative"),
                        ("A breath of fresh air! One of the best movies this year.", "positive"),
                        ("The plot was all over the place, but it was fun to watch.", "positive"),
                        ("Couldn't make it through the first half. Painful to sit through.", "negative"),
                        ("An unexpectedly beautiful film that touched my heart.", "positive"),
                        ("Trying to understand why this movie exists is more entertaining than the movie itself.", "negative"),
                        ("Every second of this movie was a blessing. Pure cinematic joy.", "positive"),
                        ("The lead actor was the only saving grace in an otherwise dull film.", "positive"),
                        ("A pretentious attempt at storytelling that falls flat.", "negative"),
                        ("I didn’t expect much, but this movie surprised me in the best way.", "positive"),
                        ("A series of poorly executed clichés masquerading as a story.", "negative"),
                        ("This is not just a movie; it’s an experience. Brilliant!", "positive"),
                        ("A slog of a movie with a laughably bad ending.", "negative"),
                       ]

# Parallel lists consumed by the prediction and comparison cells below
test_data  = [review_text for review_text, _ in labeled_test_reviews]
sentiments = [review_label for _, review_label in labeled_test_reviews]

assert len(test_data) == len(sentiments), "every test review needs exactly one label"


In [14]:
# Reload the review texts from the CSV copy of the test set and display them.
# The stray `pd.to_csv(pat)` line was removed: pandas has no module-level to_csv and `pat` is undefined
x = list(pd.read_csv('../data/test_data.csv')['Text'])
x

['This movie is speedy enough with plot twists, but hard to understand the connection between plots.',
 "Seriously, this is the best movie I've ever watched! Everything was flawless!",
 'The storyline was okay, but the acting was just not up to the mark.',
 "A complete disaster of a movie. Don't waste your time.",
 "I can't believe how amazing this was. Totally worth it!",
 'The movie had its moments, but overall, it felt like something was missing.',
 'I absolutely loved the cinematography, but the acting was subpar.',
 'The film is an excellent example of how not to make a movie.',
 "It's hard to imagine how anyone could dislike this masterpiece!",
 'The trailer was better than the actual movie. Felt cheated.',
 'A rollercoaster of emotions! Highly recommend watching this.',
 'An average movie with nothing new to offer.',
 'The pacing was terrible, and the climax was predictable.',
 'Wow, just wow. This is how a movie should be made!',
 'A decent watch for a lazy weekend. Not groundb

### Train a logistic regression model

In [None]:
# Fit a LogisticRegression (max_iter = 1000) on the analyzer's training split
logistic_model = analyzer.train_model(model_type = "logistic_regression")


### Evaluate the logistic regression model

In [None]:
# Prints accuracy, classification report and confusion matrix for the held-out split
# and keeps the same metrics as a dict
evaluation_results = analyzer.evaluate_model(logistic_model)


### Predict using the trained model

In [None]:
# Run the fitted model over the hand-written reviews in test_data; each prediction is
# printed next to its text and the full array of predictions is returned
logistic_predictions = analyzer.test_on_unseen_data(model        = logistic_model, 
                                                    unseen_texts = test_data)


In [None]:
# Side-by-side table of review text, expected label and predicted label.
# NOTE(review): the orient = 'index' + transpose construction appears to be used so that lists of
# unequal length are padded instead of raising - confirm that test_data, sentiments and the
# predictions all have the same length, otherwise rows near the end may be misaligned
all_test_data = {'texts' : test_data, 'true_labels' : sentiments, 'predicted_labels' : list(logistic_predictions)}

total_test_df = pd.DataFrame.from_dict(data   = all_test_data, 
                                       orient = 'index').T

total_test_df

### Train an SVM model with linear kernel

In [None]:
# Fit an SVC with a linear kernel on the analyzer's training split
svm_model_linear = analyzer.train_model(model_type = "svm", 
                                        kernel     = "linear")


### Evaluate the SVM with linear kernel

In [None]:
# Evaluate the linear-kernel SVM on the held-out split (metrics are printed and kept as a dict)
svm_evaluation_results = analyzer.evaluate_model(svm_model_linear)


### Predict using the trained model

In [None]:
# Predict the sentiment of every hand-written review in test_data with the linear-kernel SVM
svm_linear_predictions = analyzer.test_on_unseen_data(model        = svm_model_linear, 
                                                      unseen_texts = test_data)


### Train an SVM model with RBF kernel

In [None]:
# Fit an SVC with an RBF kernel; "rbf" is also what train_model falls back to when no kernel is given
svm_model_rbf    = analyzer.train_model(model_type = "svm", 
                                        kernel     = "rbf")

### Evaluate the SVM with RBF kernel

In [None]:
# Evaluate the RBF-kernel SVM on the held-out split; the returned metrics dict is not stored,
# so it is shown as this cell's output
analyzer.evaluate_model(svm_model_rbf)


### Predict using the trained model

In [None]:
# SentimentAnalyzer has no `predict` method: inference on raw text goes through
# test_on_unseen_data, which expects a list of texts, so the single review is wrapped in a list
svm_rbf_predictions = analyzer.test_on_unseen_data(model        = svm_model_rbf, 
                                                   unseen_texts = [test_data[2]])
