In [2]:
import numpy as np
import pandas as pd

class TextDataProcessor:
    """Load a CSV with a 'Text' column and emotion label columns, then clean it.

    Typical usage is ``load_data()`` followed by ``preprocess_text()``,
    ``convert_emotions_to_binary()`` and, optionally, ``apply_tfidf()``.
    Every step other than ``load_data()`` operates on ``self.df`` and raises
    ``RuntimeError`` if no data has been loaded yet.
    """

    # (regex, replacement) pairs applied in order to the lower-cased text.
    # Order matters: URLs, mentions and hashtags must be removed before the
    # punctuation pass strips the ':', '/', '@' and '#' that identify them.
    _TEXT_SUBSTITUTIONS = (
        (r'http\S+', ''),    # URLs
        (r'@\w+', ''),       # @mentions
        (r'#\w+', ''),       # #hashtags (the whole tag, not just the '#')
        (r'[^\w\s]', ''),    # remaining punctuation / symbols
        (r'\s+', ' '),       # collapse whitespace runs left by the removals
    )

    def __init__(self, file_path):
        """Remember where the CSV lives; nothing is read until ``load_data()``.

        Args:
            file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        """
        self.file_path = file_path
        # Populated by load_data(); None means "not loaded yet".
        self.df = None
        # Label columns turned into 0/1 by convert_emotions_to_binary().
        self.emotion_columns = [
            'Anger yes / no', 'Anticipation yes / no', 'Disgust yes / no',
            'Fear yes / no', 'Joy yes / no', 'Sadness yes / no',
            'Surprise yes / no', 'Trust yes / no'
        ]

    def _require_data(self):
        """Return ``self.df``, raising a clear error if it has not been set."""
        if self.df is None:
            raise RuntimeError(
                "No data loaded: call load_data() before this step."
            )
        return self.df

    def load_data(self):
        """Read the CSV into ``self.df``, mapping the '---' placeholder to ``pd.NA``."""
        raw = pd.read_csv(self.file_path)
        # Non-inplace replace: same result, but self.df is only ever bound to
        # the fully prepared frame.
        self.df = raw.replace('---', pd.NA)

    def preprocess_text(self):
        """Normalise ``self.df['Text']`` in place.

        Lower-cases the text, removes URLs, @mentions, #hashtags and
        punctuation, collapses whitespace runs to single spaces and strips
        leading/trailing whitespace. Missing values stay missing.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        text = data['Text'].str.lower()
        for pattern, replacement in self._TEXT_SUBSTITUTIONS:
            text = text.str.replace(pattern, replacement, regex=True)
        data['Text'] = text.str.strip()

    def convert_emotions_to_binary(self):
        """Replace each emotion column with 1 (value present) or 0 (missing).

        The conversion is idempotent: a column that already holds only
        integer 0/1 values is left untouched. Without this check a second
        call (for example when a notebook cell is re-run) would see every 0
        as "not missing" and turn all labels into 1.

        Raises:
            RuntimeError: If no data has been loaded.
            KeyError: If an expected emotion column is absent from the data.
        """
        data = self._require_data()
        for column in self.emotion_columns:
            values = data[column]
            already_binary = (
                pd.api.types.is_integer_dtype(values)
                and values.isin([0, 1]).all()
            )
            if already_binary:
                continue
            data[column] = values.notna().astype(int)

    def apply_tfidf(self):
        """Build a dense TF-IDF document-term table from ``self.df['Text']``.

        Tokens are the whitespace-separated words of each text, so this is
        normally called after ``preprocess_text()``. For ``n`` documents and a
        term appearing in ``df_t`` of them, the weight is
        ``count * (ln((1 + n) / (1 + df_t)) + 1)`` and each row is then scaled
        to unit Euclidean length. Rows whose text is missing or empty count
        towards ``n`` and come out as all zeros.

        The result is dense (documents x vocabulary), so memory grows with
        both; for large corpora a sparse representation is preferable.

        Returns:
            pandas.DataFrame: One row per row of ``self.df`` (same index), one
            float column per distinct token, columns sorted alphabetically.
            ``self.df`` itself is not modified.

        Raises:
            RuntimeError: If no data has been loaded.
        """
        data = self._require_data()
        # Positional document ids keep the computation correct even when the
        # frame's own index contains duplicates.
        texts = data['Text'].fillna('').astype(str).reset_index(drop=True)
        n_docs = len(texts)

        # One row per (document, token); documents without tokens explode to
        # a single NaN, which dropna() discards.
        tokens = texts.str.split().explode().dropna()
        if tokens.empty:
            return pd.DataFrame(index=data.index)

        pairs = pd.DataFrame({
            'doc': tokens.index.to_numpy(),
            'term': tokens.to_numpy(),
        })
        counts = (
            pairs.groupby(['doc', 'term'])
            .size()
            .unstack(fill_value=0)
            .reindex(pd.RangeIndex(n_docs), fill_value=0)
        )

        doc_freq = (counts > 0).sum(axis=0)
        idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1.0
        weighted = counts.mul(idf, axis=1)

        norms = np.sqrt(weighted.pow(2).sum(axis=1))
        # Avoid 0/0 for documents with no tokens; their rows stay all-zero.
        norms = norms.where(norms > 0, 1.0)
        tfidf = weighted.div(norms, axis=0)

        tfidf.index = data.index
        tfidf.columns.name = None
        return tfidf

    def get_data(self):
        """Return the current DataFrame, or ``None`` if nothing was loaded."""
        return self.df


In [None]:
# Run the preprocessing pipeline on the training CSV.
# NOTE(review): hard-coded absolute path; the file must exist at this location.
file_path = '/mnt/data/train_3.csv'
processor = TextDataProcessor(file_path)
# load_data() must run first: the following steps read processor.df,
# which is None until the CSV has been loaded.
processor.load_data()
processor.preprocess_text()
processor.convert_emotions_to_binary()
# Requires TextDataProcessor to provide apply_tfidf(); raises AttributeError otherwise.
tfidf_df = processor.apply_tfidf()

# Last expression of the cell: shows the first rows as the cell output.
tfidf_df.head()