Import Necessary Libraries

In [5]:
import pandas as pd
import re

In [6]:
# Step 1: Load the data
def load_data(filepath):
    """
    Read a CSV source into a DataFrame.

    The argument is handed straight to ``pandas.read_csv`` with default
    parsing options.

    :param filepath: Location of the CSV file to read.
    :return: pandas DataFrame holding the parsed contents.
    """
    frame = pd.read_csv(filepath_or_buffer=filepath)
    return frame

# Step 2: Preprocess the data
def preprocess_data(df):
    """
    Drop incomplete rows and renumber the remaining ones.

    Both steps modify ``df`` in place; the same object is returned for
    convenience, so the caller's DataFrame is changed as well.

    :param df: pandas DataFrame to clean.
    :return: The same DataFrame, without rows containing missing values
             and with a fresh 0..n-1 index.
    """
    # Discard every row that has at least one missing value.
    df.dropna(axis="index", inplace=True)
    # Dropping rows leaves gaps in the index; rebuild it and discard the old one.
    df.reset_index(inplace=True, drop=True)
    return df

# Step 3: Strip URLs, usernames, and non-letter characters from the text
# Patterns are compiled once at import time so per-row calls reuse them.
# A URL is either an explicit http(s):// scheme or a word starting with "www.".
# Anchoring on "://" and "www." keeps ordinary words such as "awwww" or
# "httpd" from being mistaken for links.
_URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+')
_USERNAME_PATTERN = re.compile(r'@\w+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """
    Clean the text by removing URLs, @usernames, and non-letter characters.

    Steps are applied in order: URLs first (so their punctuation does not
    leave letter fragments behind), then @mentions, then every character
    that is not an ASCII letter or whitespace, and finally runs of
    whitespace are collapsed to single spaces and the ends are trimmed.

    :param text: Original text (a ``str``).
    :return: Cleaned text containing only ASCII letters separated by
             single spaces.
    """
    # Remove URLs
    text = _URL_PATTERN.sub('', text)
    # Remove usernames
    text = _USERNAME_PATTERN.sub('', text)
    # Remove special characters, numbers, punctuations
    text = _NON_LETTER_PATTERN.sub('', text)
    # Remove extra spaces
    text = _WHITESPACE_PATTERN.sub(' ', text).strip()
    return text

def clean_data(df, text_column):
    """
    Run clean_text over every value of one column.

    The column is overwritten on ``df`` itself, and that same DataFrame is
    returned.

    :param df: pandas DataFrame containing the dataset.
    :param text_column: Name of the column whose values should be cleaned.
    :return: The same DataFrame with ``text_column`` replaced by its
             cleaned values.
    """
    cleaned_values = df[text_column].apply(clean_text)
    df[text_column] = cleaned_values
    return df

# Step 4: Save the cleaned data
def save_data(df, output_filepath):
    """
    Write a DataFrame out as CSV.

    The index is not written, so the output contains only the DataFrame's
    columns.

    :param df: pandas DataFrame to write.
    :param output_filepath: Destination for the CSV output.
    """
    df.to_csv(path_or_buf=output_filepath, index=False)





In [7]:
# Main function to execute the steps
def main():
    """
    Run the full pipeline: load, drop incomplete rows, clean text, save.

    Reads 'labeled_data.csv', cleans its 'tweet' column, and writes the
    result to 'cleaned_data.csv'.
    """
    source_csv = 'labeled_data.csv'
    target_csv = 'cleaned_data.csv'
    column_to_clean = 'tweet'  # Change this if the text lives in another column

    # Load, then drop incomplete rows, then scrub the text column.
    raw_frame = load_data(source_csv)
    complete_frame = preprocess_data(raw_frame)
    cleaned_frame = clean_data(complete_frame, column_to_clean)

    # Persist the result.
    save_data(cleaned_frame, target_csv)

# Run the pipeline only when this file is executed as a script; importing
# it as a module defines the functions without calling main().
if __name__ == "__main__":
    main()