# In [1]:
import pandas as pd

def reduce_twitter_dataset(input_file, output_file, sample_size=50000, random_state=42):
    """
    Write a smaller, class-balanced sample of the Sentiment140 dataset.

    The input CSV is expected to have no header row and six columns:
    0: target (sentiment), 1: id, 2: date, 3: flag (query), 4: user, 5: text.
    Only the sentiment label and the tweet text are kept. The output CSV has
    a header row with the columns ``sentiment`` and ``text``.

    Args:
        input_file: Path (or file-like object) of the source CSV.
        output_file: Path (or file-like object) the reduced CSV is written to.
        sample_size: Total number of rows requested. Half are drawn from the
            negative class (label 0) and half from the positive class
            (label 4), so an odd value yields ``sample_size - 1`` rows.
        random_state: Seed used for both the per-class sampling and the final
            shuffle, so repeated runs produce the same file.

    Raises:
        ValueError: If either class contains fewer than ``sample_size // 2``
            rows.
    """
    cols = ["sentiment", "id", "date", "query", "user", "text"]
    wanted = ["sentiment", "text"]

    print("Reading dataset in chunks...")
    # Using 'latin-1' encoding as this dataset often has special characters.
    # usecols avoids parsing the four columns that are never used.
    chunks = pd.read_csv(
        input_file,
        encoding='latin-1',
        header=None,
        names=cols,
        usecols=wanted,
        chunksize=100000,
    )

    # The dataset uses 0=negative, 2=neutral (rare), 4=positive. Dropping every
    # other label per chunk keeps the accumulated frame as small as possible.
    df_list = [
        chunk.loc[chunk['sentiment'].isin((0, 4)), wanted]
        for chunk in chunks
    ]
    full_df = pd.concat(df_list)

    per_class = sample_size // 2
    negative_pool = full_df[full_df['sentiment'] == 0]
    positive_pool = full_df[full_df['sentiment'] == 4]

    # Fail with an explicit message instead of pandas' generic
    # "larger sample than population" error.
    for label, pool in (("negative (0)", negative_pool), ("positive (4)", positive_pool)):
        if len(pool) < per_class:
            raise ValueError(
                f"Cannot draw {per_class} {label} rows: "
                f"only {len(pool)} available in the input."
            )

    # Stratified sampling: ensure a 50/50 split of positive and negative rows.
    negative_tweets = negative_pool.sample(n=per_class, random_state=random_state)
    positive_tweets = positive_pool.sample(n=per_class, random_state=random_state)

    # Shuffle with the same seed so the output order is reproducible too.
    reduced_df = (
        pd.concat([negative_tweets, positive_tweets])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    reduced_df.to_csv(output_file, index=False)
    print(f"Success! Reduced file saved to {output_file} with {len(reduced_df)} rows.")

# Usage
# reduce_twitter_dataset('training.1600000.processed.noemoticon.csv', 'reduced_sentiment.csv')