In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Read the preprocessed dataset from the processed-data directory into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='../data/processed/cleaned_tokenized_sentiment140.csv',
)

# Clean and prepare the tokens
def clean_tokens(tokens):
    """Normalise one cell of the token column to a single text string.

    Args:
        tokens: Either an already-joined string, a missing value
            (``None`` / ``NaN`` / ``pd.NA``), another scalar, or an iterable
            of individual tokens.

    Returns:
        str: The input unchanged when it is already a string; ``''`` for a
        missing value; ``str(tokens)`` for any other scalar; otherwise the
        non-missing tokens converted to ``str`` and joined with single
        spaces.
    """
    if isinstance(tokens, str):
        return tokens
    # Only test scalars for missingness: on a list/array ``pd.isna`` returns
    # an element-wise boolean array, and using that array as an ``if``
    # condition raises ValueError instead of reaching the join below.
    if pd.api.types.is_scalar(tokens):
        return '' if pd.isna(tokens) else str(tokens)
    return ' '.join(str(token) for token in tokens if pd.notna(token))

# Normalise every row of the token column to a single text string
df['cleaned_tokens'] = df['tokens'].apply(clean_tokens)

# Create additional features
# tweet_length is the character count of the cleaned text
df['tweet_length'] = df['cleaned_tokens'].apply(len)

# Get the target variable
y = df['target'].values

# Decide the train/test partition BEFORE fitting anything on the data.
# Without stratification, train_test_split chooses rows from the sample count
# and random_state only, so splitting the row positions yields the same
# partition as splitting the feature matrix with the same arguments.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# TF-IDF Vectorization
# Learn the vocabulary and IDF weights from the training rows only, so no
# statistics from the held-out rows leak into the features; then apply the
# fitted transform to every row.
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(df['cleaned_tokens'].iloc[train_idx])
X = tfidf.transform(df['cleaned_tokens'])

# Combine TF-IDF features with additional features
# NOTE: toarray() turns the sparse TF-IDF matrix into a dense one, which the
# .npy output below requires; memory grows as n_rows x n_features.
X_combined = np.hstack((X.toarray(), df[['tweet_length']].values))

# Split the data using the partition computed above
X_train, X_test = X_combined[train_idx], X_combined[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Handle any remaining NaN values
# The imputer's column means are fitted on the training split only and then
# reused for the test split.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Save the features and labels
np.save('../data/processed/X_train.npy', X_train)
np.save('../data/processed/X_test.npy', X_test)
np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/y_test.npy', y_test)

print("Feature engineering completed and data saved.")

Feature engineering completed and data saved.
