In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.linear_model import LogisticRegression # Alternative classifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [7]:
# --- Configuration ---
# Every tunable value for the notebook lives in this cell.

# Location of the input CSV -- change this to point at your own file.
CSV_FILE_PATH = 'datasets/labeled_stock_tweets_with_historical_prices.csv'

# Name of the column the classifier is trained to predict.
TARGET_COLUMN = 'stock_label'

# Columns passed to the model unchanged. The text column ('Cleaned_Tweet') and
# the categorical columns ('Stock Name', 'Company Name') are currently not used.
# NOTE(review): the *_day_price columns look like prices observed after the
# tweet, and TARGET_COLUMN may have been derived from them -- if so, this is
# target leakage and the reported scores are optimistic. Confirm how the
# target was built before trusting the evaluation.
# NOTE(review): 'Label' appears to be a -1/0/1 code rather than a continuous
# quantity -- confirm it is meant to be treated as numeric.
NUMERICAL_FEATURES = [
    'one_day_price',
    'two_day_price',
    'three_day_price',
    'historical_price',
    'Label',
]

# Fraction of rows held out for testing (0.2 == 20%).
TEST_SIZE = 0.2

# Seed shared by every randomized step so reruns give the same result.
RANDOM_STATE = 42

In [8]:
# --- 1. Load Data ---
# Only the file read is guarded. On failure a short hint is printed and the
# exception is re-raised, so the cell fails with a traceback and a
# "Run All" stops here. (The previous exit() call asks the kernel itself to
# quit, which is not what a notebook cell should do on bad input.)
try:
    df = pd.read_csv(CSV_FILE_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {CSV_FILE_PATH}")
    raise
except Exception as e:
    print(f"Error loading data: {e}")
    raise

# Shuffle the rows reproducibly and renumber the index from 0.
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

print(f"Successfully loaded data. Shape: {df.shape}")
print("\nSample data:")
print(df.head())

# Class balance of the target, first as proportions and then as raw counts.
# A missing TARGET_COLUMN surfaces here as a KeyError naming the column,
# rather than being reported as a file-loading problem.
print(f"\nTarget variable distribution:\n{df[TARGET_COLUMN].value_counts(normalize=True)}")
print(df[TARGET_COLUMN].value_counts())

Successfully loaded data. Shape: (20935, 11)

Sample data:
         Date                                              Tweet Stock Name  \
0  2022-07-28  @TSM_Albralelie I am sure you'll find somethin...        TSM   
1  2022-04-14            The factories are firing back on - $NIO        NIO   
2  2022-05-31  watchlist for tomorrow: | $SPX | $CRM | $KHC |...        CRM   
3  2022-08-12  Very interesting data from Jefferies showing C...        AMD   
4  2022-07-22  Argus analyst Bill Selesky lowered the price t...       TSLA   

                                        Company Name  Label  \
0  Taiwan Semiconductor Manufacturing Company Lim...      1   
1                                           NIO Inc.     -1   
2                                   Salesforce, Inc.      0   
3                       Advanced Micro Devices, Inc.      1   
4                                        Tesla, Inc.      0   

                                       Cleaned_Tweet  stock_label  \
0  user sure youll

In [9]:
# --- 2. Define Features (X) and Target (y) ---
# y is the single target column; X holds only the configured feature columns.
y = df[TARGET_COLUMN]
X = df[NUMERICAL_FEATURES]

# --- 3. Split Data ---
# Stratifying on y keeps the class proportions the same in the training and
# test partitions, which matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Report how many rows ended up in each partition.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f"\nTraining set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")


Training set size: 16748
Test set size: 4187


In [10]:

# --- 5. Choose and Define the Model ---
# Random forest of 100 trees. class_weight='balanced' can help when the
# classes (-1, 0, 1) are imbalanced; n_estimators, max_depth, etc. are the
# usual knobs to tune. LogisticRegression (imported at the top of the
# notebook) is a faster baseline to swap in here.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# --- 6. Train the Model ---
print("\nTraining the model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 7. Make Predictions ---
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)

# --- 8. Evaluate the Model ---
# The explicit label order is shared by the report and the confusion matrix
# so that every class is shown even if it never appears in the predictions.
class_ids = [-1, 0, 1]
report_names = ['Down (-1)', 'Neutral (0)', 'Up (1)']

accuracy = accuracy_score(y_test, y_pred)
print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")

# zero_division=0 reports 0 instead of warning when a class has no
# predicted (or no true) samples.
print("\nClassification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=report_names,
    zero_division=0,
)
print(report_text)

# Confusion matrix layout: rows are actual classes, columns are predictions.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
cm_table = pd.DataFrame(
    cm,
    index=['Actual Down', 'Actual Neutral', 'Actual Up'],
    columns=['Predicted Down', 'Predicted Neutral', 'Predicted Up'],
)
print(cm_table)


Training the model...
Model training complete.

Making predictions on the test set...

--- Model Evaluation ---
Accuracy: 0.9690

Classification Report:
              precision    recall  f1-score   support

   Down (-1)       0.97      0.96      0.97      1252
 Neutral (0)       0.97      0.97      0.97      1626
      Up (1)       0.97      0.97      0.97      1309

    accuracy                           0.97      4187
   macro avg       0.97      0.97      0.97      4187
weighted avg       0.97      0.97      0.97      4187


Confusion Matrix:
                Predicted Down  Predicted Neutral  Predicted Up
Actual Down               1204                 30            18
Actual Neutral              23               1579            24
Actual Up                   15                 20          1274
