<a href="https://colab.research.google.com/github/rithikkulkarni/ALDA-Course-Project/blob/main/basic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [109]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  # Example classifier
from sklearn.linear_model import LogisticRegression # Alternative classifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [110]:
# --- Configuration ---
CSV_FILE_PATH = 'https://raw.githubusercontent.com/rithikkulkarni/ALDA-Course-Project/refs/heads/main/datasets/filtered_stocks.csv'  # <--- CHANGE THIS to your CSV file path
TARGET_COLUMN = 'stock_performance'
# TEXT_FEATURE = 'Cleaned_Tweet'
NUMERICAL_FEATURES = ['one_day_price', 'two_day_price', 'three_day_price', 'historical_price'
                         #, 'sentiment'
                      ]

TEST_SIZE = 0.2  # 20% of data for testing
RANDOM_STATE = 42 # For reproducibility

In [111]:
df = pd.read_csv(CSV_FILE_PATH)
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)  # Shuffle the DataFrame
print(f"Successfully loaded data. Shape: {df.shape}")
print("\nSample data:")
df.head(1)

Successfully loaded data. Shape: (3033, 10)

Sample data:


Unnamed: 0,Date,Stock Name,sentiment,stock_performance,pct_change,one_day_price,two_day_price,three_day_price,historical_price,sentiment.1
0,2022-01-04,NIO,1,0,-5.383156,33.47,31.68,32.42,31.49,1


In [112]:
# This cell gets an equal sample from each label, so we end up with a valid accuracy that isn't inflated from too many of one label

# Find the minimum class count across the classes
min_count = df['sentiment'].value_counts().min()

# Create a balanced dataframe by sampling min_count instances from each group
df = df.groupby('sentiment').sample(n=min_count, random_state=42).reset_index(drop=True)

# Sanity check for class balance
print(df['sentiment'].value_counts())

sentiment
-1    868
 0    868
 1    868
Name: count, dtype: int64


In [113]:
# --- 2. Define Features (X) and Target (y) ---
X = df[NUMERICAL_FEATURES]
y = df[TARGET_COLUMN]

# --- 3. Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y # Important for classification, especially if classes are imbalanced
)
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 2083
Test set size: 521


In [114]:
# --- 5. Choose and Define the Model ---
# Using RandomForestClassifier as an example. It often works well with mixed data.
# You can adjust n_estimators, max_depth, etc., or try other models.
# class_weight='balanced' can help if your classes (-1, 0, 1) are imbalanced.
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1)

# Alternative: Logistic Regression (often faster, good baseline)
model = LogisticRegression(multi_class='ovr', solver='liblinear', class_weight='balanced', random_state=RANDOM_STATE)

print("\nTraining the model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 7. Make Predictions ---
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)

# --- 8. Evaluate the Model ---
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")


print("\nClassification Report:")
# Use labels=[-1, 0, 1] to ensure all classes are shown even if one is missing in predictions
# zero_division=0 handles cases where precision/recall might be zero for a class
print(classification_report(y_test, y_pred, labels=[-1, 0, 1], target_names=['Down (-1)', 'Neutral (0)', 'Up (1)'], zero_division=0))

print("\nConfusion Matrix:")
# Rows: Actual, Columns: Predicted
cm = confusion_matrix(y_test, y_pred, labels=[-1, 0, 1])
print(pd.DataFrame(cm, index=['Actual Down', 'Actual Neutral', 'Actual Up'],
                   columns=['Predicted Down', 'Predicted Neutral', 'Predicted Up']))


Training the model...
Model training complete.

Making predictions on the test set...

--- Model Evaluation ---
Accuracy: 0.5489

Classification Report:
              precision    recall  f1-score   support

   Down (-1)       0.00      0.00      0.00         0
 Neutral (0)       0.75      0.59      0.66       390
      Up (1)       0.26      0.42      0.32       131

    accuracy                           0.55       521
   macro avg       0.34      0.34      0.33       521
weighted avg       0.63      0.55      0.58       521


Confusion Matrix:
                Predicted Down  Predicted Neutral  Predicted Up
Actual Down                  0                  0             0
Actual Neutral               0                231           159
Actual Up                    0                 76            55




In [115]:
# prompt: i now need a cell to predict a column called pct_change instead of stock_performance. the issue is that pct_change is a float instead of just being -1 0 or 1, so im not sure what model to use

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Use a regressor for float target
from sklearn.metrics import mean_squared_error, r2_score

# ... (Previous code remains the same)

# --- 2. Define Features (X) and Target (y) ---
X = df[NUMERICAL_FEATURES]
y = df['pct_change']  # Target is now 'pct_change'

# --- 3. Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
    # No need for stratify with regression
)

# --- 5. Choose and Define the Model ---
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)

print("\nTraining the model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 7. Make Predictions ---
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)

# --- 8. Evaluate the Model ---
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")



Training the model...
Model training complete.

Making predictions on the test set...

--- Model Evaluation ---
Mean Squared Error (MSE): 12.4437
Root Mean Squared Error (RMSE): 3.5276
R-squared (R2): -0.2084
