In [1]:
# model_training.py
#Step 1: Data Loading & Cleaning
import pandas as pd

# Load data
df = pd.read_csv("combined_stocks.csv")
# Clean column names if needed (remove leading/trailing spaces)
df.columns = df.columns.str.strip()

# Convert data types
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.dropna()  # Drop any rows with missing data

# Example view
print(df.head())


            timestamp     open    high     low    close   volume       Stock
0 2025-01-05 18:30:00  2335.95  2340.0  2256.7  2267.15  1191044  ASIANPAINT
1 2025-01-06 18:30:00  2272.00  2316.0  2272.0  2292.60   740798  ASIANPAINT
2 2025-01-07 18:30:00  2287.05  2338.0  2272.7  2334.35  1439655  ASIANPAINT
3 2025-01-08 18:30:00  2341.90  2361.4  2321.3  2350.20  1362314  ASIANPAINT
4 2025-01-09 18:30:00  2335.60  2348.9  2316.0  2320.75  1214382  ASIANPAINT


In [8]:
import pandas as pd

def engineer_features(df):
    df = df.sort_values(['Stock', 'timestamp'])
    df['next_close'] = df.groupby('Stock')['close'].shift(-1)
    df['movement'] = (df['next_close'] > df['close']).astype(int)
    df['ma5'] = df.groupby('Stock')['close'].transform(lambda x: x.rolling(5).mean())
    df['std5'] = df.groupby('Stock')['close'].transform(lambda x: x.rolling(5).std())
    df['vol_ma5'] = df.groupby('Stock')['volume'].transform(lambda x: x.rolling(5).mean())
    df = df.dropna()
    return df

# Always do this before predicting:
df = pd.read_csv("combined_stocks.csv", parse_dates=['timestamp'])
df = engineer_features(df)  # <= this line is required!

# Now your prediction code will work because those columns exist.


In [3]:
#Step 3: Train/Test Split

# Use latest data for test (simulate future)
df_train = df[df['timestamp'] < '2025-10-01']
df_test = df[df['timestamp'] >= '2025-10-01']

# Features/Target
features = ['open', 'high', 'low', 'close', 'volume', 'ma5', 'std5', 'vol_ma5']
X_train = df_train[features]
y_train = df_train['movement']
X_test = df_test[features]
y_test = df_test['movement']

In [9]:
#Step 4: Model Training

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold

rf = RandomForestClassifier()

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 8, None]
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(rf, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

Best params: {'max_depth': None, 'n_estimators': 100}
Best CV score: 0.49607843137254903


In [10]:
#Save the model
import joblib
joblib.dump(grid.best_estimator_, "rf_stock_model.pkl")


['rf_stock_model.pkl']

In [12]:
import pandas as pd
import joblib

def engineer_features(df):
    df = df.sort_values(['Stock', 'timestamp'])
    df['next_close'] = df.groupby('Stock')['close'].shift(-1)
    df['movement'] = (df['next_close'] > df['close']).astype(int)
    df['ma5'] = df.groupby('Stock')['close'].transform(lambda x: x.rolling(5).mean())
    df['std5'] = df.groupby('Stock')['close'].transform(lambda x: x.rolling(5).std())
    df['vol_ma5'] = df.groupby('Stock')['volume'].transform(lambda x: x.rolling(5).mean())
    df = df.dropna()
    return df

def predict_best_stock(df, model):
    # Ensure features are computed
    df = engineer_features(df)
    latest_date = df['timestamp'].dt.date.max()
    today_df = df[df['timestamp'].dt.date == latest_date]
    if today_df.empty:
        return None, None
    features = ['open', 'high', 'low', 'close', 'volume', 'ma5', 'std5', 'vol_ma5']
    X_today = today_df[features]
    pred = model.predict(X_today)
    today_df = today_df.assign(pred_movement=pred)
    up_stocks = today_df[today_df['pred_movement'] == 1]
    if not up_stocks.empty:
        best = up_stocks.sort_values('ma5', ascending=False).iloc[0]
    else:
        best = today_df.iloc[0]
    out = {
        "symbol": best['Stock'],
        "open": best['open'],
        "close": best['close'],
        "pred_movement": 'up' if best['pred_movement'] == 1 else 'down'
    }
    return out, today_df

# Usage example
df = pd.read_csv("combined_stocks.csv", parse_dates=['timestamp'])
df.columns = df.columns.str.strip()
model = joblib.load("rf_stock_model.pkl")
best_stock, predictions_df = predict_best_stock(df, model)
print("Best stock for latest date:", best_stock)


Best stock for latest date: {'symbol': 'INFY', 'open': np.float64(1618.5), 'close': np.float64(1618.7), 'pred_movement': 'up'}
