In [17]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  

ticker_symbol = 'AAPL'
stock_data = yf.Ticker(ticker_symbol)

df_5min = stock_data.history(period="5d", interval="5m")

df_1hour = stock_data.history(period="5d", interval="1h")

df_daily = stock_data.history(period='1y')

def preprocess_data(df):
    df.index = pd.to_datetime(df.index).tz_localize(None)
    df.reset_index(inplace=True)

    df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
    df['EMA_21'] = df['Close'].ewm(span=21, adjust=False).mean()
    df['Crossover'] = (df['EMA_5'] > df['EMA_21']) & (df['EMA_5'].shift(1) <= df['EMA_21'].shift(1))

    df['Price_Change'] = df['Close'].diff()
    df['Gain'] = df['Price_Change'].where(df['Price_Change'] > 0, 0)
    df['Loss'] = -df['Price_Change'].where(df['Price_Change'] < 0, 0)

    df['Avg_Gain'] = df['Gain'].rolling(window=14, min_periods=1).mean()
    df['Avg_Loss'] = df['Loss'].rolling(window=14, min_periods=1).mean()

    df['RS'] = df['Avg_Gain'] / df['Avg_Loss']
    df['RSI'] = 100 - (100 / (1 + df['RS']))

    df['RSI_Above_50'] = df['RSI'] >= 50

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.bfill(inplace=True) 

    return df

df_5min = preprocess_data(df_5min)
df_1hour = preprocess_data(df_1hour)
df_daily = preprocess_data(df_daily)

def create_targets(df, target_shift):
    df['Next_Close'] = df['Close'].shift(-target_shift)
    df['Price_Up'] = (df['Next_Close'] > df['Close']).astype(int)
    df.dropna(inplace=True) 
    return df

df_5min = create_targets(df_5min, target_shift=1)  
df_1hour = create_targets(df_1hour, target_shift=1) 

df_daily['Next_Week_Close'] = df_daily['Close'].shift(-5) 
df_daily['Price_Up_1_Week'] = (df_daily['Next_Week_Close'] > df_daily['Close']).astype(int)

df_daily['Next_15_Days_Close'] = df_daily['Close'].shift(-15) 
df_daily['Price_Up_15_Days'] = (df_daily['Next_15_Days_Close'] > df_daily['Close']).astype(int)

df_daily.dropna(inplace=True)

features = ['EMA_5', 'EMA_21', 'Crossover', 'RSI', 'RSI_Above_50', 'Price_Change', 'Gain', 'Loss', 'Avg_Gain', 'Avg_Loss', 'RS']

X_5min = df_5min[features]
X_1hour = df_1hour[features]
X_daily = df_daily[features]

y_1week = df_daily['Price_Up_1_Week']
y_15days = df_daily['Price_Up_15_Days']

y_5min = df_5min['Price_Up']
y_1hour = df_1hour['Price_Up']

imputer = SimpleImputer(strategy='mean')
X_5min_imputed = imputer.fit_transform(X_5min)
X_1hour_imputed = imputer.fit_transform(X_1hour)
X_daily_imputed = imputer.fit_transform(X_daily)

scaler = StandardScaler()
X_5min_scaled = scaler.fit_transform(X_5min_imputed)
X_1hour_scaled = scaler.fit_transform(X_1hour_imputed)
X_daily_scaled = scaler.fit_transform(X_daily_imputed)

X_train_5min, X_test_5min, y_train_5min, y_test_5min = train_test_split(X_5min_scaled, y_5min, test_size=0.2, random_state=42)
X_train_1hour, X_test_1hour, y_train_1hour, y_test_1hour = train_test_split(X_1hour_scaled, y_1hour, test_size=0.2, random_state=42)

X_train_daily, X_test_daily, y_train_1week, y_test_1week = train_test_split(X_daily_scaled, y_1week, test_size=0.2, random_state=42)
X_train_daily_15, X_test_daily_15, y_train_15days, y_test_15days = train_test_split(X_daily_scaled, y_15days, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

def train_rf_model(X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return best_model

print("\nTraining and Evaluating 5-Minute Model...")
best_model_5min = train_rf_model(X_train_5min, y_train_5min, X_test_5min, y_test_5min)

print("\nTraining and Evaluating 1-Hour Model...")
best_model_1hour = train_rf_model(X_train_1hour, y_train_1hour, X_test_1hour, y_test_1hour)

print("\nTraining and Evaluating 1-Week Model...")
best_model_1week = train_rf_model(X_train_daily, y_train_1week, X_test_daily, y_test_1week)

print("\nTraining and Evaluating 15-Days Model...")
best_model_15days = train_rf_model(X_train_daily_15, y_train_15days, X_test_daily_15, y_test_15days)

def predict_next(model, X_scaled):
    last_data = X_scaled[-1:].reshape(1, -1)
    prediction = model.predict(last_data)
    return prediction

print("\nPredictions for the next period:")
print(f"Next 5-Minute Prediction (1 = Up, 0 = Down): {predict_next(best_model_5min, X_5min_scaled)}")
print(f"Next 1-Hour Prediction (1 = Up, 0 = Down): {predict_next(best_model_1hour, X_1hour_scaled)}")
print(f"Next 1-Week Prediction (1 = Up, 0 = Down): {predict_next(best_model_1week, X_daily_scaled)}")
print(f"Next 15-Days Prediction (1 = Up, 0 = Down): {predict_next(best_model_15days, X_daily_scaled)}")



Training and Evaluating 5-Minute Model...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy: 0.5352
              precision    recall  f1-score   support

           0       0.52      0.35      0.42        34
           1       0.54      0.70      0.61        37

    accuracy                           0.54        71
   macro avg       0.53      0.53      0.52        71
weighted avg       0.53      0.54      0.52        71

[[12 22]
 [11 26]]

Training and Evaluating 1-Hour Model...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.6667
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.60      1.00      0.75         3

    accuracy                         

In [19]:
import joblib

# Save trained models
joblib.dump(best_model_5min, 'model_5min.pkl')
joblib.dump(best_model_1hour, 'model_1hour.pkl')
joblib.dump(best_model_1week, 'model_1week.pkl')
joblib.dump(best_model_15days, 'model_15days.pkl')


['model_15days.pkl']

In [16]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Function to get stock data
def get_stock_data(ticker):
    stock_data = yf.Ticker(ticker)
    df_5min = stock_data.history(period="5d", interval="5m")
    df_1hour = stock_data.history(period="5d", interval="1h")
    df_daily = stock_data.history(period='1y')
    return df_5min, df_1hour, df_daily

# Preprocessing function
def preprocess_data(df):
    df.index = pd.to_datetime(df.index).tz_localize(None)
    df.reset_index(inplace=True)

    df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
    df['EMA_21'] = df['Close'].ewm(span=21, adjust=False).mean()
    df['Crossover'] = (df['EMA_5'] > df['EMA_21']) & (df['EMA_5'].shift(1) <= df['EMA_21'].shift(1))

    df['Price_Change'] = df['Close'].diff()
    df['Gain'] = df['Price_Change'].where(df['Price_Change'] > 0, 0)
    df['Loss'] = -df['Price_Change'].where(df['Price_Change'] < 0, 0)

    df['Avg_Gain'] = df['Gain'].rolling(window=14, min_periods=1).mean()
    df['Avg_Loss'] = df['Loss'].rolling(window=14, min_periods=1).mean()

    df['RS'] = df['Avg_Gain'] / df['Avg_Loss']
    df['RSI'] = 100 - (100 / (1 + df['RS']))

    df['RSI_Above_50'] = df['RSI'] >= 50

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.bfill(inplace=True) 

    return df

# Creating Targets
def create_targets(df, target_shift):
    df['Next_Close'] = df['Close'].shift(-target_shift)
    df['Price_Up'] = (df['Next_Close'] > df['Close']).astype(int)
    df.dropna(inplace=True) 
    return df

# Model training function
def train_rf_model(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Function to prepare data, train the model, and save it
def prepare_and_train_model(ticker):
    # Get stock data
    df_5min, df_1hour, df_daily = get_stock_data(ticker)

    # Preprocess data
    df_5min = preprocess_data(df_5min)
    df_1hour = preprocess_data(df_1hour)
    df_daily = preprocess_data(df_daily)

    # Create targets
    df_5min = create_targets(df_5min, target_shift=1)
    df_1hour = create_targets(df_1hour, target_shift=1)
    df_daily['Next_Week_Close'] = df_daily['Close'].shift(-5) 
    df_daily['Price_Up_1_Week'] = (df_daily['Next_Week_Close'] > df_daily['Close']).astype(int)
    df_daily.dropna(inplace=True)

    # Features for the model
    features = ['EMA_5', 'EMA_21', 'Crossover', 'RSI', 'RSI_Above_50', 'Price_Change', 'Gain', 'Loss', 'Avg_Gain', 'Avg_Loss', 'RS']

    X_5min = df_5min[features]
    y_5min = df_5min['Price_Up']
    
    # Imputer and Scaler for handling missing data and scaling
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    
    X_5min_imputed = imputer.fit_transform(X_5min)
    X_5min_scaled = scaler.fit_transform(X_5min_imputed)
    
    # Train-Test Split
    X_train_5min, X_test_5min, y_train_5min, y_test_5min = train_test_split(X_5min_scaled, y_5min, test_size=0.2, random_state=42)
    
    # Train model
    model = train_rf_model(X_train_5min, y_train_5min)
    
    # Save the model using joblib
    joblib.dump(model, f'{ticker}_5min_model.pkl')
    joblib.dump(imputer, f'{ticker}_imputer.pkl')
    joblib.dump(scaler, f'{ticker}_scaler.pkl')

    return model, imputer, scaler

# Specify the stock ticker (for example AAPL)
ticker = 'GOOGL'  # Change this as needed

# Train the model and save it
model, imputer, scaler = prepare_and_train_model(ticker)

print(f"Model for {ticker} has been trained and saved!")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Model for GOOGL has been trained and saved!


In [14]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib  # Import joblib for saving and loading models

# Function to fetch stock data and process it
def fetch_stock_data(ticker_symbol):
    stock_data = yf.Ticker(ticker_symbol)
    
    # Downloading stock data with different time intervals
    df_5min = stock_data.history(period="5d", interval="5m")
    df_1hour = stock_data.history(period="5d", interval="1h")
    df_daily = stock_data.history(period='1y')
    
    return df_5min, df_1hour, df_daily

# Data preprocessing function
def preprocess_data(df):
    df.index = pd.to_datetime(df.index).tz_localize(None)
    df.reset_index(inplace=True)

    df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
    df['EMA_21'] = df['Close'].ewm(span=21, adjust=False).mean()
    df['Crossover'] = (df['EMA_5'] > df['EMA_21']) & (df['EMA_5'].shift(1) <= df['EMA_21'].shift(1))

    df['Price_Change'] = df['Close'].diff()
    df['Gain'] = df['Price_Change'].where(df['Price_Change'] > 0, 0)
    df['Loss'] = -df['Price_Change'].where(df['Price_Change'] < 0, 0)

    df['Avg_Gain'] = df['Gain'].rolling(window=14, min_periods=1).mean()
    df['Avg_Loss'] = df['Loss'].rolling(window=14, min_periods=1).mean()

    df['RS'] = df['Avg_Gain'] / df['Avg_Loss']
    df['RSI'] = 100 - (100 / (1 + df['RS']))

    df['RSI_Above_50'] = df['RSI'] >= 50

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.bfill(inplace=True) 

    return df

# Target creation for prediction
def create_targets(df, target_shift):
    df['Next_Close'] = df['Close'].shift(-target_shift)
    df['Price_Up'] = (df['Next_Close'] > df['Close']).astype(int)
    df.dropna(inplace=True) 
    return df

# Function to train Random Forest model
def train_rf_model(X_train, y_train, X_test, y_test, model_name):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    # Saving the model using joblib
    joblib.dump(best_model, f'{model_name}_rf_model.pkl')
    print(f"Model saved as {model_name}_rf_model.pkl")
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    
    return best_model

# Function to load a saved model
def load_model(model_name):
    model = joblib.load(f'{model_name}_rf_model.pkl')
    print(f"Model loaded from {model_name}_rf_model.pkl")
    return model

# Function to make predictions
def predict_next(model, X_scaled):
    last_data = X_scaled[-1:].reshape(1, -1)
    prediction = model.predict(last_data)
    return prediction

# Main function to execute the workflow
def main(ticker_symbol):
    # Fetch stock data
    df_5min, df_1hour, df_daily = fetch_stock_data(ticker_symbol)
    
    # Preprocess data
    df_5min = preprocess_data(df_5min)
    df_1hour = preprocess_data(df_1hour)
    df_daily = preprocess_data(df_daily)

    # Create targets
    df_5min = create_targets(df_5min, target_shift=1)  
    df_1hour = create_targets(df_1hour, target_shift=1) 
    df_daily['Next_Week_Close'] = df_daily['Close'].shift(-5) 
    df_daily['Price_Up_1_Week'] = (df_daily['Next_Week_Close'] > df_daily['Close']).astype(int)

    df_daily['Next_15_Days_Close'] = df_daily['Close'].shift(-15) 
    df_daily['Price_Up_15_Days'] = (df_daily['Next_15_Days_Close'] > df_daily['Close']).astype(int)
    df_daily.dropna(inplace=True)

    # Define features
    features = ['EMA_5', 'EMA_21', 'Crossover', 'RSI', 'RSI_Above_50', 'Price_Change', 'Gain', 'Loss', 'Avg_Gain', 'Avg_Loss', 'RS']

    # Split data into features and targets
    X_5min = df_5min[features]
    X_1hour = df_1hour[features]
    X_daily = df_daily[features]

    y_5min = df_5min['Price_Up']
    y_1hour = df_1hour['Price_Up']
    y_1week = df_daily['Price_Up_1_Week']
    y_15days = df_daily['Price_Up_15_Days']

    # Impute and scale the data
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    
    X_5min_scaled = scaler.fit_transform(imputer.fit_transform(X_5min))
    X_1hour_scaled = scaler.fit_transform(imputer.fit_transform(X_1hour))
    X_daily_scaled = scaler.fit_transform(imputer.fit_transform(X_daily))

    # Split the data into train and test sets
    X_train_5min, X_test_5min, y_train_5min, y_test_5min = train_test_split(X_5min_scaled, y_5min, test_size=0.2, random_state=42)
    X_train_1hour, X_test_1hour, y_train_1hour, y_test_1hour = train_test_split(X_1hour_scaled, y_1hour, test_size=0.2, random_state=42)
    X_train_daily, X_test_daily, y_train_1week, y_test_1week = train_test_split(X_daily_scaled, y_1week, test_size=0.2, random_state=42)
    X_train_daily_15, X_test_daily_15, y_train_15days, y_test_15days = train_test_split(X_daily_scaled, y_15days, test_size=0.2, random_state=42)

    # Train and save models for different time intervals
    print("\nTraining and Saving 5-Minute Model...")
    best_model_5min = train_rf_model(X_train_5min, y_train_5min, X_test_5min, y_test_5min, f'{ticker_symbol}_5min')

    print("\nTraining and Saving 1-Hour Model...")
    best_model_1hour = train_rf_model(X_train_1hour, y_train_1hour, X_test_1hour, y_test_1hour, f'{ticker_symbol}_1hour')

    print("\nTraining and Saving 1-Week Model...")
    best_model_1week = train_rf_model(X_train_daily, y_train_1week, X_test_daily, y_test_1week, f'{ticker_symbol}_1week')

    print("\nTraining and Saving 15-Days Model...")
    best_model_15days = train_rf_model(X_train_daily_15, y_train_15days, X_test_daily_15, y_test_15days, f'{ticker_symbol}_15days')

    # Example predictions
    print("\nPredictions for the next period:")
    print(f"Next 5-Minute Prediction (1 = Up, 0 = Down): {predict_next(best_model_5min, X_5min_scaled)}")
    print(f"Next 1-Hour Prediction (1 = Up, 0 = Down): {predict_next(best_model_1hour, X_1hour_scaled)}")
    print(f"Next 1-Week Prediction (1 = Up, 0 = Down): {predict_next(best_model_1week, X_daily_scaled)}")
    print(f"Next 15-Days Prediction (1 = Up, 0 = Down): {predict_next(best_model_15days, X_daily_scaled)}")

# Run the model for a given stock ticker (can change 'AAPL' to any stock symbol)
ticker_symbol = 'AAPL'
main(ticker_symbol)



Training and Saving 5-Minute Model...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Model saved as AAPL_5min_rf_model.pkl
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy: 0.5352
              precision    recall  f1-score   support

           0       0.52      0.35      0.42        34
           1       0.54      0.70      0.61        37

    accuracy                           0.54        71
   macro avg       0.53      0.53      0.52        71
weighted avg       0.53      0.54      0.52        71

[[12 22]
 [11 26]]

Training and Saving 1-Hour Model...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Model saved as AAPL_1hour_rf_model.pkl
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.6667
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.60