In [None]:
import yfinance as yf
import datetime

In [None]:
# DOWNLOAD STOCK DATA
import pandas as pd

# Ticker list of S&P 500 constituents (read over the network).
sp500_symbols = pd.read_csv('https://datahub.io/core/s-and-p-500-companies/r/constituents.csv')['Symbol']

# symbol -> sector name, filled lazily so each ticker's metadata is requested once.
sector_info = {}

# Per-symbol frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all accumulated rows on every iteration.
stock_frames = []

for symbol in sp500_symbols:
    try:
        stock = yf.download(symbol, period="5y")

        # Nothing came back for this ticker: skip it before paying for the
        # extra metadata request below.
        if stock.empty:
            print(f"Error {symbol}: no price data returned")
            continue

        if symbol not in sector_info:
            stock_info = yf.Ticker(symbol).info
            sector_info[symbol] = stock_info.get("sector", "N/A")

        stock['Symbol'] = symbol
        stock['Sector'] = sector_info[symbol]

        stock_frames.append(stock)
    except Exception as e:
        # Best effort: one failing ticker must not abort the whole download.
        print(f"Error {symbol}: {e}")

# Long-format frame: one row per (symbol, trading day).
stock_data = pd.concat(stock_frames) if stock_frames else pd.DataFrame()

In [None]:
# Persist the combined frame (including its index) to stock_data.csv in the working directory.
stock_data.to_csv('stock_data.csv')

In [None]:
# Move the current index into a regular column and renumber rows from 0.
# NOTE(review): presumably this is the date index from the download, since
# later cells read a 'Date' column — confirm.
stock_data= stock_data.reset_index()

In [None]:
# df_modeling is a second name for the same DataFrame object (no copy is made),
# so in-place changes made through either name are visible through both.
df_modeling = stock_data

In [None]:
# Display the rows for one ticker (AAPL) as a quick sanity check.
stock_data[stock_data['Symbol'] == 'AAPL']

In [None]:
# Plot tech stocks over the downloaded history (the download above uses period="5y")
import plotly.express as px

tech_stocks = ['AAPL','GOOGL','MSFT']

for tech in tech_stocks:
    # Rows belonging to the current ticker only.
    df = stock_data.loc[stock_data['Symbol'] == tech]

    # One interactive line chart per ticker, titled with the ticker symbol.
    fig = px.line(
        df,
        x="Date",
        y='Adj Close',
        title=tech,
        labels={'Adj Close': 'Stock Price (USD)'},
        template='plotly_dark',
    )

    # Axis titles and legend are set in a single layout update.
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Stock Price (USD)',
        showlegend=True,
    )

    fig.show()

In [None]:
# Plot some real estate stocks over the downloaded history (the download above uses period="5y")
restate_stocks = ['PLD','AMT','EQIX']

for r in restate_stocks:
    # Rows belonging to the current ticker only.
    df = stock_data.loc[stock_data['Symbol'] == r]

    # One interactive line chart per ticker, titled with the ticker symbol.
    fig = px.line(
        df,
        x="Date",
        y='Adj Close',
        title=r,
        labels={'Adj Close': 'Stock Price (USD)'},
        template='plotly_dark',
    )

    # Axis titles and legend are set in a single layout update.
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Stock Price (USD)',
        showlegend=True,
    )

    fig.show()

In [None]:
# Day-over-day percent change of Close, computed per symbol. stock_data stacks
# all tickers in one frame, so an ungrouped pct_change would compare the first
# row of each ticker with the last row of the previous ticker.
stock_data['Daily_Return'] = stock_data.groupby('Symbol')['Close'].pct_change()


In [None]:
# Window lengths, in rows (one row per trading day per symbol).
short_window = 5
long_window = 20

# Rolling means are computed inside each symbol's own rows; an ungrouped rolling
# window would mix the last prices of one ticker into the first averages of the next.
close_by_symbol = stock_data.groupby('Symbol')['Close']
stock_data['Short_MA'] = close_by_symbol.transform(lambda s: s.rolling(window=short_window).mean())
stock_data['Long_MA'] = close_by_symbol.transform(lambda s: s.rolling(window=long_window).mean())

In [None]:
import matplotlib.pyplot as plt

# EDA of moving averages: close price with short and long moving averages, one figure per ticker.
df = stock_data

tech_stocks = ['AAPL','GOOGL','MSFT']

for tech in tech_stocks:
    # Filter once per ticker instead of re-filtering for every plotted series.
    ticker_rows = df[df['Symbol'] == tech]

    plt.figure(figsize=(12, 6))
    plt.plot(ticker_rows['Date'], ticker_rows['Close'], label='Close Price', color='blue', alpha=0.7)
    plt.plot(ticker_rows['Date'], ticker_rows['Short_MA'], label=f'Short_Moving_Avg ({short_window} days)', color='orange')
    plt.plot(ticker_rows['Date'], ticker_rows['Long_MA'], label=f'Long_Moving_avg ({long_window} days)', color='green')

    plt.xlabel('Date')
    plt.ylabel('Price')
    # Name the ticker so the otherwise identical figures can be told apart.
    plt.title(f'{tech} Stock Price with Short-term and Long-term Moving Averages')
    plt.legend()
    plt.xticks(rotation=90)
    plt.grid(True)
    # Render this figure before the next loop iteration opens a new one.
    plt.show()

In [None]:
# To explore the hypothesis, look at how the daily percent changes of the different sector ETFs correlate with each other.
# Working assumption: related sectors should show correlated daily percent changes.

import seaborn as sns
sp500_sectors = [
    'XLC', 'XLY', 'XLC', 'XLF', 'XLV', 'XLI', 'XLB', 'XLRE', 'XLK', 'XLU',
    'XPH', 'XME', 'XES', 'XOP', 'XRT', 'XHB', 'XSD', 'XLRE', 'XLRE', 'XLRE'
]
# NOTE(review): 'XLC' and 'XLRE' appear more than once above — possibly
# placeholders for other tickers; confirm the intended list.

# Download each distinct ticker once. dict.fromkeys keeps first-seen order, so
# the resulting columns are in the same order as when duplicates were re-downloaded.
sector_data = {}
for sector in dict.fromkeys(sp500_sectors):
    stock = yf.download(sector, period="2y")
    sector_data[sector] = stock['Adj Close']

# Daily percent change per ticker, one column each, aligned on date.
returns_df = pd.DataFrame({sector: data.pct_change() for sector, data in sector_data.items()})

# Pairwise correlation of the daily returns.
correlation_matrix = returns_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap of S&P 500 Sectors')
plt.show()

In [None]:
# Date-indexed frame for the monthly resampling below. set_index returns a new
# frame, so stock_data keeps its 'Date' column and this cell can be re-run
# (an in-place set_index through an alias removes 'Date' from stock_data and
# fails on the second run).
sp500_data = stock_data.set_index('Date')
def label_monthly(data):
    """Label one block of rows by its first-to-last change in 'Close'.

    The change is measured inside ``data`` itself: percent difference between
    the first and the last 'Close' value (not against a previous month).

    Labels:
        'Price Shock' -- change greater than 10%
        'Up'          -- change greater than 2% (and at most 10%)
        'Down'        -- change lower than -2%
        'Flat'        -- anything else
        'N/A'         -- fewer than two rows, so no change can be computed

    Parameters:
        data: DataFrame with a 'Close' column, rows in chronological order.

    Returns:
        pd.Series with 'Monthly_Label' and 'Close' (the last close of the
        block, or NaN when the block has no rows).
    """
    if len(data) < 2:
        # An empty block has no last row; report NaN instead of raising IndexError.
        last_close = data['Close'].iloc[-1] if len(data) else float('nan')
        return pd.Series({'Monthly_Label': 'N/A', 'Close': last_close})

    first_close = data['Close'].iloc[0]
    last_close = data['Close'].iloc[-1]

    price_change_percent = ((last_close - first_close) / first_close) * 100

    # The shock threshold is tested first so large gains are not reported as plain 'Up'.
    if price_change_percent > 10:
        label = 'Price Shock'
    elif price_change_percent < -2:
        label = 'Down'
    elif price_change_percent > 2:
        label = 'Up'
    else:
        label = 'Flat'

    return pd.Series({'Monthly_Label': label, 'Close': last_close})

# Split each symbol's rows into calendar-month bins, label every bin, then
# flatten the (Symbol, Date) index back into ordinary columns.
monthly_bins = sp500_data.groupby('Symbol').resample('M')
labels_df = monthly_bins.apply(lambda x: label_monthly(x))
labels_df = labels_df.reset_index()

print(labels_df)




In [None]:
labels_df.head()

In [None]:
def plot_stock_price_categories(df, tech_stock):
    """Scatter one symbol's monthly close prices, colored by monthly label.

    Parameters:
        df: frame with 'Symbol', 'Date', 'Close' and 'Monthly_Label' columns.
        tech_stock: ticker symbol whose rows are plotted.

    Shows the figure; returns nothing.
    """
    symbol_rows = df[df['Symbol'] == tech_stock]

    colors = {'Flat': 'gray', 'Down': 'red', 'Up': 'green', 'Price Shock': 'blue'}
    # Labels without a dedicated color (e.g. 'N/A') are drawn in black instead
    # of raising KeyError.
    fallback_color = 'black'

    fig, ax = plt.subplots(figsize=(10, 6))

    # One scatter series per label so each gets its own legend entry.
    for label, group in symbol_rows.groupby('Monthly_Label'):
        ax.scatter(group['Date'], group['Close'], label=label,
                   color=colors.get(label, fallback_color), alpha=0.7)

    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.title(f'{tech_stock} Stock Price Categories')

    plt.legend()

    plt.show()

# Tickers to visualize; one labeled scatter plot is drawn for each.
tech_stocks = ['AAPL', 'MSFT', 'GOOGL','NVDA']

for tech_stock in tech_stocks:
    plot_stock_price_categories(labels_df, tech_stock)

In [None]:
# Histogram of the Monthly_Label column over every row of labels_df (class balance check).
plt.hist(labels_df.Monthly_Label)

In [None]:
# Alias, not a copy: columns added through df_stock_data also appear on stock_data.
df_stock_data = stock_data

In [None]:
# Not referenced by any statement in this cell.
window_size = 730

# Broadcast each symbol's first and last Close (in row order) onto all of its rows.
symbol_close = df_stock_data.groupby('Symbol')['Close']
for column_name, picker in (('First_Close', 'first'), ('Last_Close', 'last')):
    df_stock_data[column_name] = symbol_close.transform(picker)

def label_stock(group):
    """Classify one symbol's overall move from its first to its last close.

    Reads the first row's 'First_Close' and 'Last_Close' values of ``group``
    and compares their percent difference against +/-5%:
    'Up' at +5% or more, 'Down' at -5% or less, otherwise 'Flat'.

    Returns a one-element pd.Series keyed 'Stock_Trend'.
    """
    start_price = group['First_Close'].iloc[0]
    end_price = group['Last_Close'].iloc[0]

    change_pct = (end_price - start_price) / start_price * 100

    # Guard clauses: return as soon as a threshold is met.
    if change_pct >= 5:
        return pd.Series({'Stock_Trend': 'Up'})
    if change_pct <= -5:
        return pd.Series({'Stock_Trend': 'Down'})
    return pd.Series({'Stock_Trend': 'Flat'})

# One trend label per symbol: label_stock is applied to each symbol's rows.
df_stock_trend = df_stock_data.groupby('Symbol', group_keys=False).apply(label_stock)

print(df_stock_trend)


In [None]:
# Alias, not a copy: stock_data_df and stock_data are the same DataFrame object.
stock_data_df= stock_data


In [None]:
# Turn the current index back into a column, in place (this also changes
# stock_data, which is the same object).
# NOTE(review): in-place reset_index is not idempotent — re-running the cell
# inserts an extra index column; confirm the cell is only executed once.
stock_data_df.reset_index(inplace=True)

In [None]:
#calculating some features to train a random forest
# Collapse the daily rows into one row per (Symbol, calendar month).
monthly_data = stock_data_df.groupby('Symbol').resample('M', on='Date').agg({
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Adj Close': 'last',
    'Volume': 'sum'
}).reset_index()

# Previous-month values, shifted inside each symbol so the first month of a
# symbol is NaN rather than the last month of the preceding symbol.
monthly_data['Open_Prev_Month'] = monthly_data.groupby('Symbol')['Open'].shift(1)
monthly_data['High_Prev_Month'] = monthly_data.groupby('Symbol')['High'].shift(1)
monthly_data['Low_Prev_Month'] = monthly_data.groupby('Symbol')['Low'].shift(1)
monthly_data['Close_Prev_Month'] = monthly_data.groupby('Symbol')['Close'].shift(1)
monthly_data['Volume_Prev_Month'] = monthly_data.groupby('Symbol')['Volume'].shift(1)

# Month-over-month volume change, in percent.
monthly_data['Volume_Pct_Change_Prev_Month'] = (
    (monthly_data['Volume'] / monthly_data['Volume_Prev_Month'] - 1) * 100
)

# Two-month rolling mean of the opening price, per symbol.
monthly_data['Avg_Open'] = monthly_data.groupby('Symbol')['Open'].rolling(window=2).mean().reset_index(level=0, drop=True)
# Two-month rolling mean of the closing price, per symbol.
monthly_data['Avg_Close'] = monthly_data.groupby('Symbol')['Close'].rolling(window=2).mean().reset_index(level=0, drop=True)

# Spread between the month's high and low.
monthly_data['Price_Range'] = monthly_data['High'] - monthly_data['Low']

# Close versus the same symbol's previous-month close. The per-symbol shifted
# column is used so the difference never spans two different symbols.
monthly_data['Price_Momentum'] = monthly_data['Close'] - monthly_data['Close_Prev_Month']
# Volume versus the same symbol's previous-month volume.
monthly_data['Volume_Change'] = monthly_data['Volume'] - monthly_data['Volume_Prev_Month']
# Two-month rolling mean of the volume change, per symbol.
monthly_data['Avg_Volume_Change'] = (
    monthly_data.groupby('Symbol')['Volume_Change'].rolling(window=2).mean().reset_index(level=0, drop=True)
)

monthly_data = monthly_data.reset_index(drop=True)

print(monthly_data)

In [None]:
# Attach the monthly label to each feature row. Left join on (Symbol, Date):
# every monthly_data row is kept, and rows without a matching label get NaN.
merged_df = monthly_data.merge(labels_df[['Symbol', 'Date', 'Monthly_Label']], on=['Symbol', 'Date'], how='left')


In [None]:
# Keep the date, the model features and the label. The explicit copy makes
# merged_df an independent frame, so the in-place operations in later cells
# (sort, set_index, dropna, column assignment) do not act on a slice of the
# wider frame and do not trigger SettingWithCopyWarning.
merged_df = merged_df[['Date','Volume_Pct_Change_Prev_Month',
       'Avg_Open', 'Avg_Close', 'Price_Range', 'Price_Momentum',
       'Volume_Change', 'Avg_Volume_Change', 'Monthly_Label']].copy()

In [None]:
# Order rows chronologically, in place (rows of all symbols are interleaved by date).
merged_df.sort_values(by='Date', inplace=True)

In [None]:
# Use Date as the index, in place, so later cells can filter rows by date through the index.
merged_df.set_index('Date',inplace=True)
merged_df.head()

In [None]:
# Preview the first rows.
merged_df.head()

In [None]:
# Count missing values per column.
merged_df.isna().sum()

In [None]:
# (rows, columns) before missing rows are dropped.
merged_df.shape

In [None]:
# Summary statistics of the Price_Momentum feature.
merged_df.Price_Momentum.describe()

In [None]:
# Distribution of the monthly high-low spread, in 50 bins.
plt.hist(merged_df.Price_Range, bins=50)

In [None]:
# Remove, in place, every row that has a missing value in any column.
merged_df.dropna(inplace=True)


In [None]:
# Preview the first rows after dropping incomplete rows.
merged_df.head()


In [None]:
# Integer code for each label class. The numbers are identifiers only.
class_mapping = {
    'Down': 0,
    'Up': 1,
    'Price Shock': 2,
    'Flat': 3
}

# Series.map matches keys exactly (case-sensitive); any label that is not a
# key of class_mapping (for example 'N/A') becomes NaN.
encoded_labels = merged_df['Monthly_Label'].map(class_mapping)

# Drop rows whose label has no class code so NaN targets never reach the
# classifiers, and store the codes as integers. Positional (numpy) masks and
# values are used because the Date index contains repeated dates.
# NOTE: run this cell once per freshly built merged_df — mapping already
# encoded labels a second time would find no matching keys.
has_code = encoded_labels.notna()
merged_df = merged_df.loc[has_code.to_numpy()].copy()
merged_df['Monthly_Label'] = encoded_labels[has_code].astype(int).to_numpy()



In [None]:
# Re-check missing values per column after encoding the labels.
merged_df.isna().sum()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  # Add this import

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from datetime import datetime, timedelta

# Feature matrix: every column except the label.
X = merged_df.loc[:,merged_df.columns != 'Monthly_Label']
# NOTE(review): the scaler is created here but not fitted or applied in this cell.
scaler = StandardScaler()

# Target: the encoded monthly label.
y = merged_df.Monthly_Label

current_date = datetime.now()

# Single boundary between "past" (model development) and "future" (hold-out)
# rows. Both names refer to the same timestamp: previously one was "6 calendar
# months ago" and the other "180 days ago", so rows dated between the two
# belonged to neither set.
cutoff_date = pd.Timestamp(current_date) - pd.DateOffset(months=6)
six_months_ago = cutoff_date

# Rows strictly older than the boundary.
X_past = X[X.index < cutoff_date]
y_past = y[y.index < cutoff_date]




In [None]:
# Hold-out "future" rows: everything dated on or after six_months_ago.
X_future = X[X.index >= six_months_ago]
y_future = y[y.index >= six_months_ago]

In [None]:
# Display the hold-out labels.
y_future

In [None]:
# 70/30 split of the "past" rows; random_state makes the split repeatable.
# NOTE(review): train_test_split shuffles by default, so test rows can be older
# than training rows — confirm this is intended for date-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X_past, y_past, test_size=0.3, random_state=42)

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hyperparameter grid: regularization strength and regularization type.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type (L1 or L2)
}

# The default solver does not accept penalty='l1', so every l1 candidate in the
# grid failed to fit. 'saga' handles both penalties; it needs more iterations
# than the default 100 to converge.
# NOTE(review): the features are passed unscaled; standardizing them first
# would help convergence — confirm whether that was intended.
logistic_regression = LogisticRegression(random_state=42, solver='saga', max_iter=1000)

# 5-fold cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the refitted best candidate on the held-out test split.
best_logistic_regression = grid_search.best_estimator_
y_pred = best_logistic_regression.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

confusion = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{confusion}")

report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")

In [None]:
#TAKES SOME TIME TO RUN

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Baseline forest with fixed settings, fitted on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Search space for the randomized search.
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    # 'auto' is no longer accepted by current scikit-learn releases (for
    # classifiers it meant 'sqrt'), so candidates using it failed to fit.
    'max_features': ['sqrt', 'log2']
}

# 50 sampled candidates, 3-fold CV, accuracy scoring. random_state makes the
# sampled candidates repeatable between runs.
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist,
                                   n_iter=50, scoring='accuracy', cv=3, n_jobs=-1,
                                   random_state=42)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the refitted best candidate on the held-out test split.
best_rf_classifier = random_search.best_estimator_
y_pred = best_rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Best Model: {accuracy}")


In [None]:
# Score the tuned forest on the hold-out "future" rows.
y_future_pred = best_rf_classifier.predict(X_future)

# Compute all three summaries first, then report them in order.
accuracy = accuracy_score(y_future, y_future_pred)
confusion = confusion_matrix(y_future, y_future_pred)
report = classification_report(y_future, y_future_pred)

print(f"Accuracy with Best Model: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")



In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np

n_splits = 5

model = RandomForestClassifier(random_state=42)

# Forward-chaining folds: each fold trains on earlier rows and tests on later ones.
tscv = TimeSeriesSplit(n_splits=n_splits)

split_date = '2021-01-31'

# NOTE(review): this cell referenced X_resampled / y_resampled, which no cell
# in this notebook defines, so it raised NameError on a fresh kernel. The
# date-indexed feature matrix X and target y are used instead — confirm whether
# a resampling step was meant to come first.
# TimeSeriesSplit relies on row order, so the rows must already be chronological.
assert X.index.is_monotonic_increasing, "rows must be sorted by date"

# Rows before split_date are used for cross-validation and fitting; rows on or
# after it are held out as "future" data.
X_train = X[X.index < split_date]
y_train = y[y.index < split_date]
X_future = X[X.index >= split_date]
y_future = y[y.index >= split_date]

accuracy_scores = []

for train_index, test_index in tscv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model.fit(X_train_fold, y_train_fold)
    predictions = model.predict(X_test_fold)

    accuracy = accuracy_score(y_test_fold, predictions)
    accuracy_scores.append(accuracy)

# Mean accuracy over the validation folds.
average_accuracy = np.mean(accuracy_scores)
print("Average Accuracy on Training Data:", average_accuracy)

# Search space for the randomized search.
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    # 'auto' is no longer accepted by current scikit-learn releases (for
    # classifiers it meant 'sqrt'), so candidates using it failed to fit.
    'max_features': ['sqrt', 'log2']
}

# 50 sampled candidates, validated with the forward-chaining folds.
# random_state makes the sampled candidates repeatable between runs.
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                   n_iter=50, scoring='accuracy', cv=tscv, n_jobs=-1,
                                   random_state=42)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the refitted best candidate on the rows held out by date.
predictions_future = best_model.predict(X_future)

accuracy_future = accuracy_score(y_future, predictions_future)
print("Accuracy on Future Data:", accuracy_future)

confusion = confusion_matrix(y_future, predictions_future)
print(f"Confusion Matrix:\n{confusion}")

report = classification_report(y_future, predictions_future)
print(f"Classification Report:\n{report}")



In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle
import numpy as np

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green', 'purple', 'pink'])

# One indicator column per class label present in y_test.
classes = np.unique(y_test)
y_test_bin = label_binarize(y_test, classes=classes)
n_classes = y_test_bin.shape[1]

# Class label that each indicator column stands for. With exactly two classes
# label_binarize returns a single column for the second (positive) class.
column_classes = classes[1:] if len(classes) == 2 else classes

fpr = {}
tpr = {}
roc_auc = {}

for i, class_label in enumerate(column_classes):
    # Compare against the real class label, not the column position; the two
    # only coincide when the labels happen to be exactly 0..n-1.
    # y_pred holds hard class predictions, so each curve has a single
    # operating point between (0, 0) and (1, 1).
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], (y_pred == class_label).astype(int))
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(8, 6))

for (i, class_label), color in zip(enumerate(column_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve (AUC = {roc_auc[i]:.2f}) for Class {class_label}')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Multiclass')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Plain logistic regression as a train-vs-held-out accuracy comparison.
model = LogisticRegression()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
# Score on the rows held out by date (X_future / y_future). X_test comes from
# an earlier random split of older rows, and X_train is reassigned by the
# time-series cell, so X_test can contain rows from the same period the model
# was fitted on.
y_test_pred = model.predict(X_future)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_future, y_test_pred)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")