In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import uuid

# Function to check and convert values to float
def convert_to_float(value):
    try:
        if isinstance(value, str):
            return float(value.replace(',', ''))
        else:
            return float(value)
    except ValueError:
        # Handle any non-numeric values gracefully
        return np.nan

# Regular expression to remove the timezone name in parentheses
def remove_timezone_name(date_str):
    return re.sub(r'\s*\(.*\)$', '', date_str)


folder_path = f'C:/Raghav Proj/StockFinder/ZirodhaCsvFiles'    

def genarate_png(file_path):
    pattern = r'([A-Za-z]+) \(\d{17} _ \d{17}\)'
    script = re.search(pattern, file_path).group(1)

    uid = uuid.uuid4()
    output_file_path = f'C:\\Raghav Proj\\StockFinder\\Output\\{script}_{uid}.png'    

    # Load data from Excel file
    #file_path = f'C:\\Raghav Proj\\StockFinder\\{fileName}.csv'  # Update with the actual path to your Excel file
    df = pd.read_csv(file_path)

    # # Load data from CSV file
    # file_path = '/mnt/data/stock_data.csv'  # Update with the actual path to your CSV file
    # df = pd.read_csv(file_path)

    # Apply the regex function to the 'Date' column
    df['Date'] = df['Date'].apply(remove_timezone_name)

    # Define the correct date format for hourly data
    date_format = "%a %b %d %Y %H:%M:%S GMT%z"
    
    # Ensure the Date column is in datetime format
    df['Date'] = pd.to_datetime(df['Date'], format=date_format)

    # Apply the conversion function to the 'Close' column
    df['Close'] = df['Close'].apply(convert_to_float)

    # Sort data by date
    df = df.sort_values(by='Date')

    # Preprocess the 'Close' column
    #df['close '] = df['close '].str.replace(',', '').astype(float)

    # Extract the 'Date' and 'Close' columns
    dates = df['Date']
    closing_prices = df['Close']

    # Prepare the data for the regression model
    X = np.array(range(len(dates))).reshape(-1, 1)
    y = closing_prices.values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 

    # Evaluate polynomial regression models up to degree 5
    max_degree = 7
    results = evaluate_polynomial_regression(X_train, y_train, X_test, y_test, max_degree)

    # Print the results
    print("Degree | Train MSE | Test MSE | Train R2 | Test R2")
    for degree, train_mse, test_mse, train_r2, test_r2 in results:
        print(f"{degree:^6} | {train_mse:.4f} | {test_mse:.4f} | {train_r2:.4f} | {test_r2:.4f}")

    # Select the best degree based on the test MSE or R2
    best_degree = min(results, key=lambda x: x[2])[0]  # Select based on test MSE

    print(f"Best polynomial degree: {best_degree}")

    # Fit and predict with the best degree
    poly = PolynomialFeatures(degree=best_degree)
    X_poly = poly.fit_transform(X)
    model = LinearRegression()
    model.fit(X_poly, y)

    # Predict future values
    future_days = 5
    X_future = np.array(range(len(dates), len(dates) + future_days)).reshape(-1, 1)
    X_future_poly = poly.transform(X_future)
    y_future = model.predict(X_future_poly)

    # Adding more closing prices (example)
    additional_dates = pd.date_range(dates.iloc[-1] + pd.Timedelta(days=1), periods=future_days, freq='B')
    additional_closing_prices = y_future  # For the sake of this example, use the predictions

    # Extend the original dates and closing prices with the new data
    extended_dates = pd.concat([dates, pd.Series(additional_dates)])
    extended_closing_prices = np.concatenate([y, additional_closing_prices])

    # Determine the range of closing prices for horizontal lines
    min_price = min(closing_prices.min(), additional_closing_prices.min())
    max_price = max(closing_prices.max(), additional_closing_prices.max())
    horizontal_lines = np.arange(start=np.floor(min_price / 5) * 5, stop=np.ceil(max_price / 5) * 5, step=5)
    # Extend dates for plotting future values
    #extended_dates = pd.date_range(dates.iloc[0], periods=len(dates) + future_days, freq='B')

    # Plotting the data
    plt.figure(figsize=(10, 6))
    plt.plot(dates, closing_prices, label='Historical Closing Prices')
    plt.plot(extended_dates, np.concatenate([y, y_future]), label='Predicted Closing Prices', linestyle='--')

    # Define specific closing prices for horizontal lines (example)
    #horizontal_lines = [100, 120, 140, 160]

    # Add horizontal lines
    # for price in horizontal_lines:
    #     plt.plot(extended_dates, np.full_like(extended_dates, price), '-', color='gray', linewidth=0.7)

    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.title(script + f' Historical and Predicted Closing Prices (Polynomial Degree {best_degree})')
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save the plot as a PNG file
    plt.savefig(output_file_path, format='png')

    # Show the plot
    plt.show()

# Function to evaluate polynomial regression models with different degrees
def evaluate_polynomial_regression(X_train, y_train, X_test, y_test, max_degree):
    results = []
    for degree in range(1, max_degree + 1):
        # Transform features
        poly = PolynomialFeatures(degree)
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)
        
        # Train model
        model = LinearRegression()
        model.fit(X_train_poly, y_train)
        
        # Predict and evaluate
        y_train_pred = model.predict(X_train_poly)
        y_test_pred = model.predict(X_test_poly)
        
        # Calculate metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        results.append((degree, train_mse, test_mse, train_r2, test_r2))
        
    return results

# Read all CSV files in the folder
dfs = []
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        genarate_png(file_path)

AttributeError: 'NoneType' object has no attribute 'group'