In [58]:
def load_data_csv_by_file(path: str, sep: str, encoding: str) -> Dict[str, pd.DataFrame]:
    """
    Load every CSV file located directly inside a directory.

    Only regular files whose name ends with '.csv' are read; sub-directories
    are not traversed and other entries are ignored.

    Parameters:
        path (str): The path of the directory containing the CSV files.
        sep (str): The separator character used in the CSV files.
        encoding (str): The encoding of the CSV files.

    Returns:
        Dict[str, pd.DataFrame]: A dictionary where each key is the name of a CSV file with its trailing
                                  '.csv' extension removed and each value is the dataframe read from that
                                  file. Entries are inserted in sorted file-name order.
    """
    extension = '.csv'
    df_dic = {}
    # os.listdir returns entries in arbitrary order; sort for a reproducible result.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(extension):
            continue
        file_path = os.path.join(path, filename)
        # A directory may also be named "*.csv"; read_csv cannot open it.
        if not os.path.isfile(file_path):
            continue
        # Strip only the trailing extension: str.replace would also delete any
        # '.csv' occurring in the middle of the name (e.g. 'a.csv.bak.csv').
        key = filename[:-len(extension)]
        df_dic[key] = pd.read_csv(file_path, sep=sep, encoding=encoding)
    return df_dic

In [57]:
def reverse_dict(dic:dict) -> dict:
    '''
        Invert a mapping whose values are iterables (typically lists).

        Arg: dic = {key: value}, where each value is an iterable of hashable items;
        Return a new dictionary mapping every item found in a value back to the
        key it was listed under. If an item appears under several keys, the key
        visited last wins.
    '''
    inverted = {}
    for original_key, members in dic.items():
        for member in members:
            inverted[member] = original_key
    return inverted

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

def cross_val(X_train: list, y_train: list, n_splits: int, clf=None) -> float:
    """
    Perform cross validation on the given dataset using StratifiedKFold method.

    A fresh clone of the classifier is fitted on the training part of every
    fold and evaluated on the held-out part. Correct predictions are pooled
    over all folds, so every sample contributes exactly once to the score.

    Parameters:
        X_train (list): Training data. Must support indexing with an integer
            index array (e.g. a NumPy array), because the folds are selected
            with ``X_train[train_index]``.
        y_train (list): Labels for the training data, indexable the same way.
        n_splits (int): Number of folds for cross validation
        clf: Optional estimator to clone and evaluate. When omitted, the
            module-level ``sgd_clf`` is used.

    Returns:
        float: Proportion of correct predictions over all folds
    """
    base_clf = sgd_clf if clf is None else clf
    skfolds = StratifiedKFold(n_splits)

    # Accumulate across folds; keeping only the last fold's counts would
    # report the accuracy of a single fold instead of the whole run.
    n_correct = 0
    n_total = 0
    for train_index, test_index in skfolds.split(X_train, y_train):
        # Clone so that no fitted state leaks from one fold to the next.
        clone_clf = clone(base_clf)
        X_train_folds = X_train[train_index]
        y_train_folds = y_train[train_index]
        X_test_fold = X_train[test_index]
        y_test_fold = y_train[test_index]

        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        n_correct += sum(y_pred == y_test_fold)
        n_total += len(y_pred)

    return float(n_correct / n_total)

In [None]:
import pandas as pd
import yfinance as yf

def extract_prices(stock: str, start_date: str, end_date: str) -> pd.DataFrame:
    """
    Download historical prices for one stock symbol through yfinance.

    The call is forwarded to ``yf.download`` and its result is returned unchanged.

    Parameters:
    stock (str): The stock symbol to fetch prices for.
    start_date (str): First date of the requested period, formatted 'YYYY-MM-DD'.
    end_date (str): Last date of the requested period, formatted 'YYYY-MM-DD'.

    Returns:
    pd.DataFrame: The price data returned by ``yf.download``.
    """
    date_window = {"start": start_date, "end": end_date}
    prices = yf.download(stock, **date_window)
    return prices

def to_csv(data: pd.DataFrame, path: str, index: bool = False) -> None:
    """
    This function saves the data of a given DataFrame to a csv file at the specified path.
    It does not return anything.

    Parameters:
    data (pd.DataFrame): The DataFrame containing the data to save.
    path (str): The path and file name of the CSV file to save the data to.
    index (bool): Whether to write the DataFrame index as the first column(s).
        Defaults to False, which drops the index. Pass True when the index
        carries information that must be kept (for example a date index).
    """
    data.to_csv(path, index=index)

In [None]:
import matplotlib
def plot_precision_recall_vs_threshold(precisions: list, recalls: list, thresholds: list) -> None:
    """
    Plot precision and recall against the decision threshold using matplotlib.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so both sequences must contain exactly one more element than
    ``thresholds`` (presumably the layout produced by
    ``sklearn.metrics.precision_recall_curve`` - confirm against the caller).
    The figure is displayed with ``plt.show()``.

    Parameters:
        precisions (list): precision values, one per threshold plus one trailing element.
        recalls (list): recall values, one per threshold plus one trailing element.
        thresholds (list): threshold values, used as the x axis.

    Returns:
        None
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.xlabel('Threshold')
    plt.legend(loc='center left')
    plt.ylim([0, 1])
    # Pass a real boolean: a string such as 'on' only works because it is truthy.
    plt.grid(True)
    plt.show()

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """
    Plot a ROC (Receiver Operating Characteristic) curve using matplotlib.

    The curve shows the true positive rate (TPR) against the false positive
    rate (FPR). A dashed diagonal from (0, 0) to (1, 1) is drawn as a
    reference line, both axes are limited to [0, 1], and the figure is
    displayed with ``plt.show()``.

    Parameters:
        fpr (list): false positive rates.
        tpr (list): true positive rates.
        label (str, optional): label for the ROC curve. When given, a legend
            is drawn so that the label is actually visible.

    Returns:
        None
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal used as the visual reference line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Pass a real boolean: a string such as 'on' only works because it is truthy.
    plt.grid(True)
    if label is not None:
        # The figure is shown right below, so the caller has no later chance
        # to add the legend itself.
        plt.legend(loc='lower right')
    plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_matrix_confusion(y_true, y_pred, model_class):
    """
    Plot the confusion matrix of a set of predictions.

    The matrix is computed with ``confusion_matrix`` and drawn with
    ``ConfusionMatrixDisplay``, both from sklearn.metrics.

    Parameters:
        y_true (list): true values.
        y_pred (list): values predicted by the model.
        model_class (list): class names passed as ``display_labels`` to label
            the rows and columns of the plot.

    Returns:
        None
    """
    matrix = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=model_class,
    )
    display.plot()

In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X: list, y: list, n_test: float) -> None:
    """
    Plot learning curves for a given model

    The data is split into a training and a validation set. The model is then
    refitted on the first ``m`` training samples for every ``m`` from 1 up to
    and including the full training set size, and the RMSE on those ``m``
    samples and on the whole validation set is plotted against ``m``.
    The function does not call ``plt.show()``.

    Parameters:
        model : The model to be trained and evaluated. It is refitted in place,
            so on return it is fitted on the full training split.
        X : list of feature data
        y : list of labels for the feature data
        n_test : a float, the proportion of the data to use as the validation set.

    Returns:
        None
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=n_test)
    # Include len(X_train) itself so the last point uses the whole training set.
    train_sizes = list(range(1, len(X_train) + 1))
    train_errors, val_errors = [], []
    for m in train_sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10,5))
    # Plot against the real training set sizes so the x axis matches its label
    # (the default x values would be the 0-based position in the list).
    plt.plot(train_sizes, np.sqrt(train_errors), 'r-+', linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(val_errors), 'b-', linewidth=3, label="val")
    # Pass a real boolean: a string such as 'on' only works because it is truthy.
    plt.grid(True)
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)