# Mean Variance Portfolio Optimization

In [51]:
import pandas as pd
import numpy as np
from sklearn.covariance import *


## Import data

In [41]:
df_raw_all = pd.read_hdf("dow30_full_selected.h5")


In [42]:
df_raw_prices = df_raw_all[["date", "tic", "close"]].rename(columns={"close": "price"})


In [101]:
df_prices = df_raw_prices.pivot(index="date", columns="tic", values="price").loc["2009-01-02":"2020-01-01"]


## Generate returns from prices

In [103]:
# Define functions
class Returns:
    def __init__(self, price_data: pd.DataFrame):
        """
        Initializes the Returns class instance with asset price data

        Parameters
        ----------
        price_data : pd.DataFrame
            Price data of assets
        """
        self.price_matrix = price_data.values.T
        self.assets = price_data.columns
        self.index = price_data.index

    @staticmethod
    def return_formula(
        price_matrix: np.ndarray,
        index: pd.Index,
        roll: bool = True,
        window: int = 1,
        log: bool = False,
    ) -> tuple:
        """"
        Convert price data to return data

        Parameters
        ----------
        price_matrix : np.ndarray
            Matrix of prices
        index : pd.Index
            Dates time scale
        roll : bool, optional
            Rolling return, by default True
        int : int, optional
            Time interval, by default 1
        log : bool, optional
            Continuous or discrete, by default False
        
        Returns
        -------
        tuple
            Returns a tuple of (array of returns, dates)
        """
        step = 1 if roll else window
        shift = window

        return (
            (
                np.log(
                    ((price_matrix / np.roll(price_matrix, shift=shift, axis=1)) - 1)
                )[:, shift::step],
                index[shift::step],
            )
            if log
            else (
                ((price_matrix / np.roll(price_matrix, shift=shift, axis=1)) - 1)[
                    :, shift::step
                ],
                index[shift::step],
            )
        )

    def compute_returns(self, method: str, **kwargs) -> pd.DataFrame:
        """
        Calculates asset returns based on defined method and parameters

        Parameters
        ----------
        method: str
            Options = ["daily", "rolling", "collapse"]
            daily: calculates daily percentage change
            rolling: calculates rolling percentage change based on window, user passes in a parameter window=?
            collapse: calculates percentage change based on window, user passes in a parameter window=?
                e.g.: if window=22, output is the return between each 22 day interval from the beginning
            Aditional option: calculates continuous return by passing in log=True, or discrete otherwise
        **kwargs: arguments passed into return_formula()
            
        Returns
        -------
        pd.DataFrame
            Returns a pandas DataFrame of asset returns
        """
        price_matrix = self.price_matrix
        index = self.index

        if method == "daily":
            return_matrix, return_idx = Returns.return_formula(
                price_matrix, index, window=1, roll=True, **kwargs
            )
        elif method == "rolling":
            return_matrix, return_idx = Returns.return_formula(
                price_matrix, index, roll=True, **kwargs
            )
        elif method == "collapse":
            return_matrix, return_idx = Returns.return_formula(
                price_matrix, index, roll=False, **kwargs
            )
        else:
            print(
                "What is going on? Invalid method! Valid Inputs: daily, rolling, collapse"
            )

        return pd.DataFrame(return_matrix.T, columns=self.assets, index=return_idx)

    def compute_mean_return(
        self, method: str, time_scaling: int = 252, **kwargs
    ) -> pd.Series:
        """
        Calculates mean historical asset returns to be used in mean-variance optimizer

        Parameters
        ----------
        method: str
            Options = ["arithmetic", "geometric"]
            arithmetic: Calculates the arithmetic mean of return, all paramters in compute_returns() can be passed in as additional arguments
            geometric: Calculates the geometric mean from first to last observation

        time_scaling: int, optional
            Annualizes daily mean return, by default 252

        **kwargs: additional arguments if using arithmetic

        Returns
        -------
        pd.Series
            Returns a pandas Series of asset mean returns
        """
        price_matrix = self.price_matrix
        index = self.index

        return_matrix, return_idx = Returns.return_formula(
            price_matrix, index, **kwargs
        )

        if method == "arithmetic":
            mean_return = np.mean(return_matrix, axis=1) * time_scaling
        elif method == "geometric":
            mean_return = (
                (price_matrix[:, -1] / price_matrix[:, 0])
                ** (time_scaling / price_matrix.shape[1])
            ) - 1
        else:
            print(
                "What is going on? Invalid method! Valid Inputs: arithmetic, geometric"
            )

        return pd.Series(mean_return, index=self.assets)

In [104]:
# Call functions
returns_generator = Returns(df_prices)
df_returns = returns_generator.compute_returns(method="daily")
mu_return_geom = returns_generator.compute_mean_return(method="geometric")
mu_return_arit = returns_generator.compute_mean_return(method="arithmetic")

## Compute covariance matrix

In [109]:
# Define functions
class Risks:
    def __init__(self, returns_data: pd.DataFrame):
        """
        Initializes the Risks class instance with asset returns data

        Parameters
        ----------
        returns_data : pd.DataFrame
            Price data of assets
        """
        self.return_matrix = returns_data.values.T
        self.assets = returns_data.columns

    def semi_cov(
        self,
        return_matrix: np.ndarray,
        bm_return: float = 0.0001,
        assume_zero: bool = False,
    ) -> np.ndarray:
        """_summary_

        Parameters
        ----------
        return_matrix : np.ndarray
            Array of returns
        bm_return : float, optional
            Ignores all individual asset returns above the bm_return when calculating covariance, by default 0.0001
        assume_zero : bool, optional
            Long term daily mean return for an individual asset is sometimes assumed to be 0, by default False

        Returns
        -------
        np.ndarray
            Matrix of returns to use in semi-covariance estimation
        """
        return_matrix_copy = return_matrix.copy()

        def adjust_return_vec(return_vec: np.ndarray, bm_return: float):
            return_vec[return_vec >= bm_return] = np.mean(
                return_vec[return_vec < bm_return]
            )

            return return_vec

        return_matrix_copy = (
            np.fmin(return_matrix_copy - bm_return, 0)
            if assume_zero
            else np.apply_along_axis(
                adjust_return_vec, axis=1, arr=return_matrix_copy, bm_return=bm_return
            )
        )

        return return_matrix_copy

    def construct_weights(self, return_matrix: np.ndarray) -> np.ndarray:
        """_summary_

        Parameters
        ----------
        return_matrix : np.ndarray
            Array of returns

        Returns
        -------
        np.ndarray
            Returns array of weights for each asset (can be not all equal, in progress)
        """

        return np.repeat(
            np.divide(1, return_matrix).shape[1], repeats=return_matrix.shape[1]
        )

    @staticmethod
    def find_cov(
        return_matrix: np.ndarray, weight_factor: float, builtin: bool
    ) -> np.ndarray:
        """_summary_

        Parameters
        ----------
        return_matrix : np.ndarray
            Array of returns
        weight_factor : float
            _description_
        builtin : bool
            If True then calls np.cov() to calculate, otherwise use matrix calculation method written in the class

        Returns
        -------
        np.ndarray
            _description_
        """
        return np.cov(return_matrix, aweights=weight_factor)

    def sample_cov(
        self,
        return_matrix: np.ndarray,
        unit_time: int,
        weights: np.ndarray = None,
        builtin: bool = False,
        **kwargs
    ):
        """_summary_

        Parameters
        ----------
        return_matrix : np.ndarray
            _description_
        unit_time : int
            _description_
        weights : np.ndarray, optional
            _description_, by default None
        builtin : bool, optional
            If True then calls np.cov() to calculate, otherwise use matrix calculation method written in the class, by default False

        Returns
        -------
        _type_
            _description_
        """
        weights = self.construct_weights(return_matrix)

        return Risks.find_cov(return_matrix, weights, builtin) * unit_time

    def scikit_cov_technique(
        self, return_mat: np.ndarray, technique: str, time_scaling: int = 252, **kwargs
    ) -> np.ndarray:
        """
        Using sklearn.covariance methods to construct covariance matrix

        Parameters
        ----------
        return_mat : np.ndarray
            Array of returns
        technique : str
            Options to select sklearn.covariance methods
        time_scaling : int, optional
            Annualize covariance matrix (assuming daily input), by default 252

        Returns
        -------
        np.ndarray
            Returns covariance matrix in it's raw form
        """
        technique_dict = {
            "EmpiricalCovariance": EmpiricalCovariance,
            "EllipticEnvelope": EllipticEnvelope,
            "GraphicalLasso": GraphicalLasso,
            "GraphicalLassoCV": GraphicalLassoCV,
            "LedoitWolf": LedoitWolf,
            "MinCovDet": MinCovDet,
            "OAS": OAS,
            "ShrunkCovariance": ShrunkCovariance,
        }

        try:
            return (
                technique_dict[technique](**kwargs).fit(return_mat.T).covariance_
                * time_scaling
            )
        except KeyError:
            print(
                "What is going on? Invalid technique! Valid inputs: EmpiricalCovariance, EllipticEnvelope, GraphicalLasso, GraphicalLassoCV, LedoitWolf, MinCovDet, OAS, ShrunkCovariance"
            )

    def compute_cov_matrix(
        self,
        technique: str = "sample",
        semi: bool = False,
        time_scaling: int = 252,
        builtin: bool = False,
        weights: np.ndarray = None,
        bm_return: float = 0.00025,
        assume_zero: bool = False,
        normalize: bool = False,
        **kwargs
    ) -> pd.DataFrame:
        """_summary_

        Parameters
        ----------
        technique : str, optional
            additional_options: ["EmpiricalCovariance", "EllipticEnvelope", "GraphicalLasso", "GraphicalLassoCV",
                                    "LedoitWolf", "MinCovDet", "OAS", "ShrunkCovariance"]
            Specifies the calculation technique for the covariance matrix, by default "sample"
        semi : bool, optional
            If True, returns a semivariance matrix that emphasizes on downside portfolio variance, by default False
        time_scaling : int, optional
            Default annualizes the covariance matrix (assuming daily return is the input), by default 252
        builtin : bool, optional
            If True then calls np.cov() to calculate, otherwise use matrix calculation method written in the class, by default False
        weights : np.ndarray, optional
            _description_, by default None
        bm_return : float, optional
            dditional parameter for calculating semivariance matrix, by default 0.00025
        assume_zero : bool, optional
            Long term daily mean return for an individual asset is sometimes assumed to be 0, by default False
        normalize : bool, optional
            To normalize the covariance matrix. In the specific case for covariance matrix, a normalized covariance
            matrix is a correlation matrix, by default False

        Returns
        -------
        pd.DataFrame
            Returns the covariance matrix as a pandas DataFrame
        """
        return_matrix = self.return_matrix

        if semi:
            return_matrix = self.semi_cov(
                return_matrix, bm_return=bm_return, assume_zero=assume_zero
            )

        if technique == "sample":
            cov_matrix = self.sample_cov(
                return_matrix, time_scaling, builtin=builtin, weights=weights, **kwargs
            )
        else:
            cov_matrix = self.scikit_cov_technique(
                return_matrix, technique, time_scaling, **kwargs
            )

        if normalize:
            cov_mat = cov_mat * np.dot(
                ((np.diag(cov_mat)) ** -0.5).reshape(-1, 1),
                ((np.diag(cov_mat)) ** -0.5).reshape(1, -1),
            )

        return pd.DataFrame(cov_matrix, index=self.assets, columns=self.assets)


In [111]:
cov_generator = Risks(df_returns)
cov_matrix = cov_generator.compute_cov_matrix()

  np.divide(1, return_matrix).shape[1], repeats=return_matrix.shape[1]


## Construct optimizer

## Define objective function

## Define portfolio constraints

## Solve and check summary