In [1]:
import numpy as np
import pandas as pd

### Cholesky decomposition

https://github.com/guillaumeguy/notebooks/blob/main/Correlated_notebooks.ipynb


In [44]:
_N = 1000  # the desired sample size (rows)
_corr = 0.5  # the desired pairwise correlation
_N_DIST = 10  # number of variables to generate (columns)
_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the same draws

# Seed NumPy's global generator so the np.random calls below are repeatable.
np.random.seed(_SEED)

# Draw every independent standard-normal sample in one vectorized call
# (one column per variable) rather than building and concatenating
# _N_DIST separate (_N, 1) arrays.
all_norm = np.random.normal(0, 1, size=(_N, _N_DIST))

print(all_norm.shape)

def generate_cov(dim, corr=None):
    """Build a ``dim`` x ``dim`` equicorrelation matrix.

    Every diagonal entry is 1 and every off-diagonal entry is ``corr``.

    Parameters
    ----------
    dim : int
        Number of rows/columns of the square matrix.
    corr : float, optional
        Off-diagonal value. When omitted, the notebook-level ``_corr``
        setting is used, which matches the original one-argument behaviour.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(dim, dim)``.
    """
    if corr is None:
        # Looked up at call time so later edits to ``_corr`` are honoured.
        corr = _corr
    cov = np.full((dim, dim), corr, dtype=float)
    np.fill_diagonal(cov, 1.0)
    return cov

# Target matrix for the _N_DIST variables and its Cholesky factor C.
cov = generate_cov(_N_DIST)
C = np.linalg.cholesky(cov)

# Y = mean + C.Z with mean 0. Z must hold one variable per row, hence the
# transpose on the way in and again on the way out.
Y = (0 + C.dot(all_norm.T)).T

print("correlation:\n")

# Column-wise empirical correlation of Y, rounded to 2 decimals for display.
np.corrcoef(Y, rowvar=False).round(2)

(1000, 10)
correlation:



array([[1.  , 0.5 , 0.5 , 0.49, 0.52, 0.51, 0.51, 0.5 , 0.51, 0.51],
       [0.5 , 1.  , 0.51, 0.48, 0.5 , 0.5 , 0.53, 0.5 , 0.53, 0.49],
       [0.5 , 0.51, 1.  , 0.5 , 0.48, 0.49, 0.53, 0.49, 0.5 , 0.5 ],
       [0.49, 0.48, 0.5 , 1.  , 0.47, 0.47, 0.49, 0.53, 0.49, 0.49],
       [0.52, 0.5 , 0.48, 0.47, 1.  , 0.46, 0.52, 0.5 , 0.52, 0.52],
       [0.51, 0.5 , 0.49, 0.47, 0.46, 1.  , 0.52, 0.52, 0.5 , 0.5 ],
       [0.51, 0.53, 0.53, 0.49, 0.52, 0.52, 1.  , 0.51, 0.49, 0.51],
       [0.5 , 0.5 , 0.49, 0.53, 0.5 , 0.52, 0.51, 1.  , 0.51, 0.53],
       [0.51, 0.53, 0.5 , 0.49, 0.52, 0.5 , 0.49, 0.51, 1.  , 0.52],
       [0.51, 0.49, 0.5 , 0.49, 0.52, 0.5 , 0.51, 0.53, 0.52, 1.  ]])

### generate correlated columns

https://stackoverflow.com/questions/42902938/create-correlated-pandas-series

In [36]:
from random import randint
from scipy.stats import pearsonr

def fun(x, target=0.8, max_std=5, reference=None):
    """Objective for ``minimize``: distance of a correlation from ``target``.

    Parameters
    ----------
    x : array-like
        Candidate series being optimised.
    target : float, optional
        Pearson correlation the candidate should reach (default 0.8).
    max_std : float, optional
        Spread cap. When ``np.std(x)`` is at or above this value the standard
        deviation itself is returned instead of the correlation gap
        (default 5).
    reference : array-like, optional
        Series to correlate ``x`` against. When omitted, the notebook-level
        ``data['A']`` is used, which matches the original one-argument call.

    Returns
    -------
    float
        ``np.std(x)`` when the spread cap is hit, otherwise
        ``abs(target - r)`` where ``r`` is the Pearson correlation between
        ``reference`` and ``x``.
    """
    spread = np.std(x)
    if spread >= max_std:
        return spread
    if reference is None:
        # Resolved only on this branch, and at call time, so the function can
        # be defined before ``data`` exists.
        reference = data['A']
    return abs(target - pearsonr(reference, x)[0])



In [42]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize

data = pd.DataFrame({'A': [10, 11, 10, 9]})
# Every generated series must be as long as column A; derive the length once
# instead of repeating the literal 4.
n_obs = len(data)

# B: SLSQP search driven by `fun`, started from random integers in [5, 25]
# and with each element bounded to that same interval.
data['B'] = minimize(
    fun,
    [randint(5, 25) for _ in range(n_obs)],
    method='SLSQP',
    bounds=[(5, 25)] * n_obs,
).x

# C: unconstrained search on the correlation gap alone, started from uniform
# random values in [0, 1).
data['C'] = minimize(
    lambda x: abs(0.8 - pearsonr(data['A'], x)[0]),
    np.random.rand(n_obs),
).x

# Print each summary separately; passing both to a single print() joins the
# last covariance row and the first mean row on one line.
print(data.cov())
print(data.mean())
print(data)

          A          B         C
A  0.666667   3.769366  0.457544
B  3.769366  33.300287  1.263944
C  0.457544   1.263944  0.490655 A    10.000000
B    15.983210
C     0.486295
dtype: float64
    A          B         C
0  10  12.739099  1.049307
1  11  20.806738  1.118646
2  10  20.888363  0.031212
3   9   9.498639 -0.253984


### from raw data

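$$Y = \mu + C Z$$

where

- $\mu$ is the desired mean vector,
- $Z$ is a vector of uncorrelated, unit-variance Gaussian random variables,
- $C$ satisfies $C C^{\top} = \Sigma$, the target covariance matrix. To obtain $C$ you may use the Cholesky decomposition of $\Sigma$.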


https://stats.stackexchange.com/questions/255288/how-to-generate-time-series-with-correlation-and-autocorrelation/258140

https://math.stackexchange.com/questions/446093/generate-correlated-normal-random-variables

https://stackoverflow.com/questions/15180515/numpy-random-multivariate-normalmean-cov-size?rq=1

### from existing dataframe

https://stackoverflow.com/questions/58440799/simulate-time-series-satisfying-a-covariance-matrix

https://stats.stackexchange.com/questions/15257/literature-on-generating-similar-synthetic-time-series-from-observed-time-seri

In [31]:
import numpy as np

def get_new_series_with_covariance(df, n_samples=10):
    """Sample synthetic rows that follow the mean and covariance of ``df``.

    Rows are drawn from a multivariate normal distribution parameterised by
    the column means and the covariance matrix of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; its columns must support ``mean()`` and ``cov()``.
    n_samples : int, optional
        Number of synthetic rows to draw. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with the same column labels as ``df``.
    """
    mean = df.mean()
    # A covariance matrix is symmetric, so it needs no transpose.
    cov = df.cov()
    samples = np.random.multivariate_normal(mean, cov, n_samples)
    return pd.DataFrame(data=samples, columns=df.columns)

In [32]:
# Toy input: 10 rows x 4 columns (A-D) of random integers drawn from [0, 100).
df = pd.DataFrame(
    data=np.random.randint(0, 100, size=(10, 4)),
    columns=['A', 'B', 'C', 'D'],
)
# Column means first, then the covariance matrix.
print(df.mean(), df.cov(), sep="\n")
df

A    45.8
B    38.5
C    69.5
D    56.7
dtype: float64
            A           B           C           D
A  772.622222  228.333333 -373.777778  291.155556
B  228.333333  516.944444 -293.388889   34.277778
C -373.777778 -293.388889  775.166667 -468.500000
D  291.155556   34.277778 -468.500000  986.011111


Unnamed: 0,A,B,C,D
0,50,24,87,96
1,53,69,69,22
2,8,15,97,39
3,27,41,63,23
4,15,40,53,92
5,69,60,91,40
6,89,18,35,74
7,59,26,86,60
8,16,16,98,23
9,72,76,16,98


In [33]:
# Draw a synthetic frame from df's mean and covariance, then print the
# synthetic frame's own column means and covariance for comparison.
synth_df = get_new_series_with_covariance(df)
print(synth_df.mean(), synth_df.cov(), sep="\n")
synth_df

A    42.155717
B    32.527531
C    64.950746
D    55.857507
dtype: float64
            A            B           C           D
A  618.688735   680.142938 -127.121861 -219.210740
B  680.142938  1010.345459  -92.851034 -451.362837
C -127.121861   -92.851034  526.122874 -475.863472
D -219.210740  -451.362837 -475.863472  815.837933


Unnamed: 0,A,B,C,D
0,68.472747,57.156896,41.869139,72.967163
1,42.178004,20.579235,75.259331,71.317209
2,13.908759,-18.732136,41.579845,108.666759
3,73.041362,63.720083,82.211976,9.386664
4,31.80806,37.732341,80.553851,31.302106
5,18.510974,30.872672,92.667076,31.536402
6,30.033065,38.778228,32.658314,76.463521
7,17.287799,-17.913377,76.931742,59.603238
8,43.032722,34.441312,85.750978,40.732383
9,83.283679,78.640061,40.025209,56.599627
