In [None]:
# https://towardsdatascience.com/pandas-groupby-a-simple-but-detailed-tutorial-314b8f37005d
# https://towardsdatascience.com/accessing-data-in-a-multiindex-dataframe-in-pandas-569e8767201d
# https://towardsdatascience.com/summarizing-data-with-pandas-crosstab-efc8b9abecf
# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/
# https://datascientyst.com/sort-multiindex-pandas/
# https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/
#### pandas function basics ####
# https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#function-application

In [20]:
# pandas apply function to df
# https://stackoverflow.com/questions/13331698/how-to-apply-a-function-to-two-columns-of-pandas-dataframe
# df['col_3'] = df.apply(lambda x: f(x.col_1, x.col_2), axis=1)

import pandas as pd

mylist = list('abcdef')
df = pd.DataFrame(
    {
        'ID': ['1', '2', '3'],
        'col_1': [0, 2, 3],
        'col_2': [1, 4, 5],
    }
)


def get_sublist(sta, end):
    """Return the items of ``mylist`` from position ``sta`` through ``end`` (inclusive)."""
    stop = end + 1
    return mylist[sta:stop]


def _row_to_sublist(row):
    """Row-wise adapter: pass one row's col_1 / col_2 values to get_sublist."""
    return get_sublist(row.col_1, row.col_2)


# axis=1 hands each row to the function; the returned lists become column col_3
df['col_3'] = df.apply(_row_to_sublist, axis=1)
df

Unnamed: 0,ID,col_1,col_2,col_3
0,1,0,1,"[a, b]"
1,2,2,4,"[c, d, e]"
2,3,3,5,"[d, e, f]"


In [21]:
# pandas pipe 
import pandas as pd

def extract_city_name(df):
    """
    Add a ``city_name`` column holding the text before the first comma of
    ``city_and_code`` (e.g. "Chicago, IL" -> "Chicago").

    Returns a new DataFrame; the frame passed in is not modified, which keeps
    the function safe to use as a step in a ``.pipe`` chain.
    """
    return df.assign(city_name=df["city_and_code"].str.split(",").str.get(0))


def add_country_name(df, country_name=None, sep=""):
    """
    Add a ``city_and_country`` column built as ``city_name + sep + country_name``
    (e.g. "Chicago" -> "ChicagoUS"; with ``sep="-"`` -> "Chicago-US").

    Parameters
    ----------
    df : DataFrame with a ``city_name`` column.
    country_name : str, required in practice (the ``None`` default is rejected).
    sep : str, text placed between city and country; empty by default.

    Returns a new DataFrame; the frame passed in is not modified.

    Raises
    ------
    TypeError
        If ``country_name`` is not a string.
    """
    if not isinstance(country_name, str):
        raise TypeError(
            f"country_name must be a str, got {type(country_name).__name__}"
        )
    col = "city_name"
    return df.assign(city_and_country=df[col] + sep + country_name)


df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})
# the helpers return new frames, so keep the result of the chain
df_p = df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")
df_p

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [None]:
# put dict with variable_name:variable_value pairs into memory
# convert dictionary entries into variables
# https://stackoverflow.com/questions/18090672/convert-dictionary-entries-into-variables?noredirect=1&lq=1
d = {'a': 1, 'b': 2}
# globals() is the live module namespace, so updating it creates real variables
# (this also works when called from inside a function).
# locals().update(d) is intentionally not used: at module level it is the same
# dict as globals(), and inside a function the update has no effect.
globals().update(d)
print(f'a: {a}')


In [None]:
# pd pandas display options
# apply every display setting from one mapping
for _option, _value in {
    "display.max_rows": 10,
    "display.max_columns": 30,
    "display.max_colwidth": 20,
    "display.width": 500,  # code-runner format
}.items():
    pd.set_option(_option, _value)

In [None]:
# https://stackoverflow.com/questions/4019639/splitting-a-dictionary-in-python-into-keys-and-values
# dic dict: create a dictionary from a list of keys and a list of values, then
# extract the dict's keys and values back into separate lists
# use case: store pickled run results in a dict for future verification tests
# tip: use an editor's find / replace on _vals to wrap each name in "" and get _keys
_keys = ['df_eval_results', 'df', 'max_lookback_slices']
_vals = [df_eval_results, df, max_lookback_slices]
_d = {_k: _v for _k, _v in zip(_keys, _vals)}
_l_keys, _l_vals = [*_d], [*_d.values()]
print(_l_keys)
print(_l_vals)




In [None]:
# sort / sorted: sorted() builds a new list, so the inner lists of old_list stay unchanged
old_list = [[3,2,1], [6,5,4]]
new_list = [sorted(inner) for inner in old_list]

print(f'old_list: {old_list}')
print(f'new_list: {new_list}')

In [1]:
# f-string formatting: alignment, spacing and repeated characters
# "<14" left-justifies and ">14" right-justifies inside a 14-character field
print(f'{"hello":<14}{"good bye":>14}')
# an expression such as "%"*10 is evaluated inside the braces
print(f'{"%"*10}')
# NOTE(review): `i` and `l_sorted_days_lookbacks` are not defined in this cell;
# they must already exist in the kernel, otherwise this line fails
print(f'\n\n{"%"*40:<42}{i+1} of {len(l_sorted_days_lookbacks)} days_lookbacks{"%"*40:>42}\n')

hello               good bye
%%%%%%%%%%


In [None]:
# zip two lists together into a dict: one list holds the keys, the other the
# matching values at the same positions
keys_sample = ["left", "right", "up", "down"]
values_sample = ["one", "two", 0, None]
# zip yields (key, value) pairs; the comprehension turns them into the dict
together = {key: value for key, value in zip(keys_sample, values_sample)}

In [None]:
# f-string: keep a consistent column layout across printed lines
# NOTE(review): my_date, my_days_lookback, my_col and my_list are not defined in
# this cell and must already exist in the kernel
blank = ' '
# first line: date in a 14-wide field, lookback list in a 21-wide field
print(f'\n{my_date:<14}{str(my_days_lookback):<21}{my_col}: {my_list}')
# follow-up line: a blank fills the 14-wide date field so the other columns line up
print(f'{blank:<14}{str(my_days_lookback):<21}{my_col}: {my_list}')    

In [None]:
# df filter with multiple conditions: build one boolean mask per condition and
# combine them with & (and) / | (or)
# error messages met while getting this right:
#   Cannot perform 'rand_' with a dtyped [object] array and scalar of type [bool]
#   The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
_date_matches = df['date_end_df_train'].eq(my_date)
_lookbacks_match = df['days_lookbacks'].eq('[15, 30, 60, 120]')
my_row = _date_matches & _lookbacks_match

In [3]:
# f-string: format and right-justify a list
my_list = [20, 60, 120]
# convert to str first so the ">15" alignment applies to the list's text form
print(f'my_list: {str(my_list):>15}')

my_list:   [20, 60, 120]


In [1]:
# https://stackoverflow.com/questions/23111990/pandas-dataframe-stored-list-as-string-how-to-convert-back-to-list
# convert a list that was stored as a string (e.g. in a DataFrame cell) back to a list
# Pandas DataFrame stored list as string: How to convert back to list
from ast import literal_eval
my_str = "['a', 'b']"
# literal_eval parses the string as a Python literal and returns the object
my_list = literal_eval(my_str)
print(my_list)
print(type(my_list))
my_list

['a', 'b']
<class 'list'>


['a', 'b']

In [None]:
# reset index pandas drop index column
# drop=True discards the old index instead of inserting it as a new column
df = df.reset_index(drop=True)

In [None]:
# https://stackoverflow.com/questions/71757911/select-multiple-separately-columns-by-iloc-in-pandas
# np.r_ joins the single position 1 and the range 16:42 into one array of
# column positions for iloc; head(10) shows the first 10 rows
g_tbl.iloc[:, np.r_[1, 16:42]].head(10)

In [None]:
# https://www.geeksforgeeks.org/how-to-replace-values-in-column-based-on-condition-in-pandas/
# set column 'run_type' to 100 in every row where it currently equals 'train'
gdf.loc[gdf['run_type'] == 'train', 'run_type'] = 100

# returns the rows (as a DataFrame) where column 'run_type' equals 'train'
tbl_concat.loc[tbl_concat['run_type'] == 'train']

In [15]:
# https://stackoverflow.com/questions/62697810/replace-specific-values-in-multiindex-dataframe
import pandas as pd
# Demo: set the first row of every (level 0, level 1) group of a MultiIndex frame to 0
df = pd.DataFrame({'col1': [14., 4., 1., 4., 1., 1., 3.],
                   'col2': [ 87.346878, 87.347504, 123.110001, 209.612503, 68.540001, 64.370003, 75.]},
                   index = pd.MultiIndex.from_tuples(([('A', 1, '2017-04-01'), ('A', 1, '2017-06-01'),
                                                       ('A', 2, '2014-08-01'), ('A', 2, '2015-01-01'),
                                                       ('B', 3, '2014-07-01'), ('B', 3, '2014-12-01'),
                                                       ('B', 4, '2015-01-01')])))
_df = df.copy()  # untouched copy for the generic variant at the end
print(df,'\n')
print('='*15)

# iterating a groupby yields (key, group frame) pairs, so print the frame directly
grouped_df = df.groupby(level=[0,1])
for _key, group in grouped_df:
    print(group, "\n")
print('='*15)

# .indices maps each group key to the row positions that belong to the group
print(f'df.groupby(level=[0,1]).indices.values():\n{df.groupby(level=[0,1]).indices.values()}')
print('='*15)

print(f'df.iloc[[0,2,4,6]]:\n{df.iloc[[0,2,4,6]]}')
print('='*15)

# variant 1: hard-coded positions of each group's first row
df.iloc[[0,2,4,6]]=0
print(f'df.iloc[[0,2,4,6]]=0:\n{df}')
print('='*15)

# variant 2: derive the first position of each group from .indices
_first_rows = [positions[0] for positions in _df.groupby(level=[0, 1]).indices.values()]
_df.iloc[_first_rows] = 0
print(f'_df.iloc[[a[0] for a in _df.groupby(level=[0, 1]).indices.values()]] = 0:\n{_df}')

                col1        col2
A 1 2017-04-01  14.0   87.346878
    2017-06-01   4.0   87.347504
  2 2014-08-01   1.0  123.110001
    2015-01-01   4.0  209.612503
B 3 2014-07-01   1.0   68.540001
    2014-12-01   1.0   64.370003
  4 2015-01-01   3.0   75.000000 

                col1       col2
A 1 2017-04-01  14.0  87.346878
    2017-06-01   4.0  87.347504 

                col1        col2
A 2 2014-08-01   1.0  123.110001
    2015-01-01   4.0  209.612503 

                col1       col2
B 3 2014-07-01   1.0  68.540001
    2014-12-01   1.0  64.370003 

                col1  col2
B 4 2015-01-01   3.0  75.0 

df.groupby(level=[0,1]).indices.values():
dict_values([array([0, 1], dtype=int64), array([2, 3], dtype=int64), array([4, 5], dtype=int64), array([6], dtype=int64)])
df.iloc[[0,2,4,6]]:
                col1        col2
A 1 2017-04-01  14.0   87.346878
  2 2014-08-01   1.0  123.110001
B 3 2014-07-01   1.0   68.540001
  4 2015-01-01   3.0   75.000000
df.iloc[[0,2,4,6]]=0:
         

In [None]:
# print groupby object pandas, level 0,1,2 happens to be the row indices
grouped_df = tbl_concat.groupby(level=[0,1,2])

# iterating a groupby already yields each group's frame, so print it directly
# instead of looking it up again with get_group(key)
for _key, group in grouped_df:
    print(group, "\n\n")

In [None]:
# list sorted unique symbols
_distinct_symbols = set(my_symbols)  # a set keeps each symbol once
my_unique_symbols = sorted(_distinct_symbols)  # ascending
my_unique_symbols = sorted(_distinct_symbols, reverse=True)  # descending

In [None]:
# https://stackoverflow.com/questions/17071871/how-do-i-select-rows-from-a-dataframe-based-on-column-values

In [None]:
# format number
# field specs used: ">4" right-justify in 4 chars, "13" minimum width of 13,
# ">10.5f" / ">7.5f" right-justified fixed-point with 5 decimals
# NOTE(review): symbol, sDate, sClose, yhClose and abs_pct_diff must already exist in the kernel
print(f'symbol:  {symbol:>4}   Date: {sDate:13}df_Close: {sClose:>10.5f} Yahoo_Close: {yhClose:>10.5f}   %_dif_Close: {abs_pct_diff:>7.5f}')

In [1]:
# pandas iloc slice, first and last n rows
import pandas as pd
import numpy as np
# Demo of positional selection with iloc on a 10-row, date-indexed frame.
# create a range of dates
dates = pd.date_range(start='2022-01-01', end='2022-01-10')
# create a DataFrame with dates as the index
df = pd.DataFrame(index=dates)
# add a column of integers to the DataFrame
df['col'] = np.arange(1, 11, 1, dtype=int)

print(f'len(df): {len(df)}')
print(f'df:\n{df}\n')

# a scalar position returns a Series; a list or slice returns a DataFrame
print(f'Return the first row')
print(f'df.head(1):\n{df.head(1)}\n')
print(f'df.iloc[-len(df)] returns the first row as a series:\n{df.iloc[-len(df)]}\n')
print(f'df.iloc[-len(df) : -len(df)+1] returns the first row as a df:\n{df.iloc[-len(df) : -len(df)+1]}\n')
print(f'df.iloc[0] returns the first row as a series:\n{df.iloc[0]}\n')
print(f'df.iloc[[0]] returns the first row as a df:\n{df.iloc[[0]]}\n')
print(f'df.iloc[0:1] returns the first row as a df:\n{df.iloc[0:1]}\n')

print(f'Return the last row')
print(f'df.tail(1):\n{df.tail(1)}\n')
print(f'df.iloc[len(df)-1] returns the last row as a series:\n{df.iloc[len(df)-1]}\n')
print(f'df.iloc[len(df)-1 : len(df)] returns the last row as a df:\n{df.iloc[len(df)-1 : len(df)]}\n')
print(f'df.iloc[-1] returns the last row as a series:\n{df.iloc[-1]}\n')
print(f'df.iloc[[-1]] returns the last row as a df:\n{df.iloc[[-1]]}\n')
print(f'df.iloc[-1:] returns the last row as a df:\n{df.iloc[-1:]}\n')

print(f'Return a slice')
print(f'df.iloc[1:2] returns the second row as a df:\n{df.iloc[1:2]}\n')
print(f'df.iloc[-2:-1] returns the second to the last row as a df:\n{df.iloc[-2:-1]}\n')
print(f'df.iloc[:2] returns the first two rows as a df:\n{df.iloc[:2]}\n')
print(f'df.iloc[-2:] returns the last two rows as a df:\n{df.iloc[-2:]}\n')

print(f'Return all rows')
print(f'df returns the all rows as a df:\n{df}\n')
print(f'df.iloc[0 : len(df)] returns the all rows as a df:\n{df.iloc[0 : len(df)]}\n')
print(f'df.iloc[0 : len(df)+1] returns the all rows as a df without warning out-of-bound:\n{df.iloc[0 : len(df)+1]}\n')
print(f'df.iloc[-len(df):] returns all rows as a df:\n{df.iloc[-len(df):]}\n')

print(f'ERROR: df.iloc[len(df)] will raise IndexError: single positional indexer is out-of-bounds')
# the exception is the demonstration: catch and show it so the cell (and a
# Restart & Run All) still completes
try:
    print(f'{df.iloc[len(df)]}')
except IndexError as err:
    print(f'IndexError: {err}')


len(df): 10
df:
            col
2022-01-01    1
2022-01-02    2
2022-01-03    3
2022-01-04    4
2022-01-05    5
2022-01-06    6
2022-01-07    7
2022-01-08    8
2022-01-09    9
2022-01-10   10

Return the first row
df.head(1):
            col
2022-01-01    1

df.iloc[-len(df)] returns the first row as a series:
col    1
Name: 2022-01-01 00:00:00, dtype: int32

df.iloc[-len(df) : -len(df)+1] returns the first row as a df:
            col
2022-01-01    1

df.iloc[0] returns the first row as a series:
col    1
Name: 2022-01-01 00:00:00, dtype: int32

df.iloc[[0]] returns the first row as a df:
            col
2022-01-01    1

df.iloc[0:1] returns the first row as a df:
            col
2022-01-01    1

Return the last row
df.tail(1):
            col
2022-01-10   10

df.iloc[len(df)-1] returns the last row as a series:
col    10
Name: 2022-01-10 00:00:00, dtype: int32

df.iloc[len(df)-1 : len(df)] returns the last row as a df:
            col
2022-01-10   10

df.iloc[-1] returns the last row

IndexError: single positional indexer is out-of-bounds

In [2]:
# a slice that starts at len(df) gives an empty index instead of raising
df.index[10:10]

DatetimeIndex([], dtype='datetime64[ns]', freq='D')

In [16]:
# first / last index value formatted as 'yyyy-mm-dd'
# alternative via head / tail, kept for reference:
# print(df.head(1).index[0].strftime('%Y-%m-%d'))
# print(df.tail(1).index[0].strftime('%Y-%m-%d'))
print(df.index[0].strftime('%Y-%m-%d'))
print(df.index[-1].strftime('%Y-%m-%d'))

2022-01-01
2022-01-10


In [78]:
print(f'len(df): {len(df)}\ndf:\n{df}\n')
len_df = len(df)

# Walk positions 0..12: a position inside the frame prints its own date, a
# position equal to len(df) is clamped to the last row, anything larger stops the run.
for iloc_pos in range(13):
    if iloc_pos > len_df:
        msg_stop = f'ERROR iloc_pos {iloc_pos} must be <= len(df) {len(df)}'
        raise SystemExit(msg_stop)
    row_pos = min(iloc_pos, len_df - 1)
    _date = df.index[row_pos].strftime('%Y-%m-%d')
    print(f'iloc_pos: {iloc_pos}, date: {_date}')

len(df): 10
df:
            col
2022-01-01    1
2022-01-02    2
2022-01-03    3
2022-01-04    4
2022-01-05    5
2022-01-06    6
2022-01-07    7
2022-01-08    8
2022-01-09    9
2022-01-10   10

iloc_pos: 0, date: 2022-01-01
iloc_pos: 1, date: 2022-01-02
iloc_pos: 2, date: 2022-01-03
iloc_pos: 3, date: 2022-01-04
iloc_pos: 4, date: 2022-01-05
iloc_pos: 5, date: 2022-01-06
iloc_pos: 6, date: 2022-01-07
iloc_pos: 7, date: 2022-01-08
iloc_pos: 8, date: 2022-01-09
iloc_pos: 9, date: 2022-01-10
iloc_pos: 10, date: 2022-01-10


SystemExit: ERROR iloc_pos 11 must be <= len(df) 10

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# file name with today's date
# Import `date` here so the cell runs on a fresh kernel and does not depend on
# what the name `datetime` is bound to (module vs. class) in other cells.
from datetime import date

str_today = date.today().isoformat()  # 'yyyy-mm-dd'
fp_results = f'yf_8_results_{str_today}'

In [None]:
# https://stackoverflow.com/questions/19798229/how-to-do-group-by-on-a-multiindex-in-pandas
import pandas
# sample data: one record per ad category for a single date
df = pandas.DataFrame.from_dict(
    {
     'category': {0: 'Love', 1: 'Love', 2: 'Fashion', 3: 'Fashion', 4: 'Hair', 5: 'Movies', 6: 'Movies', 7: 'Health', 8: 'Health', 9: 'Celebs', 10: 'Celebs', 11: 'Travel', 12: 'Weightloss', 13: 'Diet', 14: 'Bags'},
     'impressions': {0: 380, 1: 374242, 2: 197, 3: 13363, 4: 4, 5: 189, 6: 60632, 7: 269, 8: 40189, 9: 138, 10: 66590, 11: 2227, 12: 22668, 13: 21707, 14: 229},
     'date': {0: '2013-11-04', 1: '2013-11-04', 2: '2013-11-04', 3: '2013-11-04', 4: '2013-11-04', 5: '2013-11-04', 6: '2013-11-04', 7: '2013-11-04', 8: '2013-11-04', 9: '2013-11-04', 10: '2013-11-04', 11: '2013-11-04', 12: '2013-11-04', 13: '2013-11-04', 14: '2013-11-04'},
     'cpc_cpm_revenue': {0: 0.36823, 1: 474.81522000000001, 2: 0.19434000000000001, 3: 18.264220000000002, 4: 0.00080000000000000004, 5: 0.23613000000000001, 6: 81.391139999999993, 7: 0.27171000000000001, 8: 51.258200000000002, 9: 0.11536, 10: 83.966859999999997, 11: 3.43248, 12: 31.695889999999999, 13: 28.459320000000002, 14: 0.43524000000000002},
     'clicks': {0: 0, 1: 183, 2: 0, 3: 9, 4: 0, 5: 1, 6: 20, 7: 0, 8: 21, 9: 0, 10: 32, 11: 1, 12: 12, 13: 9, 14: 2},
     'size': {0: '300x250', 1: '300x250', 2: '300x250', 3: '300x250', 4: '300x250', 5: '300x250', 6: '300x250', 7: '300x250', 8: '300x250', 9: '300x250', 10: '300x250', 11: '300x250', 12: '300x250', 13: '300x250', 14: '300x250'}
    }
)
# (date, category) becomes a two-level row index
df = df.set_index(['date', 'category'])
print(df.head())

# Keep and show the aggregates (groupby returns a new frame; df itself is not
# changed). numeric_only=True keeps the text column 'size' out of the sums.
sum_by_date = df.groupby(level=[0]).sum(numeric_only=True)
print(sum_by_date)

sum_by_date_category = df.groupby(level=[0, 1]).sum(numeric_only=True)
print(sum_by_date_category.head())

In [None]:
# pandas multi level index sort df
# without inplace=True or re-assignment, sort_values returns a new sorted frame;
# here that result is not kept, so _tbt_concat itself is unchanged by this line
_tbt_concat.sort_values(by=["days_lookbacks", "days_eval", "sym_freq_cnt"])
# inplace=True sorts _tbt_concat itself, in descending order of the rank column
_tbt_concat.sort_values(by="Rank_d_CAGR/UI_m/s", ascending=False, inplace=True)

In [None]:
# force pip install in .venv
# The original line was a bare shell command, which is not valid Python in a
# code cell. Run pip through the interpreter that backs this kernel instead
# (sys.executable), so no machine-specific path is needed.
# NOTE(review): assumes the kernel is running from the target .venv - confirm.
# Terminal equivalent: <venv>/Scripts/python.exe -m pip install ipykernel -U --force-reinstall
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "ipykernel", "-U", "--force-reinstall"],
    check=True,  # raise CalledProcessError if pip exits with an error
)

In [None]:
# jupyter version run from notebook
# the leading "!" sends the command to the system shell
!jupyter --version

In [None]:
# python version running in notebook
from platform import python_version
# prints the version of the interpreter that runs this kernel
print(python_version())

In [None]:
# Pandas drop_duplicates method not working on dataframe containing lists, drop duplicate rows
#  error: TypeError: unhashable type: 'list'
# cast every column to str so list-valued cells can be compared, then
# de-duplicate on the two key columns
_dedup_keys = ['date', 'days_lookbacks']
_df = df.astype(str).drop_duplicates(subset=_dedup_keys)
_df

In [None]:
# split df by column position: everything but the last column vs. only the last column
df_eval_data_test = df.iloc[:, :-1]  # drop last column
df_eval_SPY_test = df.iloc[:, -1:]  # keep only last column

In [None]:
# How to loop backwards in python? [duplicate]
# counts down 6, 5, 4, 3
for x in reversed(range(3, 7)):
    print(x)

In [None]:
# https://stackoverflow.com/questions/49700794/selecting-rows-of-pandas-dataframe-where-column-is-not-empty-list
# Selecting rows of pandas DataFrame where column is not empty list
# astype(bool) turns each cell of column my_list into a True/False mask value
# (an empty list is falsy), and the mask selects the rows
df[df.my_list.astype(bool)]

In [None]:
# list duplicate in list
# https://stackoverflow.com/questions/9835762/how-do-i-find-the-duplicates-in-a-list-and-create-another-list-with-them
import collections

# count every symbol, then keep the ones that occur more than once
_symbol_counts = collections.Counter(symbols)
_repeated = [sym for sym in _symbol_counts if _symbol_counts[sym] > 1]
print(_repeated)

In [None]:
# # https://stackoverflow.com/questions/50773107/how-to-replace-infinite-value-with-maximum-value-of-a-pandas-column
# Replace inf / NaN placeholders column by column.
# The result is assigned back to the column instead of calling
# df[col].replace(..., inplace=True): that in-place form works on an
# intermediate Series and does not update df under pandas Copy-on-Write.
# NOTE(review): grp_inf_replacement and SPY_inf_replacement must already exist
# in the kernel when this cell runs.
# replace inf in column grp(CAGR/UI)_mean
df['grp(CAGR/UI)_mean'] = df['grp(CAGR/UI)_mean'].replace(np.inf, grp_inf_replacement['grp(CAGR/UI)_mean'])
# replace NaN in column grp(CAGR/UI)_std
df['grp(CAGR/UI)_std'] = df['grp(CAGR/UI)_std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_std'])
# replace NaN in column grp(CAGR/UI)_mean/std
df['grp(CAGR/UI)_mean/std'] = df['grp(CAGR/UI)_mean/std'].replace(np.nan, grp_inf_replacement['grp(CAGR/UI)_mean/std'])
# replace inf in column SPY_CAGR/UI
df['SPY_CAGR/UI'] = df['SPY_CAGR/UI'].replace(np.inf, SPY_inf_replacement)
df

In [None]:
# values of _cols in the row where grp(CAGR/UI)_mean is largest, after leaving out inf rows
_cols = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'grp(CAGR/UI)_mean/std']
_mean_col = _cols[0]
_df_no_inf = df.loc[df[_mean_col].ne(np.inf)]  # rows whose mean is not inf
_idx = _df_no_inf[_mean_col].idxmax()  # index label of the largest remaining mean
_row_at_max = _df_no_inf.loc[[_idx], _cols]  # frame holding the row(s) labelled _idx
grp_inf_replacement = _row_at_max.squeeze()  # a one-row frame becomes a Series
print(f'_idx: {_idx}')
grp_inf_replacement

In [None]:
# https://stackoverflow.com/questions/31674195/plot-normal-distribution-given-mean-and-sigma-python
# loc is mean, scale is standard deviation
import pylab
import numpy as np
from scipy.stats import norm
# x = np.linspace(-10000,100000,1000)
# curve parameters: loc is the mean, scale is the standard deviation
_mean, _std = 2.562777e+10, 1.036925e+11
x = np.linspace(-40e+10, 50e+10, 1000)
y = norm(loc=_mean, scale=_std).pdf(x)
# other parameter sets, kept for reference:
# z = norm.pdf(x, loc=3.540615e+10, scale=1.194430e+11)    # for example
# z1 = norm.pdf(x, loc=298.805901, scale=826.875749)    # for example
# z1 = norm.pdf(x, loc=1.021825, scale=1.505096)    # for example
pylab.plot(x, y, 'b')
# pylab.plot(x,z, 'g')
# pylab.plot(x,z1, 'r')
pylab.show()

In [None]:
# drop inf, -inf, NaN from df

my_cols = ['grp(CAGR/UI)_mean', 'grp(CAGR/UI)_std', 'SPY_CAGR/UI']
_df = df[my_cols]
# index LABELS of the rows holding +inf or -inf in any of my_cols
row_inf = _df.index[np.isinf(_df).any(axis=1)]
print(f'row index labels with inf:\n{row_inf}\n')
# row_inf contains labels, so select with .loc; .iloc would read them as
# positions and pick wrong rows (or fail) unless the index is exactly 0..n-1
df_inf = df.loc[row_inf].copy()  # df with inf in my_cols
# display(HTML(df_inf.to_html()))
df.drop(axis=0, index=row_inf, inplace=True)
# display(HTML(df.to_html()))
# finally remove every row that still has a NaN in any column
df.dropna(how='any', inplace=True)
df

In [None]:
# display settings, applied one (option, value) pair at a time
_display_options = (
    ('display.max_rows', 500),
    ('display.max_columns', 14),
    ('display.max_colwidth', 6),
    ('display.width', 800),
)
for _name, _value in _display_options:
    pd.set_option(_name, _value)

In [None]:
# print df: render the frame as an HTML table in the notebook output
from IPython.display import display, HTML
# to_html() produces the table markup; HTML(...) makes display() render it
display(HTML(df.to_html()))

In [None]:
# create empty DataFrame df and append row
import pandas as pd

my_cols = ['date', 'days_lookbacks', 'n_top_syms', 'syms_start', 'syms_end', 'grp_dates', 'top_set_syms_n_freq', 'top_set_syms']
# Creating Empty DataFrame and Storing it in variable df
df = pd.DataFrame(columns=my_cols)

# one value per column, in the same order as my_cols
# NOTE(review): grp_dates, days_lookbacks, n_top_syms, ... must already exist in the kernel
row_add = [grp_dates[-1], days_lookbacks, n_top_syms, syms_start, syms_end, grp_dates, top_set_syms_n_freq, top_set_syms]
# len(df) is used as the label of the new row
df.loc[len(df)] = row_add
df

In [None]:
# flatten symbols_download_err (a list of lists) into one flat list
l_symbols_err = []
for _sublist in symbols_download_err:
    l_symbols_err.extend(_sublist)
pickle_dump(l_symbols_err, path_data_dump, 'l_symbols_err')
l_symbols_err

In [None]:
# drop columns in multilevel, column names is a list 
# level=0 matches the names in l_syms_0_vol_0_close against the first column level
df = df.drop(l_syms_0_vol_0_close, axis=1, level=0)

In [None]:
# re-order multiIndex columns
# NOTE(review): assigning .columns only relabels the existing columns by position,
# it does not move any data - confirm the columns of _df_adj are already in
# symbol-by-symbol Open/High/Low/Close/Volume order
_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
# create multilevel column names
_col_names = pd.MultiIndex.from_product([_l_symbols, _cols])
_df_adj.columns = _col_names

In [None]:
# unique symbols in df_OHLCV: iterating the frame yields its column labels,
# which are tuples, e.g.: ('AAPL', 'Open')..('AAPL', 'Volume'),...
#                         ('ZZZZ', 'Open')..('ZZZZ', 'Volume')
# element 0 of each tuple is the symbol; the set keeps each symbol once
symbols_OHLCV = list({col_label[0] for col_label in df})

In [None]:
# combined 2 lists to one list and remove duplicates, unique symbols
a = ['a', 'b', 'c']
b = ['a', 'b', 'c', 'd']
# a set removes the duplicates but gives no ordering guarantee
_unordered = list(set(a + b))
print(f'Order not kept: {_unordered}')

from collections import OrderedDict
# fromkeys keeps the first occurrence of every item, in original order
_ordered = list(OrderedDict.fromkeys(a + b))
print(f'Order kept:     {_ordered}')

In [None]:
# convert pandas series to list, df['symbol'] is the symbol column in df
# tolist() returns the column's values as a plain Python list
df['symbol'].tolist()

In [None]:
# Split a Python List into Chunks
a_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
chunk_size = 3
# take one slice per chunk start; the final chunk may be shorter than chunk_size
chunked_list = [
    a_list[start:start + chunk_size]
    for start in range(0, len(a_list), chunk_size)
]
print(chunked_list)

In [None]:
# MultiIndex sort df by symbol
# axis=1 sorts the columns; level=0 with sort_remaining=False sorts on the first
# column level only and does not sort by the remaining levels
df_OHLCV_all_dates = df_OHLCV_all_dates.sort_index(axis=1,level=0,sort_remaining=False)

In [None]:
# rename column names from ['Open', ..., 'Volume'] to ['open', ..., 'volume']
# remove_unused_levels() prevents ValueError
# e.g ValueError: On level 1, code max (5) >= length of level (5). NOTE: this index is in an inconsistent state
# The error may be caused by removing symbols from the dataframe with all NaN in OHLCV columns
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.remove_unused_levels()
# NOTE(review): set_levels replaces the level-1 values by position in the level's
# stored order - confirm that order is Open, High, Low, Close, Volume
df_OHLCV_all_dates.columns = df_OHLCV_all_dates.columns.set_levels(['open', 'high', 'low', 'close', 'volume'], level=1)

In [None]:
# convert timestamp to string 'yyyy-mm-dd'
# first / last index value of df1, formatted with strftime
first_date = df1.index[0].strftime('%Y-%m-%d')
last_date = df1.index[-1].strftime('%Y-%m-%d')
print(f'first_date: {first_date}, {type(first_date)}')
# report the type of last_date itself (previously printed type(first_date))
print(f'last_date: {last_date}, {type(last_date)}')

# yesterday as a string yyyy-mm-dd
from datetime import date, timedelta
# str() of a date gives its 'yyyy-mm-dd' form
yesterday = str(date.today() - timedelta(days = 1))
print(f'yesterday: {yesterday}, {type(yesterday)}')

In [None]:
# https://stackoverflow.com/questions/34439/finding-what-methods-a-python-object-has
# list methods in object: keep only the attribute names whose value is callable
# (dir() alone returns every attribute name, callable or not)
obj_methods = [name for name in dir(obj) if callable(getattr(obj, name))]
# simpler: every attribute name, unfiltered
dir(obj)

In [None]:
# convert Unix timestamp in sec. to yyyy-mm-dd,  'startDate': 1367107200
from datetime import datetime, timezone
UTC_timestamp_sec = 1367107200  # Unix time stamp (i.e. seconds since 1970-01-01)
# Convert in UTC explicitly: without tz, fromtimestamp() uses the machine's local
# time zone, which can shift the resulting date by a day.
startDate = datetime.fromtimestamp(UTC_timestamp_sec, tz=timezone.utc).strftime("%Y-%m-%d")
startDate

In [None]:
# f string format , comma float
key = "marketCap"
value = 446764482560
# "20" pads key to 20 characters; "<,.3f" left-justifies value/1e9 with a
# thousands separator and 3 decimals
print(f'{key:20}{value/1e9:<,.3f}B')  # asset in billions


In [None]:
# replace character in list: every "." in a symbol becomes "-"
myList =  ['BRFS', 'BRK.A', 'BRK.B', 'BRKR']
new_list = [symbol.replace(".", "-") for symbol in myList]
new_list

In [None]:
# delete dfw, df2 from RAM
import gc

# remove both names, then ask the garbage collector to run
del dfw, df2
gc.collect()
# bind the names again, this time to empty frames
dfw, df2 = pd.DataFrame(), pd.DataFrame()

In [None]:
# boolean-mask filter: keep only the rows whose 'Customer Name' matches
df = df[df['Customer Name'] == 'Canyon Coffee']  # return df with 'Customer Name' == 'Canyon Coffee' 

In [None]:
%%timeit -n 10 -r 7
# time the call below: -n 10 loops per run, -r 7 runs
# the call's result is unpacked into ten names
(
    symbols,
    period_yr,
    drawdown,
    UI,
    max_drawdown,
    returns_std,
    Std_UI,
    CAGR,
    CAGR_Std,
    CAGR_UI,
) = symb_perf_stats_vectorized_v1(df_c)

In [None]:
# unique labels of the first (level 0) column level
df_adjOHLCV.columns.unique(level=0)  #multi index columns