In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the stock price history CSV into a DataFrame
file_path = Path("../../stock_data.csv")
df_stock = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows
df_stock.head(n=5)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Name
0,87.300003,87.300003,64.904999,71.699997,71.699997,33947.0,MKDTY
1,72.900002,72.900002,62.25,65.550003,65.550003,13607.0,MKDTY
2,63.299999,66.75,63.299999,65.25,65.25,4427.0,MKDTY
3,65.550003,72.0,63.75,67.650002,67.650002,9933.0,MKDTY
4,68.25,72.419998,68.099998,72.0,72.0,8347.0,MKDTY


In [7]:
# Show each distinct ticker symbol found in the Name column (first-seen order)
df_stock['Name'].unique()

array(['MKDTY', 'INDO', 'MNPR', ..., 'VIGL', 'SCRM', 'CITE'], dtype=object)

In [9]:
# Count how many rows each ticker has in the Name column, most frequent first
df_stock['Name'].value_counts()

INTZ    7672
ALRS    4943
EVO     4721
WTRE    3884
ABST    3456
        ... 
LASE      25
PRME      11
AQU       11
MBLY       7
SVII       5
Name: Name, Length: 1577, dtype: int64

In [11]:
# Collect the distinct ticker symbols into a plain Python list
name_column = df_stock['Name']
ticker_names = name_column.unique().tolist()
ticker_names

['MKDTY',
 'INDO',
 'MNPR',
 'OCFT',
 'SPT',
 'BILL',
 'EH',
 'XP',
 'LMPX',
 'PINE',
 'CAN',
 'SITM',
 'YAYO',
 'ETNB',
 'CNSP',
 'KRKR',
 'MOHOY',
 'TELA',
 'CNTG',
 'GRTX',
 'SI',
 'FLJ',
 'DUO',
 'OYST',
 'RAPT',
 'AIH',
 'CABA',
 'DAO',
 'HAPP',
 'PGNY',
 'PHAT',
 'TFFP',
 'BRP',
 'BRBR',
 'IPHA',
 'HBT',
 'VIR',
 'BNTX',
 'APRE',
 'FREQ',
 'MCBS',
 'OPRT',
 'PTON',
 'DDOG',
 'PING',
 'XGN',
 'IGMS',
 'NVST',
 'ALRS',
 'NET',
 'STSA',
 'SWTX',
 'SDC',
 'TXG',
 'CFB',
 'JFU',
 'BTOG',
 'INMD',
 'AMTD',
 'DT',
 'KRUS',
 'SNDL',
 'BORR',
 'BHAT',
 'VIST',
 'CSTL',
 'HCAT',
 'NOVA',
 'EIC',
 'AFYA',
 'IFS',
 'AMK',
 'CPAA',
 'FULC',
 'IHRT',
 'MIRM',
 'ORCC',
 'PHR',
 'DOYU',
 'THCA',
 'RMBI',
 'KRTX',
 'REAL',
 'ADPT',
 'BBIO',
 'CHNG',
 'MORF',
 'CMBM',
 'AKRO',
 'BCEL',
 'GO',
 'PSNL',
 'STOK',
 'CHWY',
 'FVRR',
 'CRWD',
 'RVLV',
 'GOTU',
 'BCYC',
 'IDYA',
 'RTLR',
 'AVTR',
 'FSLY',
 'LKNCY',
 'PSTL',
 'AGBA',
 'APLT',
 'JFIN',
 'SONM',
 'UBER',
 'AXLA',
 'HHR',
 'MEC',
 'NXTC',
 '

In [21]:
# Total traded volume per ticker.
# Select the Volume column *before* aggregating so that only that one column
# is summed; summing every column and discarding the rest is wasted work and,
# on pandas >= 2.0, would also try to "sum" any non-numeric column.
# NOTE: despite the `_df` suffix the result is a Series indexed by Name.
volume_df = df_stock.groupby('Name')['Volume'].sum()
volume_df

Name
AAC     124711002.0
AACI      5613900.0
AAQC     20023600.0
ABCL    909812385.0
ABCM     86635409.0
           ...     
ZNTL    249682610.0
ZT       11581000.0
ZVIA     90398140.0
ZWRK     14330600.0
ZY      542000193.0
Name: Volume, Length: 1577, dtype: float64

In [23]:
# For testing purposes, group the rows by ticker and average every numeric column.
# numeric_only=True makes the "numeric values only" intent explicit: since
# pandas 2.0 the default is numeric_only=False, which raises a TypeError as
# soon as the frame contains any non-numeric column besides the group key.
mean_df = df_stock.groupby('Name').mean(numeric_only=True)
mean_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAC,9.811500,9.827642,9.792248,9.810951,9.810951,3.056642e+05
AACI,9.862830,9.868858,9.854862,9.860992,9.860992,2.272834e+04
AAQC,9.744370,9.756253,9.733660,9.745912,9.745912,5.325426e+04
ABCL,18.973761,19.588749,18.141678,18.799927,18.799927,1.903373e+06
ABCM,18.706641,19.016277,18.504170,18.785458,18.785458,1.688799e+05
...,...,...,...,...,...,...
ZNTL,45.087175,46.636250,43.459098,45.027971,45.027971,3.823623e+05
ZT,9.730362,9.737759,9.721941,9.730120,9.730120,3.772313e+04
ZVIA,7.002730,7.267589,6.723141,6.982638,6.982638,2.772949e+05
ZWRK,9.767933,9.776177,9.758736,9.767264,9.767264,3.564826e+04


In [30]:
# Largest per-ticker average volume in mean_df.
# Read the column directly instead of going through describe(): the maximum of
# describe()'s output also ranges over the summary rows (count, mean, std, ...),
# so it would silently return the row count whenever that exceeds the largest
# volume. The result is stored under a descriptive name rather than `max`,
# which would shadow the Python builtin for the rest of the kernel session.
max_mean_volume = mean_df['Volume'].max()
max_mean_volume

51811353.262759924

In [None]:
# Bin edges for cutting mean_df into volume ranges.
# TODO: still empty -- the edges have not been defined yet.
volume_bins = list()