In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandasql as psql

from scipy.stats import norm

In [2]:
# Notebook-wide pandas display settings:
# - floats are rendered with zero decimals (no scientific notation),
# - row and column truncation is disabled, so frames are shown in full.
for option_name, option_value in (
    ('display.float_format', '{:.0f}'.format),
    ('display.max_rows', None),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

### 1. Problem Definition

The goal is to predict the stock price appreciation of Indonesian public companies from their financial data (a fundamental-analysis approach), using data from 2020 until 2023.

### 2. Data Collection

#### 2.1. Target Variables

The first dataset contains stock price appreciation data of Indonesian public companies from 2021 until 2023.

In [3]:
# WIP: the target-variable dataset (stock price appreciation, section 2.1) is not loaded yet.

#### 2.2. Predictor Variables

The second dataset contains financial data of Indonesian public companies from 2020 until 2023.

In [4]:
# Load the combined financial data into `df`, which every later cell reads.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('combined_financial_data_idx.csv')

In [45]:
# Preview the first 147 rows of the raw frame (positional slice, same as head(147)).
# NOTE(review): 147 is a magic number, presumably the row count of the first
# ticker; confirm, or replace it with a small named preview size.
df.iloc[:147]

Unnamed: 0,symbol,account,type,2020,2021,2022,2023
0,AALI,Accounts Payable,BS,770264000000,1026717000000,1224423000000,842064000000
1,AALI,Accounts Receivable,BS,765849000000,458135000000,848770000000,674487000000
2,AALI,Accumulated Depreciation,BS,-10920948000000,-12133813000000,-13303749000000,-14436847000000
3,AALI,Additional Paid In Capital,BS,3878995000000,3878995000000,3878995000000,3878995000000
4,AALI,Allowance For Doubtful Accounts Receivable,BS,-24261000000,-24543000000,-27057000000,-26516000000
5,AALI,Basic Average Shares,IS,1924688333,1924688333,1924688333,1924688333
6,AALI,Basic EPS,IS,433,1024,897,549
7,AALI,Beginning Cash Position,CF,383366000000,978892000000,3896022000000,1619616000000
8,AALI,Buildings And Improvements,BS,4775744000000,4858140000000,4926098000000,5028178000000
9,AALI,Capital Expenditure,CF,-999198000000,-1229482000000,-1379444000000,-1236337000000


### 3. Data Preprocessing

#### 3.1. Missing Values

In [17]:
# Show the first five rows that have a missing value in at least one column.
df.loc[df.isna().any(axis='columns')].head()

Unnamed: 0,symbol,account,type,2020,2021,2022,2023
10,AALI,Capital Expenditure Reported,CF,-28687000000.0,,,
48,AALI,Impairment Of Capital Assets,IS,50935000000.0,-49898000000.0,,
80,AALI,Net Other Financing Charges,CF,,37785000000.0,,44612000000.0
81,AALI,Net Other Investing Changes,CF,,42794000000.0,237226000000.0,215362000000.0
99,AALI,Other Non Current Liabilities,BS,246459000000.0,401762000000.0,216803000000.0,


In [18]:
# Replace every missing value with 0 and rebind `df`, so all later cells see the filled frame.
# NOTE(review): this treats an unreported line item as a true zero; confirm that
# is the intended assumption before modelling.
df = df.fillna(0)

#### 3.2. Skewed Distribution

In [8]:
# WIP: handling of skewed distributions (section 3.2) is not implemented yet.

#### 3.3. Outliers

In [9]:
# WIP: outlier handling (section 3.3) is not implemented yet.

### 4. Exploratory Data Analysis

#### 4.1. Target Variables

In [10]:
# WIP: notes on candidate target variables; nothing is computed in this cell yet.

# Future stock price:
# NOTE(review): "AR" presumably stands for annual report; confirm.
# 1 April 2021 - 1 April 2022; to predict the effect of AR 2021 (fundamental) on stock price
# 1 April 2022 - 1 April 2023; to predict the effect of AR 2022 (fundamental) on stock price
# 1 April 2023 - 1 April 2024; to predict the effect of AR 2023 (fundamental) on stock price

# Future dividends

# Future earnings

#### 4.2. Predictor Variables

In [11]:
# Distinct ticker symbols, in order of first appearance in `df`.
ticker_symbols = df['symbol'].drop_duplicates().tolist()

In [12]:
# Print the ticker list as plain text.
print(ticker_symbols)

['AALI', 'ABBA', 'ABDA', 'ABMM', 'ACES', 'ACST', 'ADCP', 'ADES', 'ADHI', 'ADMF', 'ADMG', 'ADMR', 'ADRO', 'AGII', 'AGRO', 'AGRS', 'AISA', 'AKPI', 'AKRA', 'ALDO', 'ALII', 'ALMI', 'AMAG', 'AMAN', 'AMAR', 'AMFG', 'AMMN', 'AMOR', 'AMRT', 'ANDI', 'ANJT', 'ANTM', 'APEX', 'APIC', 'APLN', 'ARCI', 'ARGO', 'ARII', 'ARKA', 'ARKO', 'ARMY', 'ARNA', 'ARTI', 'ARTO', 'ASDM', 'ASGR', 'ASII', 'ASLC', 'ASMI', 'ASRI', 'ASRM', 'ASSA', 'ATIC', 'ATLA', 'AUTO', 'AVIA', 'BABP', 'BACA', 'BAJA', 'BALI', 'BANK', 'BAPA', 'BATA', 'BBCA', 'BBHI', 'BBKP', 'BBLD', 'BBMD', 'BBNI', 'BBRI', 'BBRM', 'BBSI', 'BBTN', 'BBYB', 'BCAP', 'BCIC', 'BCIP', 'BDKR', 'BDMN', 'BEBS', 'BEEF', 'BEKS', 'BELI', 'BEST', 'BFIN', 'BGTG', 'BHAT', 'BHIT', 'BINA', 'BIPI', 'BIPP', 'BIRD', 'BISI', 'BJBR', 'BJTM', 'BKDP', 'BKSL', 'BKSW', 'BLTA', 'BLTZ', 'BMAS', 'BMHS', 'BMRI', 'BMSR', 'BMTR', 'BNBA', 'BNBR', 'BNGA', 'BNII', 'BNLI', 'BOGA', 'BOLA', 'BOLT', 'BOSS', 'BPFI', 'BPII', 'BRAM', 'BREN', 'BRIS', 'BRMS', 'BRNA', 'BRPT', 'BSBK', 'BSDE', 'BSIM',

In [13]:
# Distinct account names, in order of first appearance in `df`.
accounts = df['account'].drop_duplicates().tolist()

In [14]:
# Display the full list of distinct account names.
accounts

['Accounts Payable',
 'Accounts Receivable',
 'Accumulated Depreciation',
 'Additional Paid In Capital',
 'Allowance For Doubtful Accounts Receivable',
 'Basic Average Shares',
 'Basic EPS',
 'Beginning Cash Position',
 'Buildings And Improvements',
 'Capital Expenditure',
 'Capital Expenditure Reported',
 'Capital Stock',
 'Cash And Cash Equivalents',
 'Cash Cash Equivalents And Short Term Investments',
 'Cash Dividends Paid',
 'Cash Equivalents',
 'Cash Financial',
 'Cash Flowsfromusedin Operating Activities Direct',
 'Changes In Cash',
 'Classesof Cash Payments',
 'Classesof Cash Receiptsfrom Operating Activities',
 'Common Stock',
 'Common Stock Dividend Paid',
 'Common Stock Equity',
 'Construction In Progress',
 'Cost Of Revenue',
 'Current Assets',
 'Current Debt',
 'Current Debt And Capital Lease Obligation',
 'Current Liabilities',
 'Depreciation And Amortization In Income Statement',
 'Depreciation Income Statement',
 'Diluted Average Shares',
 'Diluted EPS',
 'Diluted NI Ava

In [43]:
# Sum the yearly values of selected accounts (payables + receivables) for one
# company. `df` in the FROM clause is the DataFrame of that name in the notebook.
#
# Changes from the previous version of the query:
# - `=` / `IN` replace `IS`, which is SQLite's NULL-safe identity operator and
#   not the usual way to compare against a string literal.
# - The repeated `symbol ... AND account ...` predicates joined by OR are folded
#   into one symbol test plus an IN list, so there is no AND/OR precedence to
#   reason about.
# - GROUP BY symbol makes the non-aggregated `symbol` column well defined. Without
#   it, widening the filter to several tickers would silently add all companies
#   into one row labelled with an arbitrary symbol.
#   Note: with GROUP BY, a filter that matches nothing returns zero rows instead
#   of a single all-NULL row.
query = '''
    SELECT symbol,
           SUM("2020") AS sum_2020,
           SUM("2021") AS sum_2021,
           SUM("2022") AS sum_2022,
           SUM("2023") AS sum_2023
    FROM df
    WHERE symbol = 'AALI'
      AND account IN ('Accounts Payable', 'Accounts Receivable')
    GROUP BY symbol
'''
result = psql.sqldf(query)

In [44]:
# Show the first rows of the aggregated query result.
result.head()

Unnamed: 0,symbol,sum_2020,sum_2021,sum_2022,sum_2023
0,AALI,1536113000000,1484852000000,2073193000000,1516551000000


### 5. Feature Selection

In [15]:
# WIP: feature selection (section 5) is not implemented yet.

### 6. Data Modelling

In [16]:
# WIP: data modelling (section 6) is not implemented yet.