In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy.stats import norm

In [2]:
# pd.set_option('display.float_format', lambda x: '%.f' % x)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

### 1. Problem Definition

To predict stock price appreciation based on financial data (fundamental approach) for Indonesian public companies from 2020 until 2023.

### 2. Data Collection

In [3]:
df = pd.read_csv('combined_financial_data_idx.csv')

In [4]:
df.head()

Unnamed: 0,symbol,account,type,2020,2021,2022,2023
0,AALI,Accounts Payable,BS,770264000000.0,1026717000000.0,1224423000000.0,842064000000.0
1,AALI,Accounts Receivable,BS,765849000000.0,458135000000.0,848770000000.0,674487000000.0
2,AALI,Accumulated Depreciation,BS,-10920950000000.0,-12133810000000.0,-13303750000000.0,-14436850000000.0
3,AALI,Additional Paid In Capital,BS,3878995000000.0,3878995000000.0,3878995000000.0,3878995000000.0
4,AALI,Allowance For Doubtful Accounts Receivable,BS,-24261000000.0,-24543000000.0,-27057000000.0,-26516000000.0


In [5]:
df = df.pivot(index='symbol', columns='account', values=['2020', '2021', '2022', '2023'])
df.columns = [f'{col[1]}_{col[0]}' for col in df.columns]
df.reset_index(inplace=True)

In [6]:
df.head()

Unnamed: 0,symbol,Accounts Payable_2020,Accounts Receivable_2020,Accumulated Depreciation_2020,Additional Paid In Capital_2020,Allowance For Doubtful Accounts Receivable_2020,Amortization_2020,Assets Held For Sale Current_2020,Available For Sale Securities_2020,Average Dilution Earnings_2020,...,Total Tax Payable_2023,Total Unusual Items_2023,Total Unusual Items Excluding Goodwill_2023,Tradeand Other Payables Non Current_2023,Trading Securities_2023,Treasury Shares Number_2023,Treasury Stock_2023,Work In Process_2023,Working Capital_2023,Write Off_2023
0,AALI,770264000000.0,765849000000.0,-10920950000000.0,3878995000000.0,-24261000000.0,,,,,...,120237000000.0,74901000000.0,74901000000.0,,,0.0,,109914000000.0,3236061000000.0,-23841000000.0
1,ABBA,38356420000.0,15894150000.0,-176454800000.0,-101245400000.0,-68413900000.0,,,82256400000.0,,...,17183100000.0,,,102609000000.0,,,,,-102591900000.0,
2,ABDA,7686395000.0,80136870000.0,-90880030000.0,8109426000.0,,,,,,...,3849437000.0,8119158000.0,8119158000.0,,,,,,,0.0
3,ABMM,115627400.0,138614900.0,-685938100.0,115087200.0,-52426830.0,,0.0,20028870.0,,...,9567298.0,,,,,0.0,,10392170.0,-8958664.0,
4,ACES,164227200000.0,143482700000.0,-839618000000.0,440574900000.0,-55110070.0,3706185000.0,,,,...,73710710000.0,-573900600.0,-573900600.0,,,29610300.0,34184870000.0,,4898755000000.0,54638700.0


In [7]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Accounts Payable_2020,581.0,4.966911e+11,1.706642e+12,0.000000e+00,6.615148e+09,8.067720e+10,3.300470e+11,1.699900e+13
Accounts Receivable_2020,530.0,5.727652e+11,1.676043e+12,0.000000e+00,4.702138e+09,8.769114e+10,4.007902e+11,1.707864e+13
Accumulated Depreciation_2020,588.0,-2.057908e+12,1.001841e+13,-1.664430e+14,-1.039924e+12,-1.984147e+11,-2.376867e+10,0.000000e+00
Additional Paid In Capital_2020,577.0,8.624769e+11,3.342041e+12,-4.111710e+12,4.570000e+08,8.167701e+10,5.698250e+11,6.328082e+13
Allowance For Doubtful Accounts Receivable_2020,405.0,-8.417233e+10,4.783591e+11,-8.360000e+12,-3.574800e+10,-3.776131e+09,-1.267331e+08,0.000000e+00
...,...,...,...,...,...,...,...,...
Treasury Shares Number_2023,216.0,7.915106e+08,8.925468e+09,0.000000e+00,0.000000e+00,1.045300e+04,1.054315e+08,1.308512e+11
Treasury Stock_2023,116.0,3.022673e+11,9.796827e+11,0.000000e+00,7.526860e+08,2.250151e+10,1.221651e+11,8.199511e+12
Work In Process_2023,143.0,9.994955e+10,2.273102e+11,0.000000e+00,1.640456e+09,1.576900e+10,8.045000e+10,1.836794e+12
Working Capital_2023,457.0,1.071514e+12,4.735861e+12,-1.987207e+13,8.225521e+06,1.602316e+11,9.218260e+11,4.116400e+13


### 3. Data Preprocessing

#### 3.1. Missing Values

In [8]:
df[df.isnull().any(axis=1)].head(5)

Unnamed: 0,symbol,Accounts Payable_2020,Accounts Receivable_2020,Accumulated Depreciation_2020,Additional Paid In Capital_2020,Allowance For Doubtful Accounts Receivable_2020,Amortization_2020,Assets Held For Sale Current_2020,Available For Sale Securities_2020,Average Dilution Earnings_2020,...,Total Tax Payable_2023,Total Unusual Items_2023,Total Unusual Items Excluding Goodwill_2023,Tradeand Other Payables Non Current_2023,Trading Securities_2023,Treasury Shares Number_2023,Treasury Stock_2023,Work In Process_2023,Working Capital_2023,Write Off_2023
0,AALI,770264000000.0,765849000000.0,-10920950000000.0,3878995000000.0,-24261000000.0,,,,,...,120237000000.0,74901000000.0,74901000000.0,,,0.0,,109914000000.0,3236061000000.0,-23841000000.0
1,ABBA,38356420000.0,15894150000.0,-176454800000.0,-101245400000.0,-68413900000.0,,,82256400000.0,,...,17183100000.0,,,102609000000.0,,,,,-102591900000.0,
2,ABDA,7686395000.0,80136870000.0,-90880030000.0,8109426000.0,,,,,,...,3849437000.0,8119158000.0,8119158000.0,,,,,,,0.0
3,ABMM,115627400.0,138614900.0,-685938100.0,115087200.0,-52426830.0,,0.0,20028870.0,,...,9567298.0,,,,,0.0,,10392170.0,-8958664.0,
4,ACES,164227200000.0,143482700000.0,-839618000000.0,440574900000.0,-55110070.0,3706185000.0,,,,...,73710710000.0,-573900600.0,-573900600.0,,,29610300.0,34184870000.0,,4898755000000.0,54638700.0


In [9]:
# Imputing null values with zero
df = df.fillna(0)

#### 3.2. Skewed Distribution

In [10]:
# WIP

#### 3.3. Outliers

In [11]:
# WIP

### 4. Exploratory Data Analysis

### 4.1. Target Variables

In [12]:
# WIP

# Future stock price:
# 1 April 2021 - 1 April 2022; to predict the effect of AR 2021 (fundamental) on stock price
# 1 April 2022 - 1 April 2023; to predict the effect of AR 2022 (fundamental) on stock price
# 1 April 2023 - 1 April 2024; to predict the effect of AR 2023 (fundamental) on stock price

# Future dividends

# Future earnings

### 4.2. Predictor Variables

In [13]:
# WIP

#### 4.2.1. Return on Equity (ROE)

In [14]:
years = [2020, 2021, 2022, 2023]

for year in years:
    net_income_col = f"Net Income_{year}"
    total_equity_col = f"Total Equity Gross Minority Interest_{year}"
    new_col = f"ROA_{year}"
    
    df[new_col] = df[net_income_col] / df[total_equity_col]

#### 4.2.2. Retention Ratio (RR)

In [15]:
for year in years:
    dividend_col = f"Cash Dividends Paid_{year}"
    net_income_col = f"Net Income_{year}"
    new_col = f"RR_{year}"
    
    df[new_col] = df[dividend_col] / df[net_income_col]

#### 4.2.3. Sustainability Growth Rate (SGR)

In [16]:
for year in years:
    ROA_col = f"ROA_{year}"
    RR_col = f"RR_{year}"
    new_col = f"SGR_{year}"
    
    df[new_col] = df[ROA_col] * df[RR_col]

### 5. Feature Selection

In [17]:
variables = ["symbol",
             "Net Income_2023",
             "SGR_2020", "SGR_2021", "SGR_2022"]

df = df.drop(columns=[col for col in df.columns if col not in variables])

In [18]:
df.head()

Unnamed: 0,symbol,Net Income_2023,SGR_2020,SGR_2021,SGR_2022
0,AALI,1055897000000.0,-0.009099,-0.02318,-0.038415
1,ABBA,-43705040000.0,-0.0,0.0,0.0
2,ABDA,84581310000.0,-0.026851,-0.027507,-0.03361
3,ABMM,289000600.0,-0.015597,0.0,-0.107696
4,ACES,763507500000.0,-0.05788,-0.09866,-0.059405


### 5.1. Target Variables

In [19]:
y = df["Net Income_2023"]

### 5.2. Predictor Variables

In [20]:
X = df[["SGR_2020", "SGR_2021", "SGR_2022"]]

In [25]:
df[df.isnull().any(axis=1)].head(5)

Unnamed: 0,symbol,Net Income_2023,SGR_2020,SGR_2021,SGR_2022
40,ARMY,0.0,0.0,,
77,BDKR,0.0,,0.0,0.0
189,DOOH,427796101.0,,0.0,0.0
224,GAMA,0.0,0.0,0.0,
234,GOLL,0.0,0.0,,


In [28]:
# Imputing null values with zero
X = X.fillna(0)

### 6. Data Modelling

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Ridge regression with cross-validation
ridge_model = Ridge()
ridge_scores = cross_val_score(ridge_model, X, y, cv=5, scoring='r2')

print(f"Cross-Validated Ridge R^2 Scores: {ridge_scores}")
print(f"Mean Cross-Validated Ridge R^2 Score: {ridge_scores.mean()}")

Cross-Validated Ridge R^2 Scores: [-0.04774876 -0.02870421  0.06579858 -1.83224406  0.03376325]
Mean Cross-Validated Ridge R^2 Score: -0.36182703880617584
