In [1]:
import requests
import random
import numpy as np
import pandas as pd
import string #Required for generating random stock tickers

# 1. Retrieving Real Stock Data with API

To start off, we obtain the Top 200 stocks by market capitalization (named as price in the code) from US-listed exchanges. 

We also include a try-except block to ensure that any errors during requests are caught and reported.

In [3]:
stock_url = "https://financialmodelingprep.com/api/v3/stock/list?apikey=NSD2m35XyjrwOoYdtKbq1JPOHlABl8CW"

try:
    stock_response = requests.get(stock_url)
    stock_response.raise_for_status()
    stock_data = stock_response.json()

    # Check API response structure
    print("Sample API Response:")
    print(stock_data[:5])

    # Filter US-listed stocks only
    us_exchanges = ['NYSE', 'NASDAQ', 'AMEX']

    valid_stocks = [
        stock for stock in stock_data
        if stock.get('exchangeShortName') in us_exchanges  # US-listed exchanges
        and stock.get('type') == 'stock'  # Only include stocks
        and stock.get('price') is not None  # Ensure price is not None
    ]

    print(f"Total valid stocks: {len(valid_stocks)}")
    print("Sample valid stocks:", valid_stocks[:5])

    # Sort by price, handling None values safely
    sorted_stocks = sorted(valid_stocks, key=lambda x: x.get('price', 0), reverse=True)

    # Get the top 200 stocks
    top_200_stocks = sorted_stocks[:200]
    stock_symbols = [stock['symbol'] for stock in top_200_stocks]

    print("Top 200 Stocks by Price (Placeholder for Market Cap):")
    print(stock_symbols)

except Exception as e:
    print(f"Error: {e}")

Sample API Response:
[{'symbol': 'NPOF.ME', 'name': 'FIZIKA Scientific and Production Association Open Joint-Stock Company', 'price': 11850, 'exchange': 'Moscow Stock Exchange', 'exchangeShortName': 'MCX', 'type': 'stock'}, {'symbol': 'TERRAREAL.BO', 'name': 'Terraform Realstate Limited', 'price': 6.5, 'exchange': 'Bombay Stock Exchange', 'exchangeShortName': 'BSE', 'type': 'stock'}, {'symbol': 'KCPSUGIND.NS', 'name': 'K.C.P. Sugar and Industries Corporation Limited', 'price': 46.97, 'exchange': 'National Stock Exchange of India', 'exchangeShortName': 'NSE', 'type': 'stock'}, {'symbol': 'RKEC.NS', 'name': 'RKEC Projects Limited', 'price': 99.79, 'exchange': 'National Stock Exchange of India', 'exchangeShortName': 'NSE', 'type': 'stock'}, {'symbol': 'PMGOLD.AX', 'name': 'Perth Mint Gold', 'price': 17.94, 'exchange': 'Australian Securities Exchange', 'exchangeShortName': 'ASX', 'type': 'etf'}]
Total valid stocks: 11005
Sample valid stocks: [{'symbol': 'BUPXX', 'name': 'BlackRock Liquidit

For each of these Top 200 stocks, we will then retrieve its annual income statement data with an API request.

The API response is stored in a list of dictionary to extract relevant metrics like Year, Revenue and Net Income.

Hence, each record corresponds to a specific stock and year, with detailed financial metrics.

Again, error handling during API requests is included and reported.

In [4]:
data = []

# For each valid stock, fetch income data and store it
for stock in stock_symbols:
    try:
        # Build the API URL for income statement
        income_url = f"https://financialmodelingprep.com/api/v3/income-statement/{stock}?period=annual&apikey=NSD2m35XyjrwOoYdtKbq1JPOHlABl8CW"
        
        # Fetch data from the API
        response = requests.get(income_url, timeout=10)  # Added timeout
        
        # Check for a successful response
        if response.status_code != 200:
            print(f"Failed to fetch data for {stock}: HTTP {response.status_code}")
            continue
        
        # Parse JSON response
        income_response = response.json()

        # Check if data is available and is in the expected format (a list of dictionaries)
        if isinstance(income_response, list) and income_response:
            for report in income_response:  # Loop over each year
                # Extract the year safely
                year = report.get('date', '')
                year = year.split('-')[0] if '-' in year else year

                # Create a dictionary for the stock report
                row = {
                    'Stock': stock,
                    'Year': year,
                    'Revenue': report.get('revenue'),
                    'Revenue Growth': report.get('revenueGrowth'),
                    'Operating Income': report.get('operatingIncome'),
                    'Net Income': report.get('netIncome'),
                    'Net Income Ratio': report.get('netIncomeRatio'),
                    'Gross Profit': report.get('grossProfit'),
                    'Gross Profit Ratio': report.get('grossProfitRatio'),
                    'Operating Expenses': report.get('operatingExpenses'),
                    'Cost of Revenue': report.get('costOfRevenue'),
                    'EBITDA': report.get('ebitda'),
                    'Interest Expense': report.get('interestExpense'),
                    'Depreciation & Amortization': report.get('depreciationAndAmortization')
                }
                # Append the row to the data list
                data.append(row)
        else:
            print(f"No income data available for {stock}.")
    except Exception as e:
        print(f"Error fetching data for {stock}: {e}")

# Output the data to check
print(f"Fetched data for {len(data)} records.")

Failed to fetch data for MCK: HTTP 429
Failed to fetch data for ISRG: HTTP 429
Failed to fetch data for MLM: HTTP 429
Failed to fetch data for AMP: HTTP 429
Failed to fetch data for MUSA: HTTP 429
Failed to fetch data for NEU: HTTP 429
Failed to fetch data for ROP: HTTP 429
Failed to fetch data for WSO-B: HTTP 429
Failed to fetch data for SAIA: HTTP 429
Failed to fetch data for MA: HTTP 429
Failed to fetch data for TMO: HTTP 429
Failed to fetch data for CHE: HTTP 429
Failed to fetch data for SNPS: HTTP 429
Failed to fetch data for WSO: HTTP 429
Failed to fetch data for SPGI: HTTP 429
Failed to fetch data for NWLI: HTTP 429
Failed to fetch data for IT: HTTP 429
Failed to fetch data for KNSL: HTTP 429
Failed to fetch data for CACC: HTTP 429
Failed to fetch data for LMT: HTTP 429
Failed to fetch data for FDS: HTTP 429
Failed to fetch data for MCO: HTTP 429
Failed to fetch data for UNH: HTTP 429
Failed to fetch data for ANTM: HTTP 429
Failed to fetch data for NOC: HTTP 429
Failed to fetch 

In [5]:
df = pd.DataFrame(data)

In [6]:
df.shape

(240, 14)

In [7]:
# Checking all stocks requested are inside df
df['Stock'].nunique()

48

# 2. Obtaining Base, Partial, Historical & Sparse Datasets

### Base Dataset

We create a synthetic dataset of 100,000 rows containing simulated financial metrics for various stocks across year 2000 to 2024.

Using generate_random_tickers, we create unique stocks of random lengths and create stock-year combinations. We then simulate the other financial metrics proportionally based on the revenue. For example, cost of revenue is 40-70% of revenue.

We then verified the data integrity by checking no duplicate stock-year combination exists.

In [29]:
# Define parameters for simulation
# Define parameters
n_desired_rows = 100000  # Target number of unique rows
years = list(range(2000, 2025))  # Expand the range of years (e.g., 2000-2024)
n_unique_stocks_needed = n_desired_rows // len(years) + 1  # Number of unique stocks needed

# Generate fake stock tickers
def generate_random_tickers(n, length_range=(4, 6)):
    tickers = set()  # Use a set to ensure uniqueness
    while len(tickers) < n:
        length = random.randint(*length_range)  # Random length between 4 and 6
        ticker = ''.join(random.choices(string.ascii_uppercase, k=length))
        tickers.add(ticker)
    return list(tickers)

# Define the number of unique stock tickers needed
n_unique_stocks_needed = n_desired_rows // len(years) + 1

# Generate random stock tickers
base_stocks = generate_random_tickers(n_unique_stocks_needed)

# Generate all possible stock-year combinations
stock_year_combinations = [(stock, year) for stock in base_stocks for year in years]

# Shuffle the combinations to randomize the order
np.random.seed(42)  # Ensure reproducibility
random.shuffle(stock_year_combinations)

# Select exactly 100,000 rows
stock_year_combinations = stock_year_combinations[:n_desired_rows]

# Initialize an empty list to store simulated data
simulated_data = []

# Simulate data for unique stock-year combinations
for i, (stock, year) in enumerate(stock_year_combinations):
    # Use previous year's revenue for this stock (if available)
    if i > 0 and simulated_data[-1]['Stock'] == stock:
        previous_revenue = simulated_data[-1]['Revenue']
    else:
        previous_revenue = np.random.randint(1e6, 2.1e9)  # Random initial revenue if no previous data

    # Calculate revenue based on growth
    revenue_growth = np.random.uniform(-0.1, 0.2)  # Random growth rate (-10% to 20%)
    revenue = previous_revenue * (1 + revenue_growth)
    
    # Simulate financial metrics
    cost_of_revenue = revenue * np.random.uniform(0.4, 0.7)
    gross_profit = revenue - cost_of_revenue
    operating_expenses = np.random.uniform(0.2, 0.4) * revenue
    operating_income = gross_profit - operating_expenses
    depreciation_and_amortization = np.random.randint(1e5, 1e7)
    ebitda = operating_income + depreciation_and_amortization
    interest_expense = np.random.randint(1e5, 1e6)
    net_income = operating_income - interest_expense
    net_income_ratio = net_income / revenue if revenue != 0 else 0
    gross_profit_ratio = gross_profit / revenue if revenue != 0 else 0

    # Append to simulated data
    simulated_data.append({
        'Stock': stock,
        'Year': year,
        'Revenue': revenue,
        'Revenue Growth': revenue_growth,
        'Operating Income': operating_income,
        'Net Income': net_income,
        'Net Income Ratio': net_income_ratio,
        'Gross Profit': gross_profit,
        'Gross Profit Ratio': gross_profit_ratio,
        'Operating Expenses': operating_expenses,
        'Cost of Revenue': cost_of_revenue,
        'EBITDA': ebitda,
        'Interest Expense': interest_expense,
        'Depreciation & Amortization': depreciation_and_amortization,
    })

# Convert to a DataFrame
simulated_df = pd.DataFrame(simulated_data)

# Verify duplicates
duplicates = simulated_df.duplicated(subset=['Stock', 'Year'], keep=False)
print(f"Number of duplicates: {duplicates.sum()}")

# Display the first rows
print(simulated_df.head())

Number of duplicates: 0
    Stock  Year       Revenue  Revenue Growth  Operating Income    Net Income  \
0  QZJZTU  2021  1.833317e+09        0.138963      3.465545e+08  3.459331e+08   
1  AURIJB  2000  2.599285e+08        0.037775      7.052226e+07  7.035744e+07   
2   GXUAV  2006  2.254691e+09        0.116600      2.666813e+08  2.663455e+08   
3  ZQDDFB  2017  5.011669e+08       -0.008727      7.827428e+07  7.767409e+07   
4    ETEY  2004  4.910126e+08        0.019958      9.390576e+07  9.374003e+07   

   Net Income Ratio  Gross Profit  Gross Profit Ratio  Operating Expenses  \
0          0.188692  9.991022e+08            0.544970        6.525477e+08   
1          0.270680  1.299350e+08            0.499887        5.941274e+07   
2          0.118130  7.179707e+08            0.318434        4.512894e+08   
3          0.154986  2.218030e+08            0.442573        1.435287e+08   
4          0.190912  2.877335e+08            0.586000        1.938277e+08   

   Cost of Revenue        

Importantly, we concatenate the simulated_df with the initial df containing the real stocks to generate the base dataset.

In [30]:
expanded_df = pd.concat([df, simulated_df], ignore_index=True)

In [None]:
expanded_df.to_csv('expanded_dataset.csv', index=False)

In [31]:
print(f"Expanded dataset has {len(expanded_df)} rows.")

Expanded dataset has 100240 rows.


### Partial Dataset

The partial dataset overlaps with the base dataset, but has missing values in some columns.

The year range is limited to 2022 to 2023.

In [32]:
# Create a deep copy of expanded_df, with filters by 2022, 2023
partial_df = expanded_df[expanded_df['Year'].isin([2022,2023])].copy()

#Introduce missing data in 'Revenue', 'Net Income' and 'Operating Expenses'
partial_df.loc[partial_df.sample(frac=0.2).index, 'Revenue'] = np.nan #Within revenue, introduce 20% of missing data
partial_df.loc[partial_df.sample(frac=0.1).index, 'Net Income'] = np.nan #Within net income, introduce 10% of missing data
partial_df.loc[partial_df.sample(frac=0.3).index, 'Operating Expenses'] = np.nan #Within operating expenses, introduce 30% of missing data

In [7]:
partial_df.to_csv('partial_dataset.csv', index=False)

In [33]:
partial_df.head()

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization
0,BRK-A,2023,364482000000.0,,123196000000.0,,0.263999,123196000000.0,0.338003,198575000000.0,241286000000.0,137655000000.0,5003000000.0,12486000000.0
1,BRK-A,2022,,,-26985000000.0,-22819000000.0,-0.097438,109871000000.0,0.469153,,124319000000.0,-15249000000.0,4352000000.0,10899000000.0
5,NVR,2023,9534889000.0,,1822455000.0,1591611000.0,0.166925,2483691000.0,0.260485,,7051198000.0,1973029000.0,27740000.0,16916000.0
6,NVR,2022,10538290000.0,,2169184000.0,1725575000.0,0.163743,2876016000.0,0.272911,,7662271000.0,2310114000.0,39524000.0,17396000.0
10,BKNG,2023,21365000000.0,,5835000000.0,4289000000.0,0.200749,21360000000.0,0.999766,,5000000.0,7043000000.0,897000000.0,665000000.0


### Historical Dataset

The historical dataset introduces a larger range of years (but randomized), with mismatched data and duplicate records.

Stocks in this dataset are referenced from the stocks in the base dataset.

In [34]:
historical_data = []
historical_years = range(2010, 2021) # Year 2010 to 2021
historical_stocks = expanded_df['Stock']

In [37]:
#Simulate 10000 rows of historical data
for row in range(10000):
    stock = np.random.choice(historical_stocks)
    year = np.random.choice(historical_years)

    revenue = np.random.randint(1e6, 2.1e9) if np.random.rand() > 0.2 else None  # 20% missing revenue
    
    # Introduce some unrealistic (invalid) values for revenue
    if np.random.rand() < 0.05:  # 5% chance to introduce an unrealistic value
        revenue = np.random.randint(-1e9, -1e6)  # Negative revenue (invalid)

    # Introduce mismatched values for 'Cost of Revenue' by using None instead of NaN
    cost_of_revenue = revenue * np.random.uniform(0.3, 0.8) if revenue else None

    # Introduce stock name mismatch randomly
    if np.random.rand() < 0.05:
        stock = stock + ' .end'

    # Introduce missing values in 'Cost of Revenue'
    if np.random.rand() < 0.1:
        cost_of_revenue = None

    # Add fewer financial metrics (to create schema differences)
    if revenue and cost_of_revenue:
        gross_profit = revenue - cost_of_revenue
    else:
        gross_profit = None

    # Introduce random years outside of range
    if np.random.rand() > 0.05:
        year = np.random.randint(2130, 2200)

    historical_data.append({
        'Stock': stock,
        'Year': year,
        'Revenue': revenue,
        'Cost of Revenue': cost_of_revenue,
        'Gross Profit': gross_profit
    })

In [38]:
historical_df = pd.DataFrame(historical_data)

### Sparse Dataset

Finally, the sparse dataset only contains 100 random stocks, with year and revenue as their only metrics.

Like the Historical dataset, stocks are refrenced from those in the base dataset.

50% of the revenue column will be empty.

In [39]:
sparse_data = []
sparse_years = range(2000, 2009)
sparse_stocks = expanded_df['Stock']

In [40]:
for stock in np.random.choice(sparse_stocks, size=100, replace=False): #Select 100 random stocks
    for year in sparse_years:
        if np.random.rand() > 0.5: #For 50% of the year column, assign a random revenue. The other 50% will be uninitialized
            revenue = np.random.randint(1e6, 2.1e9)
            
            sparse_data.append({
                'Stock': stock,
                'Year': year,
                'Revenue': revenue
            })

In [41]:
sparse_df = pd.DataFrame(sparse_data)

# 3. Data Preprocessing

### Check whether every stock-year pair is unique

In [43]:
def check_unique_stock_year_pairs(df, stock_col='Stock', year_col='Year'):
    # Create a combined column to check for duplicates
    unique_combinations = df[[stock_col, year_col]].drop_duplicates()

    # Compare the length of unique combinations with the original DataFrame
    if len(unique_combinations) == len(df):
        print("All stock-year pairs are unique.")
        return True
    else:
        print("There are duplicate stock-year pairs.")
        # Keep no duplicates to ensure only rows with unique Stock-Year pairs remain
        duplicates = df[df.duplicated(subset=[stock_col, year_col], keep=False)]
        print("Duplicate entries:")
        print(duplicates)
        print(duplicates.shape)
        return False

In [44]:
check_unique_stock_year_pairs(expanded_df)

All stock-year pairs are unique.


True

In [45]:
check_unique_stock_year_pairs(partial_df)

All stock-year pairs are unique.


True

In [46]:
# 1217 duplicate pairs
check_unique_stock_year_pairs(historical_df)

There are duplicate stock-year pairs.
Duplicate entries:
        Stock  Year       Revenue  Cost of Revenue  Gross_Profit  Gross Profit
4       DUBLF  2149  1.564826e+09              NaN           NaN           NaN
20     MGNABA  2134  1.880840e+09     1.030426e+09  8.504137e+08           NaN
32     FMTVPY  2178  6.908849e+08     3.754662e+08  3.154187e+08           NaN
38       MCNC  2140  4.908585e+08     3.340902e+08  1.567683e+08           NaN
64       HYNB  2161           NaN              NaN           NaN           NaN
...       ...   ...           ...              ...           ...           ...
19943    PIXU  2169  1.183188e+09     5.678716e+08           NaN  6.153159e+08
19973   TYSAP  2171  3.227322e+07     1.418402e+07           NaN  1.808920e+07
19974  DMCKPU  2140  1.735987e+09     1.050246e+09           NaN  6.857412e+08
19979  VDVEWR  2152  1.082795e+09     5.963902e+08           NaN  4.864046e+08
19988  SNBDXN  2196  3.257110e+08              NaN           NaN          

False

In [47]:
# 4 duplicate pairs
check_unique_stock_year_pairs(sparse_df)

There are duplicate stock-year pairs.
Duplicate entries:
    Stock  Year     Revenue
263  LALI  2005   176762369
264  LALI  2007   661785285
288  LALI  2005  1596116051
290  LALI  2007  1004175337
(4, 3)


False

### Dropping duplicate stock-year pairs

In [48]:
def drop_duplicate_stock_year_pairs(df, stock_col='Stock', year_col='Year'):
    print("DataFrame after dropping all duplicates:")
    return df[~df.duplicated(subset=['Stock', 'Year'], keep=False)]

In [51]:
historical_df = drop_duplicate_stock_year_pairs(historical_df)
print(historical_df)

DataFrame after dropping all duplicates:
        Stock  Year       Revenue  Cost of Revenue  Gross_Profit  Gross Profit
0       GOLXD  2184  9.680579e+08     6.308393e+08  3.372186e+08           NaN
1       OKDZW  2192  4.770210e+08     2.977351e+08  1.792859e+08           NaN
2       PGOPC  2168  9.445676e+08     6.795297e+08  2.650379e+08           NaN
3        FSPF  2151  1.623014e+08     1.034746e+08  5.882680e+07           NaN
5       KDMTE  2198  6.519775e+08     4.517509e+08  2.002266e+08           NaN
...       ...   ...           ...              ...           ...           ...
19995  DUOJBW  2162  1.739624e+09     5.799098e+08           NaN  1.159714e+09
19996   RZYXV  2194  2.511675e+08     1.142958e+08           NaN  1.368718e+08
19997   MXKTG  2138           NaN              NaN           NaN           NaN
19998   XGMTN  2136           NaN              NaN           NaN           NaN
19999   XEZIM  2160           NaN              NaN           NaN           NaN

[18783 row

In [49]:
sparse_df = drop_duplicate_stock_year_pairs(sparse_df)
print(sparse_df)

DataFrame after dropping all duplicates:
      Stock  Year     Revenue
0      JGPF  2002   533645067
1      JGPF  2003   191962652
2     HSCGS  2001   300527108
3     HSCGS  2002  1486372237
4     HSCGS  2005  1350321861
..      ...   ...         ...
442  ZTMXAP  2001  1360600396
443  ZTMXAP  2003   257623495
444  ZTMXAP  2005   483787674
445  ZTMXAP  2006   793009584
446  ZTMXAP  2008  1808217019

[443 rows x 3 columns]


### Check for duplicates

In [52]:
# Base dataset
print(f"Duplicate entries (Base Dataset): {expanded_df.duplicated().sum()}")
print(f"{round((expanded_df.duplicated().sum() / partial_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Partial dataset
print(f"Duplicate entries (Partial Dataset): {partial_df.duplicated().sum()}")
print(f"{round((partial_df.duplicated().sum() / partial_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Historical dataset
print(f"Duplicate entries (Historical Dataset): {historical_df.duplicated().sum()}")
print(f"{round((historical_df.duplicated().sum() / historical_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Sparse dataset
print(f"Duplicate entries (Sparse Dataset): {sparse_df.duplicated().sum()}")
print(f"{round((sparse_df.duplicated().sum() / sparse_df.shape[0]) * 100, 2)}% rows are duplicate.")

Duplicate entries (Base Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Partial Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Historical Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Sparse Dataset): 0
0.0% rows are duplicate.


### Check for inconsistencies

This function prints the number of unique values per column. From the unique values, we can also tell any inconsistencies such as impossible negatives in revenue that we need to manage.

In [53]:
# Loop through all columns and print unique values
def no_of_unique_values(df):
    for col in df.columns:
        print(f"Column Name (Base Dataset): {col}")
        print(f"Unique Values: {df[col].unique()}")
        print(f"Number of Unique Values: {df[col].nunique()}")
        print('-' * 60) # To demarcate columns

In [54]:
no_of_unique_values(expanded_df)

Column Name (Base Dataset): Stock
Unique Values: ['BRK-A' 'NVR' 'BKNG' ... 'AEZUE' 'RJLN' 'TTCZLN']
Number of Unique Values: 4049
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2023 2022 2021 2020 2019 2024 2018 2017 2000 2006 2004 2002 2014 2001
 2007 2012 2010 2008 2016 2013 2015 2003 2005 2009 2011]
Number of Unique Values: 25
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [3.64482000e+11 2.34190000e+11 3.54636000e+11 ... 1.33376586e+09
 1.54838758e+09 4.91265446e+08]
Number of Unique Values: 100235
------------------------------------------------------------
Column Name (Base Dataset): Revenue Growth
Unique Values: [        nan  0.1389629   0.03777467 ... -0.08242593  0.14503091
  0.17837523]
Number of Unique Values: 100000
------------------------------------------------------------
Column Name (Base Dataset): Operating Income
Unique Values: [ 1.23196000e

In [57]:
no_of_unique_values(partial_df)

Column Name (Base Dataset): Stock
Unique Values: ['BRK-A' 'NVR' 'BKNG' ... 'ATJUK' 'NJCE' 'RIORFE']
Number of Unique Values: 4048
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2023 2022]
Number of Unique Values: 2
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [3.64482000e+11            nan 9.53488900e+09 ... 2.75076875e+08
 4.15520636e+08 1.54838758e+09]
Number of Unique Values: 6476
------------------------------------------------------------
Column Name (Base Dataset): Revenue Growth
Unique Values: [       nan 0.03180095 0.05889517 ... 0.02069398 0.00177194 0.14503091]
Number of Unique Values: 8002
------------------------------------------------------------
Column Name (Base Dataset): Operating Income
Unique Values: [ 1.23196000e+11 -2.69850000e+10  1.82245500e+09 ...  3.27766620e+06
  5.36752494e+07  2.10674763e+08]
Number of Unique Values: 8093
--------

Significant inconsistencies in the Historical dataset include:
1. Impossible negatives
2. Impossible years (.e.g 2177)
3. Stock names with '.end'

In [58]:
no_of_unique_values(historical_df)

Column Name (Base Dataset): Stock
Unique Values: ['GOLXD' 'OKDZW' 'PGOPC' ... 'NKZRR .end' 'TFWAN .end' 'NSYB .end']
Number of Unique Values: 4883
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2184 2192 2168 2151 2198 2154 2197 2014 2185 2165 2182 2171 2195 2017
 2177 2141 2199 2163 2131 2146 2132 2183 2011 2137 2178 2149 2190 2173
 2162 2159 2161 2166 2015 2179 2181 2152 2191 2193 2156 2135 2169 2133
 2157 2145 2148 2167 2138 2016 2147 2136 2140 2018 2180 2130 2175 2144
 2020 2134 2196 2164 2019 2194 2012 2189 2176 2170 2172 2174 2158 2160
 2143 2155 2187 2139 2013 2188 2153 2150 2010 2186 2142]
Number of Unique Values: 81
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [9.68057898e+08 4.77020998e+08 9.44567602e+08 ... 4.27379260e+08
 1.73962391e+09 2.51167538e+08]
Number of Unique Values: 15217
------------------------------------------------------------
Col

In [56]:
no_of_unique_values(sparse_df)

Column Name (Base Dataset): Stock
Unique Values: ['JGPF' 'HSCGS' 'LUWJK' 'JTLCVH' 'LZZC' 'MWSS' 'MRPLYG' 'CWDAN' 'HZLRUE'
 'WRZLM' 'XFPB' 'OIPJTN' 'GMUIRW' 'OFXJW' 'VXBRR' 'QYQWK' 'TCFKRN' 'BRCA'
 'ZPYYZ' 'AKST' 'DDDVA' 'WKNKX' 'UEEFE' 'VIYELP' 'UUQYV' 'DNVDJ' 'HKGAD'
 'BILXVJ' 'WLXAXZ' 'NJOZ' 'FRBCIS' 'RZYXV' 'PIVFKR' 'DONBI' 'EIBIXF'
 'LJQTY' 'JKHS' 'YEJA' 'MRGA' 'IBWX' 'TVSZY' 'QCVFL' 'YUJYCD' 'AKQIW'
 'DKAOAD' 'IYRY' 'IAYS' 'WADDHX' 'PVEZUI' 'RLHBXG' 'ESWP' 'FNAGW' 'NILZVW'
 'HQWC' 'GSTU' 'WLVCSU' 'URVDB' 'MPZVM' 'OIQBP' 'LYLV' 'LALI' 'PVTD'
 'YNBFK' 'HMBZZ' 'VBHMK' 'PJTU' 'JQDDKQ' 'YEHHW' 'GIKR' 'AHSBJZ' 'OWKXL'
 'UKDBVP' 'YFEWQ' 'MUBU' 'CPDIV' 'ACKMX' 'XLNCQ' 'GXZVB' 'OYHNF' 'BBQJR'
 'WCLDW' 'WXOZR' 'JYGE' 'WREJO' 'RYXNHM' 'MZFOY' 'RSTPL' 'MKDRCL' 'JJPICI'
 'QIWX' 'QJCDKV' 'DBPFE' 'BFPVXC' 'LSUJ' 'GDHC' 'EIZCGW' 'LVAB' 'AWNVTB'
 'ZTMXAP']
Number of Unique Values: 99
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2002 

We also check the dtypes and basic info of each column for any null columns.

In [59]:
print('Base Dataset') #Check revenue growth for null
print(expanded_df.dtypes)
print(expanded_df.info())
print('-' * 60)

print('Partial Dataset') #Check revenue, revenue growth, net income, operating expenses for null
print(partial_df.dtypes)
print(partial_df.info())
print('-' * 60)

print('Historical Dataset') #Check revenue, cost of revenue, gross profit for null
print(historical_df.dtypes) 
print(historical_df.info())
print('-' * 60)

print('Sparse Dataset')
print(sparse_df.dtypes)
print(sparse_df.info())
print('-' * 60)

Base Dataset
Stock                           object
Year                             int64
Revenue                        float64
Revenue Growth                 float64
Operating Income               float64
Net Income                     float64
Net Income Ratio               float64
Gross Profit                   float64
Gross Profit Ratio             float64
Operating Expenses             float64
Cost of Revenue                float64
EBITDA                         float64
Interest Expense               float64
Depreciation & Amortization    float64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100240 entries, 0 to 100239
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Stock                        100240 non-null  object 
 1   Year                         100240 non-null  int64  
 2   Revenue                      100240 non-null  float64
 3   Revenue Growth  

### Check and filter out impossible years

This function checks whether years in the 'year' column are greater than a maximum year.

The 'year_column' is a keyword argument that fits the 'year' column. 'max_year' is also a keyword argument that represents the max year.

In this case, we find out whether any column exceeds year 2024.

In [60]:
def check_years(df, year_column, max_year):
    
    # Flag rows where the year is out of range
    df['is_out_of_range'] = df[year_column] > max_year

    # out_of_range represents years that exceed max_year
    # valid_rows represents years before year max_year
    out_of_range = df[df['is_out_of_range']]
    valid_rows = df[~df['is_out_of_range']]
    
    return {
        "out_of_range": out_of_range,
        "valid_rows": valid_rows,
        "flagged_df": df
    }

In [61]:
expanded_df_years = check_years(expanded_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(expanded_df_years["out_of_range"])

print("\nValid rows:")
print(expanded_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(expanded_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, Revenue Growth, Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses, Cost of Revenue, EBITDA, Interest Expense, Depreciation & Amortization, is_out_of_range]
Index: []

Valid rows:
         Stock  Year       Revenue  Revenue Growth  Operating Income  \
0        BRK-A  2023  3.644820e+11             NaN      1.231960e+11   
1        BRK-A  2022  2.341900e+11             NaN     -2.698500e+10   
2        BRK-A  2021  3.546360e+11             NaN     -8.242500e+10   
3        BRK-A  2020  2.862560e+11             NaN     -1.135820e+11   
4        BRK-A  2019  3.272230e+11             NaN     -7.723400e+10   
...        ...   ...           ...             ...               ...   
100235  FMKJRM  2017  1.669641e+09       -0.033992      4.244411e+08   
100236    IWCM  2024  2.011885e+08       -0.052957      2.713820e+07   
100237   FBBAT  2003  1.333766e+09       -0.082426     

In [62]:
partial_df_years = check_years(partial_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(partial_df_years["out_of_range"])

print("\nValid rows:")
print(partial_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(partial_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, Revenue Growth, Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses, Cost of Revenue, EBITDA, Interest Expense, Depreciation & Amortization, is_out_of_range]
Index: []

Valid rows:
         Stock  Year       Revenue  Revenue Growth  Operating Income  \
0        BRK-A  2023  3.644820e+11             NaN      1.231960e+11   
1        BRK-A  2022           NaN             NaN     -2.698500e+10   
5          NVR  2023  9.534889e+09             NaN      1.822455e+09   
6          NVR  2022  1.053829e+10             NaN      2.169184e+09   
10        BKNG  2023  2.136500e+10             NaN      5.835000e+09   
...        ...   ...           ...             ...               ...   
100156    CMMY  2022  2.239456e+09        0.099316      1.254194e+08   
100163    MUMO  2023  1.037629e+09        0.181497      1.823199e+08   
100165  PGVKAG  2023  2.750769e+08        0.020694     

17822 rows in Historical Dataset are out of range.

In [63]:
historical_df_years = check_years(historical_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(historical_df_years["out_of_range"])

print("\nValid rows:")
print(historical_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(historical_df_years["flagged_df"])

Out-of-range rows:
        Stock  Year       Revenue  Cost of Revenue  Gross_Profit  \
0       GOLXD  2184  9.680579e+08     6.308393e+08  3.372186e+08   
1       OKDZW  2192  4.770210e+08     2.977351e+08  1.792859e+08   
2       PGOPC  2168  9.445676e+08     6.795297e+08  2.650379e+08   
3        FSPF  2151  1.623014e+08     1.034746e+08  5.882680e+07   
5       KDMTE  2198  6.519775e+08     4.517509e+08  2.002266e+08   
...       ...   ...           ...              ...           ...   
19995  DUOJBW  2162  1.739624e+09     5.799098e+08           NaN   
19996   RZYXV  2194  2.511675e+08     1.142958e+08           NaN   
19997   MXKTG  2138           NaN              NaN           NaN   
19998   XGMTN  2136           NaN              NaN           NaN   
19999   XEZIM  2160           NaN              NaN           NaN   

       Gross Profit  is_out_of_range  
0               NaN             True  
1               NaN             True  
2               NaN             True  
3       

We filter out rows that are out of range, and keep only rows within the valid range of until 2024.

Unique values in the 'Year' column are checked again.

In [64]:
historical_df = historical_df.loc[historical_df_years['valid_rows'].index]

In [65]:
historical_df['Year'].unique()

array([2014, 2017, 2011, 2015, 2016, 2018, 2020, 2019, 2012, 2013, 2010],
      dtype=int64)

In [66]:
sparse_df_years = check_years(sparse_df, year_column='Year', max_year=2099)
print("Out-of-range rows:")
print(sparse_df_years["out_of_range"])

print("\nValid rows:")
print(sparse_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(sparse_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, is_out_of_range]
Index: []

Valid rows:
      Stock  Year     Revenue  is_out_of_range
0      JGPF  2002   533645067            False
1      JGPF  2003   191962652            False
2     HSCGS  2001   300527108            False
3     HSCGS  2002  1486372237            False
4     HSCGS  2005  1350321861            False
..      ...   ...         ...              ...
442  ZTMXAP  2001  1360600396            False
443  ZTMXAP  2003   257623495            False
444  ZTMXAP  2005   483787674            False
445  ZTMXAP  2006   793009584            False
446  ZTMXAP  2008  1808217019            False

[443 rows x 4 columns]

Dataframe with flagged column:
      Stock  Year     Revenue  is_out_of_range
0      JGPF  2002   533645067            False
1      JGPF  2003   191962652            False
2     HSCGS  2001   300527108            False
3     HSCGS  2002  1486372237            False
4     HSCGS  2005  1350321861         

### Rename values in 'Stock' column that end with ' .end' in Historical Dataset

We replace both '.end' and ' .end' at the end of the string using regex.

After that, we print out the stock names to see whether they are all named correctly.

In [68]:
historical_df['Stock'] = historical_df['Stock'].str.replace(r'\s?\.end$', '', regex=True)

In [69]:
historical_df['Stock'].unique()

array(['TBION', 'HMUXK', 'UWYVPZ', 'SEOBWL', 'KHNKA', 'ZONI', 'TQWB',
       'SYART', 'XCWI', 'DKFS', 'AHOLHZ', 'SNSWR', 'ESZU', 'CBENTE',
       'BAET', 'WUAPX', 'MJJJT', 'RHWJ', 'GEQJW', 'LJVL', 'MHTQR',
       'UVOIB', 'PMUY', 'QRDU', 'TUWQO', 'HPZUZ', 'BODXS', 'TBLHW',
       'BOOGWT', 'MSLC', 'CSXRFF', 'IGLP', 'CFRQ', 'BGCVR', 'DBOZU',
       'GQESMN', 'IMLWA', 'ZIGNSP', 'ZMNUXG', 'NBQK', 'CKHS', 'KBTPAN',
       'NIQTOM', 'MJVLN', 'PCOSBN', 'MTTH', 'AVNCI', 'HWEVQX', 'XGYW',
       'QOCDZY', 'INZP', 'NOVZI', 'PGEFK', 'FWSPZB', 'BMGM', 'JXREJ',
       'OTRHID', 'XDCHVO', 'ADLLZX', 'QDZH', 'XRZQ', 'XNMM', 'AEXZMU',
       'FFRAVF', 'YFIZ', 'WRLLF', 'PDLVXN', 'FDWEU', 'NTZQO', 'JBQBJ',
       'MQBHG', 'MQPN', 'AMVT', 'DOQGLN', 'MLWODF', 'MHKEQ', 'WBAC',
       'XRGJ', 'NKBF', 'UUTDFC', 'EODSLX', 'QMWW', 'VDXZN', 'OXNZ',
       'EYPA', 'XBSY', 'CKZS', 'WOYU', 'XPHLDB', 'XNEG', 'LGIGR', 'RRGZ',
       'UXRNLG', 'KBAKCF', 'PJUGAF', 'BJQIZN', 'IMBWSH', 'YEHR', 'IELVMR',
       'TMEB', '

### Check for impossible negative values

We know that the following columns cannot have negative values:
**Revenue, Operating Expenses, Cost of Revenue, Interest Expense, Depreciation & Amortization**

We iterate this function over each column to check for any impossible negative values.

We then create a valid_columns list to only keep rows with no impossible negative values.

In [41]:
def drop_rows_with_negatives(df):
    columns_to_check = ['Revenue', 'Operating Expenses', 'Cost of Revenue', 'Interest Expense', 'Depreciation & Amortization']

    # For each column within columns that are not supposed to have negative values
    for column in columns_to_check:
        if column in df.columns: # Check first if the column exist in the datasset
            # Check for negative values in the column
            has_negative = df[column].lt(0).any()
            if has_negative:
                print(f"Column '{column}' contains negative values.")
            else:
                print(f"Column '{column} does not contain any negative values.'")
        else: # If the column is not present in the dataset, we skip it
            print(f"Column '{column}' is not present in the DataFrame. Skipping.")
            
    # Only keep rows with no impossible negative values
    valid_columns = [col for col in columns_to_check if col in df.columns]
    rows_with_negatives = df[df[valid_columns].lt(0).any(axis=1)]
    return df.drop(rows_with_negatives.index)

In [42]:
expanded_df = drop_rows_with_negatives(expanded_df)

Column 'Revenue does not contain any negative values.'
Column 'Operating Expenses' contains negative values.
Column 'Cost of Revenue' contains negative values.
Column 'Interest Expense' contains negative values.
Column 'Depreciation & Amortization' contains negative values.


In [43]:
partial_df = drop_rows_with_negatives(partial_df)

Column 'Revenue does not contain any negative values.'
Column 'Operating Expenses' contains negative values.
Column 'Cost of Revenue' contains negative values.
Column 'Interest Expense does not contain any negative values.'
Column 'Depreciation & Amortization' contains negative values.


In [44]:
historical_df = drop_rows_with_negatives(historical_df)

Column 'Revenue' contains negative values.
Column 'Operating Expenses' is not present in the DataFrame. Skipping.
Column 'Cost of Revenue' contains negative values.
Column 'Interest Expense' is not present in the DataFrame. Skipping.
Column 'Depreciation & Amortization' is not present in the DataFrame. Skipping.


In [45]:
sparse_df = drop_rows_with_negatives(sparse_df)

Column 'Revenue does not contain any negative values.'
Column 'Operating Expenses' is not present in the DataFrame. Skipping.
Column 'Cost of Revenue' is not present in the DataFrame. Skipping.
Column 'Interest Expense' is not present in the DataFrame. Skipping.
Column 'Depreciation & Amortization' is not present in the DataFrame. Skipping.


### Dealing with missing values

In [46]:
# Calculate the percentage of missing values
def missing_values_check(df):
    missing_values = df.isnull().sum()
    percent = (missing_values * 100) / len(df)
    
    # Create a DataFrame to display missing values information
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'No. of Missing Values': missing_values.values,
        'Percentage': percent.values
    }).sort_values(by='Percentage', ascending=False)

    return missing_df

In [47]:
print(missing_values_check(expanded_df))
print('-'*80)
print(missing_values_check(partial_df))
print('-'*80)
print(missing_values_check(historical_df))
print('-'*80)
print(missing_values_check(sparse_df))

                         Column  No. of Missing Values  Percentage
3                Revenue Growth                    213    0.212547
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
2                       Revenue                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

The number of missing values in the Partial and Historical dataset hold a significant percentage of the datasets, which affects analysis later on.

We will try to fill in the missing values as best as we can, based on previous year revenues or calculations between columns.

### Managing Missing Values

### Expanded Dataset: Revenue Growth

In [48]:
# For each stock, fill in revenue growth NaN based on revenue in the previous year
# Ensure 'Year' column is in integer format
expanded_df['Year'] = expanded_df['Year'].astype(int)

# Sort by stock and year
expanded_df = expanded_df.sort_values(by=['Stock', 'Year'])

# Calculate revenue growth where it is NaN
expanded_df['Revenue Growth'] = expanded_df.groupby('Stock')['Revenue'].apply(
    lambda x: x.pct_change(fill_method=None).fillna(0)
).reset_index(drop=True)

In [49]:
# Confirm that the only NaN left are stocks with no prior revenues
expanded_df_NaN = expanded_df[expanded_df['Revenue Growth'].isna()]

In [50]:
expanded_df_NaN

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,is_out_of_range
100217,AODFB,2004,1780660000.0,,468750400.0,468463600.0,0.263084,936921600.0,0.526165,468171200.0,843738600.0,477084400.0,286758.0,8333984.0,False
100232,AOFZYV,2016,275026700.0,,93703070.0,92709960.0,0.337094,150874400.0,0.548581,57171340.0,124152300.0,100593600.0,993114.0,6890492.0,False
100226,BMQJ,2020,843116800.0,,245218300.0,244892600.0,0.290461,498076000.0,0.590756,252857700.0,345040800.0,251734500.0,325753.0,6516184.0,False
100237,FBBAT,2003,1333766000.0,,233408700.0,232894000.0,0.174614,681125300.0,0.510678,447716600.0,652640600.0,240057100.0,514686.0,6648408.0,False
100235,FMKJRM,2017,1669641000.0,,424441100.0,423755800.0,0.253801,794979700.0,0.476138,370538700.0,874660900.0,428400500.0,685287.0,3959418.0,False
100223,GXXAE,2018,439924400.0,,104331600.0,104215000.0,0.236893,256016100.0,0.581955,151684400.0,183908300.0,110502900.0,116655.0,6171271.0,False
100227,HDOIIU,2024,1180558000.0,,211958900.0,211190400.0,0.17889,621916900.0,0.526799,409958100.0,558640900.0,219729800.0,768468.0,7770933.0,False
100228,HNECUE,2007,1789047000.0,,123013400.0,122808700.0,0.068645,600744300.0,0.33579,477730900.0,1188303000.0,126393800.0,204687.0,3380443.0,False
100231,HXFIYE,2020,430351700.0,,122859700.0,122343900.0,0.284288,244712400.0,0.568634,121852700.0,185639300.0,129310900.0,515872.0,6451176.0,False
100234,IBOIC,2014,633549500.0,,64079810.0,63661100.0,0.100483,300400900.0,0.474155,236321100.0,333148600.0,69262070.0,418713.0,5182259.0,False


### Partial Dataset: Operating Expenses, Revenue, Net Income, Revenue Growth

In [18]:
partial_df.columns

Index(['Stock', 'Year', 'Revenue', 'Revenue Growth', 'Operating Income',
       'Net Income', 'Net Income Ratio', 'Gross Profit', 'Gross Profit Ratio',
       'Operating Expenses', 'Cost of Revenue', 'EBITDA', 'Interest Expense',
       'Depreciation & Amortization', 'is_out_of_range'],
      dtype='object')

In [51]:
partial_df[['Operating Expenses', 'Interest Expense', 'Net Income', 'Revenue Growth']]

Unnamed: 0,Operating Expenses,Interest Expense,Net Income,Revenue Growth
0,1.985750e+11,5.003000e+09,,
1,,4.352000e+09,-2.281900e+10,
2,,2.774000e+07,1.591611e+09,
3,,3.952400e+07,1.725575e+09,
4,,8.970000e+08,4.289000e+09,
...,...,...,...,...
8091,8.673782e+08,3.752040e+05,1.250442e+08,0.099316
8092,3.170169e+08,3.799450e+05,1.819399e+08,0.181497
8093,1.070119e+08,4.119530e+05,2.865713e+06,0.020694
8094,1.300899e+08,3.788180e+05,,0.001772


#### Operating Expenses = Gross Profit - Operating Income

In [52]:
# Fill NaN in 'Operating Expenses' only if both 'Gross Profit' and 'Operating Income' are available
partial_df['Operating Expenses'] = partial_df.apply(
    lambda row: row['Gross Profit'] - row['Operating Income']
    if pd.isna(row['Operating Expenses']) and pd.notna(row['Gross Profit']) and pd.notna(row['Operating Income'])
    else row['Operating Expenses'], axis=1
)

In [53]:
partial_df[['Operating Expenses', 'Gross Profit', 'Operating Income']]

Unnamed: 0,Operating Expenses,Gross Profit,Operating Income
0,1.985750e+11,1.231960e+11,1.231960e+11
1,1.368560e+11,1.098710e+11,-2.698500e+10
2,6.612360e+08,2.483691e+09,1.822455e+09
3,7.068320e+08,2.876016e+09,2.169184e+09
4,1.552500e+10,2.136000e+10,5.835000e+09
...,...,...,...
8091,8.673782e+08,9.927977e+08,1.254194e+08
8092,3.170169e+08,4.993368e+08,1.823199e+08
8093,1.070119e+08,1.102896e+08,3.277666e+06
8094,1.300899e+08,1.837651e+08,5.367525e+07


In [54]:
# Check if Operating Expenses still has NaN
if partial_df['Operating Expenses'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Operating Expenses'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [55]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
2                       Revenue                   1619   20.004943
5                    Net Income                    810   10.008649
3                Revenue Growth                     91    1.124429
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
4              Operating Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Net Income = Operating Income - Interest Expense

In [57]:
# Fill NaN in 'Net Income' only when Operating Income, Interest Expense are available
partial_df['Net Income'] = partial_df.apply(
    lambda row: row['Operating Income'] - row['Interest Expense']
    if pd.isna(row['Net Income']) and pd.notna(row['Operating Income']) and pd.notna(row['Interest Expense'])
    else row['Net Income'],
    axis=1
)

In [58]:
partial_df[['Net Income', 'Operating Income', 'Interest Expense']]

Unnamed: 0,Net Income,Operating Income,Interest Expense
0,1.181930e+11,1.231960e+11,5.003000e+09
1,-2.281900e+10,-2.698500e+10,4.352000e+09
2,1.591611e+09,1.822455e+09,2.774000e+07
3,1.725575e+09,2.169184e+09,3.952400e+07
4,4.289000e+09,5.835000e+09,8.970000e+08
...,...,...,...
8091,1.250442e+08,1.254194e+08,3.752040e+05
8092,1.819399e+08,1.823199e+08,3.799450e+05
8093,2.865713e+06,3.277666e+06,4.119530e+05
8094,5.329643e+07,5.367525e+07,3.788180e+05


In [59]:
# Check if Net Income still has NaN
if partial_df['Net Income'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Net Income'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [60]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
2                       Revenue                   1619   20.004943
3                Revenue Growth                     91    1.124429
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Revenue = Gross Profit + Cost of Revenue

In [61]:
# Fill NaN in 'Revenue' only when Gross Profit and Cost of Revenue are available
partial_df['Revenue'] = partial_df.apply(
    lambda row: row['Gross Profit'] + row['Cost of Revenue']
    if pd.isna(row['Revenue']) and pd.notna(row['Gross Profit']) and pd.notna(row['Cost of Revenue'])
    else row['Revenue'],
    axis=1
)

In [62]:
partial_df[['Revenue', 'Gross Profit', 'Cost of Revenue']]

Unnamed: 0,Revenue,Gross Profit,Cost of Revenue
0,3.644820e+11,1.231960e+11,2.412860e+11
1,2.341900e+11,1.098710e+11,1.243190e+11
2,9.534889e+09,2.483691e+09,7.051198e+09
3,1.053829e+10,2.876016e+09,7.662271e+09
4,2.136500e+10,2.136000e+10,5.000000e+06
...,...,...,...
8091,2.239456e+09,9.927977e+08,1.246658e+09
8092,1.037629e+09,4.993368e+08,5.382918e+08
8093,2.750769e+08,1.102896e+08,1.647873e+08
8094,4.155206e+08,1.837651e+08,2.317555e+08


In [63]:
# Check if Revenue still has NaN values
if partial_df['Revenue'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Revenue'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [64]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
3                Revenue Growth                     91    1.124429
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
2                       Revenue                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Revenue Growth = % Change in Revenue from Previous Year

In [65]:
# For each stock, fill in revenue growth NaN based on revenue in the previous year
# Ensure 'Year' column is in integer format
partial_df['Year'] = partial_df['Year'].astype(int)

# Sort by stock and year
partial_df = partial_df.sort_values(by=['Stock', 'Year'])

# Calculate revenue growth where it is NaN
partial_df['Revenue Growth'] = partial_df.groupby('Stock')['Revenue'].apply(
    lambda x: x.pct_change().fillna(0)
).reset_index(level=0, drop=True)  # Reset the group index, drop the group level

In [66]:
partial_df[['Stock', 'Year', 'Revenue', 'Revenue Growth']]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth
5007,AAFRW,2022,7.097898e+08,0.000000
130,AAFRW,2023,2.947645e+07,-0.958472
1077,AAHFBX,2022,1.148219e+09,0.000000
2326,AAHFBX,2023,1.223013e+09,0.065139
2447,AALB,2022,1.012524e+09,0.000000
...,...,...,...,...
2011,ZZWA,2023,1.292421e+09,-0.302745
5625,ZZZBKF,2022,2.321441e+08,0.000000
753,ZZZBKF,2023,3.261039e+08,0.404748
6105,ZZZBQ,2022,1.128191e+09,0.000000


### Historical Dataset: Gross Profit, Cost of Revenue, Revenue

In [68]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
4     Gross_Profit                    599   65.250545
5     Gross Profit                    580   63.180828
3  Cost of Revenue                    261   28.431373
2          Revenue                    185   20.152505
0            Stock                      0    0.000000
1             Year                      0    0.000000
6  is_out_of_range                      0    0.000000


At this point, we also drop the 'Gross_Profit' column as it is a duplicate to the actual 'Gross Profit'.

In [69]:
historical_df = historical_df.drop(columns=['Gross_Profit'])

#### Gross Profit = Revenue - Cost of Revenue

In [70]:
# Fill NaN in 'Gross Profit' only when Revenue and Cost of Revenue are available
historical_df['Gross Profit'] = historical_df.apply(
    lambda row: row['Revenue'] - row['Cost of Revenue']
    if pd.isna(row['Gross Profit']) and pd.notna(row['Revenue']) and pd.notna(row['Cost of Revenue'])
    else row['Gross Profit'],
    axis=1
)

In [71]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    261   28.431373
4     Gross Profit                    261   28.431373
2          Revenue                    185   20.152505
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


#### Cost of Revenue = Revenue - Gross Profit

In [72]:
# Fill NaN in 'Cost of Revenue' only when Revenue and Gross Profit are available
historical_df['Cost of Revenue'] = historical_df.apply(
    lambda row: row['Revenue'] - row['Gross Profit']
    if pd.isna(row['Cost of Revenue']) and pd.notna(row['Revenue']) and pd.notna(row['Gross Profit'])
    else row['Cost of Revenue'],
    axis=1
)

In [73]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    261   28.431373
4     Gross Profit                    261   28.431373
2          Revenue                    185   20.152505
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


#### Revenue = Cost of Revenue + Gross Profit

In [74]:
# Fill NaN in 'Revenue' only when Cost of Revenue and Gross Profit are available
historical_df['Revenue'] = historical_df.apply(
    lambda row: row['Cost of Revenue'] + row['Gross Profit']
    if pd.isna(row['Revenue']) and pd.notna(row['Cost of Revenue']) and pd.notna(row['Gross Profit'])
    else row['Revenue'],
    axis=1
)

In [46]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    261   28.431373
4     Gross Profit                    261   28.431373
2          Revenue                    185   20.152505
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


In [75]:
# Check the shape of Historical Dataset
historical_df.shape

(918, 6)

In [76]:
# Remove rows with no Revenue
historical_df = historical_df.drop(historical_df[historical_df['Revenue'].isna()].index)

#### Drop helper columns

In [77]:
expanded_df = expanded_df.drop(columns=['is_out_of_range'])

In [78]:
partial_df = partial_df.drop(columns=['is_out_of_range'])

In [79]:
historical_df = historical_df.drop(columns=['is_out_of_range'])

In [80]:
sparse_df = sparse_df.drop(columns=['is_out_of_range'])

In [81]:
print(missing_values_check(sparse_df))

    Column  No. of Missing Values  Percentage
0    Stock                      0         0.0
1     Year                      0         0.0
2  Revenue                      0         0.0


### Checking Actual vs Expected value of columns

For each financial metric, the actual values might not follow the expected values.

For example, if EBITDA = Operating Income - Depreciation & Amortization, the actual value might not follow this formula. 

This could be because there are adjustments in real-world reporting or human error from inputting the incorrect value, which can result in them to be there by default.

Depending on how many rows are mismatched for each financial metric, we will decide whether to retain these mismatched rows for analysis later on.

#### Checking EBITDA (Operating Income - D&A)

In [19]:
def ebitda_calculation(df):
    # Calculate expected EBITDA
    df['expected_EBITDA'] = df['Operating Income'] + df['Depreciation & Amortization']
    # Compare expected EBITDA with the actual EBITDA, also considering small floating-point differences
    df['EBITDA_match'] = np.isclose(df['expected_EBITDA'], df['EBITDA'])

In [20]:
ebitda_calculation(expanded_df)

In [21]:
ebitda_calculation(partial_df)

In [2]:
# 181 out of 100213 rows in Expanded Dataset still do not match the expected EBITDA formula.
expanded_df[expanded_df['EBITDA_match'] == False]

NameError: name 'expanded_df' is not defined

In [91]:
# 79 out of 8093 rows in Partial Dataset still do not match the expected EBITDA formula
partial_df[partial_df['EBITDA_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match
216,ARGX,2022,4.107460e+08,0.000000,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,2.943100e+07,-5.820093e+08,3.906000e+06,1.043420e+08,-6.153210e+08,False
217,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,1.178350e+08,-2.160230e+08,9.040000e+05,1.114794e+08,-3.135696e+08,False
232,ASML,2022,2.117340e+10,0.000000,7.321000e+09,5.624200e+09,0.265626,1.051270e+10,0.496505,3.191700e+09,1.066070e+10,6.997600e+09,6.080000e+07,6.407000e+08,7.961700e+09,False
276,AVGOP,2022,3.320300e+10,0.000000,1.422500e+10,1.149500e+10,0.346204,2.209500e+10,0.665452,7.813000e+09,1.110800e+10,1.915500e+10,1.737000e+09,4.984000e+09,1.920900e+10,False
277,AVGOP,2023,3.581900e+10,0.078788,1.620700e+10,1.458500e+10,0.393143,2.469000e+10,0.689299,8.483000e+09,1.112900e+10,2.055400e+10,1.622000e+09,3.835000e+09,2.004200e+10,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6468,URI,2022,1.164200e+10,0.000000,3.232000e+09,2.787000e+09,0.180811,4.996000e+09,0.429136,1.764000e+09,6.646000e+09,5.464000e+09,4.450000e+08,2.217000e+09,5.449000e+09,False
6469,URI,2023,1.433200e+10,0.231060,3.827000e+09,2.424000e+09,0.169132,5.813000e+09,0.405596,2.094000e+09,8.519000e+09,6.627000e+09,6.350000e+08,2.781000e+09,6.608000e+09,False
6478,USB-PA,2023,2.801300e+10,0.000000,7.846000e+09,5.429000e+09,0.193803,2.801300e+10,1.000000,2.801300e+10,0.000000e+00,7.855000e+09,1.261100e+10,1.018000e+09,8.864000e+09,False
6899,WFC-PL,2022,7.378500e+10,0.000000,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.881500e+10,0.000000e+00,2.227800e+10,9.074000e+09,7.309000e+09,2.227900e+10,False


#### Checking Net Income (Operating Income - Interest Expense)

In [92]:
def netincome_calculation(df):
    # Calculate expected net income
    df['expected_netincome'] = df['Operating Income'] - df['Interest Expense']
    # Compare expected net income with the actual net income, also considering small floating-point differences
    df['netincome_match'] = np.isclose(df['expected_netincome'], df['Net Income'])

In [93]:
netincome_calculation(expanded_df)

In [122]:
expanded_df.columns

Index(['Stock', 'Year', 'Revenue', 'Revenue Growth', 'Operating Income',
       'Net Income', 'Net Income Ratio', 'Gross Profit', 'Gross Profit Ratio',
       'Operating Expenses', 'Cost of Revenue', 'EBITDA', 'Interest Expense',
       'Depreciation & Amortization', 'expected_EBITDA', 'EBITDA_match',
       'expected_netincome', 'netincome_match', 'expected_netincome_ratio',
       'netincomeratio_match', 'expected_grossprofit_ratio',
       'grossprofitratio_match', 'expected_grossprofit', 'grossprofit_match',
       'expected_revenue', 'revenue_match', 'expected_revenuegrowth',
       'revenuegrowth_match', 'expected_costofrevenue', 'costofrevenue_match',
       'expected_operatingincome', 'operatingincome_match',
       'expected_operatingexpenses', 'operatingexpenses_match',
       'expected_depreciationamortization', 'depreciationamortization_match'],
      dtype='object')

In [94]:
netincome_calculation(partial_df)

In [97]:
# 213 out of 100213 rows in Expanded Dataset still do not match the expected Net Income formula
expanded_df[expanded_df['netincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match
2699,ARGX,2019,7.811609e+07,-0.469980,-1.998759e+08,-1.824254e+08,-2.335311,-1.185681e+08,-1.517845,2.792189e+08,196684207.0,-1.776956e+08,1.388070e+05,2.424651e+06,-1.974513e+08,False,-2.000147e+08,False
2700,ARGX,2020,4.484817e+07,2.949865,-5.143769e+08,-6.512349e+08,-14.520879,-2.590780e+08,-5.776780,5.623561e+08,303926197.0,-6.016670e+08,1.740983e+06,4.221946e+06,-5.101549e+08,False,-5.161178e+08,False
2701,ARGX,2021,4.972770e+08,0.120119,-3.487460e+08,-4.082650e+08,-0.821001,-1.276787e+07,-0.025676,8.460230e+08,510044872.0,-3.144211e+08,1.096000e+06,5.867000e+06,-3.428790e+08,False,-3.498420e+08,False
2702,ARGX,2022,4.107460e+08,0.138077,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,29431000.0,-5.820093e+08,3.906000e+06,1.043420e+08,-6.153210e+08,False,-7.235690e+08,False
2703,ARGX,2023,1.226316e+09,0.000000,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,117835000.0,-2.160230e+08,9.040000e+05,1.114794e+08,-3.135696e+08,False,-4.259530e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85312,WFC-PL,2019,8.506300e+10,-0.441807,2.596700e+10,1.971500e+10,0.231769,8.506300e+10,1.000000,5.826000e+10,0.0,0.000000e+00,1.885200e+10,7.923000e+09,3.389000e+10,False,7.115000e+09,False
85313,WFC-PL,2020,7.234000e+10,-0.666893,4.041000e+09,1.786000e+09,0.024689,7.234000e+10,1.000000,5.766700e+10,0.0,0.000000e+00,7.963000e+09,8.974000e+09,1.301500e+10,False,-3.922000e+09,False
85314,WFC-PL,2021,7.849200e+10,-0.496299,2.885100e+10,2.210900e+10,0.281672,7.849200e+10,1.000000,5.379000e+10,0.0,3.731000e+10,3.915000e+09,8.494000e+09,3.734500e+10,False,2.493600e+10,False
85315,WFC-PL,2022,7.378500e+10,-0.178047,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.727800e+10,0.0,2.227800e+10,9.074000e+09,7.309000e+09,2.227900e+10,False,5.896000e+09,False


In [99]:
# 81 out of 8093 rows in Partial Dataset still do not match the expected Net Income formula
partial_df[partial_df['netincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match
216,ARGX,2022,4.107460e+08,0.000000,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,2.943100e+07,-5.820093e+08,3.906000e+06,1.043420e+08,-6.153210e+08,False,-7.235690e+08,False
217,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,1.178350e+08,-2.160230e+08,9.040000e+05,1.114794e+08,-3.135696e+08,False,-4.259530e+08,False
232,ASML,2022,2.117340e+10,0.000000,7.321000e+09,5.624200e+09,0.265626,1.051270e+10,0.496505,3.191700e+09,1.066070e+10,6.997600e+09,6.080000e+07,6.407000e+08,7.961700e+09,False,7.260200e+09,False
233,ASML,2023,2.755850e+10,0.301562,9.042300e+09,7.839000e+09,0.284449,1.413610e+10,0.512949,5.093800e+09,1.342240e+10,9.698700e+09,1.527000e+08,6.564000e+08,9.698700e+09,True,8.889600e+09,False
276,AVGOP,2022,3.320300e+10,0.000000,1.422500e+10,1.149500e+10,0.346204,2.209500e+10,0.665452,7.813000e+09,1.110800e+10,1.915500e+10,1.737000e+09,4.984000e+09,1.920900e+10,False,1.248800e+10,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6211,TYL,2023,1.951751e+09,0.054884,2.185370e+08,1.659190e+08,0.085010,7.864670e+08,0.402955,5.679300e+08,1.165284e+09,3.926320e+08,2.362900e+07,1.707670e+08,3.893040e+08,False,1.949080e+08,False
6469,URI,2023,1.433200e+10,0.231060,3.827000e+09,2.424000e+09,0.169132,5.813000e+09,0.405596,2.094000e+09,8.519000e+09,6.627000e+09,6.350000e+08,2.781000e+09,6.608000e+09,False,3.192000e+09,False
6478,USB-PA,2023,2.801300e+10,0.000000,7.846000e+09,5.429000e+09,0.193803,2.801300e+10,1.000000,2.801300e+10,0.000000e+00,7.855000e+09,1.261100e+10,1.018000e+09,8.864000e+09,False,-4.765000e+09,False
6899,WFC-PL,2022,7.378500e+10,0.000000,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.881500e+10,0.000000e+00,2.227800e+10,9.074000e+09,7.309000e+09,2.227900e+10,False,5.896000e+09,False


#### Checking Gross Profit (Revenue - Cost of Revenue)

In [115]:
def grossprofit_calculation(df):
    # Calculate expected gross profit
    df['expected_grossprofit'] = df['Revenue'] - df['Cost of Revenue']
    
    # Compare expected gross profit with the actual gross profit, considering small floating-point differences
    df['grossprofit_match'] = np.isclose(df['expected_grossprofit'], df['Gross Profit'])

In [116]:
grossprofit_calculation(expanded_df)

In [117]:
grossprofit_calculation(partial_df)

In [118]:
# 4 out of 100213 rows in Expanded Dataset still do not match the expected Gross Profit formula
expanded_df[expanded_df['grossprofit_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match
4273,BAC-PL,2019,85582000000.0,-0.764196,38840000000.0,27430000000.0,0.320511,0.0,0.0,46742000000.0,...,40569000000.0,False,16495000000.0,False,0.320511,True,0.0,True,85582000000.0,False
4274,BAC-PL,2020,74208000000.0,9.220948,25725000000.0,17894000000.0,0.241133,0.0,0.0,48483000000.0,...,27568000000.0,False,17500000000.0,False,0.241133,True,0.0,True,74208000000.0,False
4275,BAC-PL,2021,93707000000.0,-0.894823,40946000000.0,31978000000.0,0.341255,0.0,0.0,52761000000.0,...,42844000000.0,False,36208000000.0,False,0.341255,True,0.0,True,93707000000.0,False
4276,BAC-PL,2022,92407000000.0,0.243336,38643000000.0,27528000000.0,0.2979,0.0,0.0,53764000000.0,...,40621000000.0,False,18540000000.0,False,0.2979,True,0.0,True,92407000000.0,False


In [119]:
# Only 1 row in Partial Dataset does not match
partial_df[partial_df['grossprofit_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match
350,BAC-PL,2022,92407000000.0,0.0,38643000000.0,18540000000.0,0.2979,0.0,0.0,-38643000000.0,...,40621000000.0,False,18540000000.0,True,0.200634,False,0.0,True,92407000000.0,False


#### Checking Net Income Ratio (Net Income / Revenue)

In [14]:
def netincomeratio_calculation(df):
    # Calculate expected net income ratio
    df['expected_netincome_ratio'] = df['Net Income'] / df['Revenue']
    
    # Handle the case where Revenue is zero to avoid division by zero
    df['expected_netincome_ratio'] = df['expected_netincome_ratio'].where(df['Revenue'] != 0, 0)
    
    # Compare expected net income ratio with the actual net income ratio, considering small floating-point differences
    df['netincomeratio_match'] = np.isclose(df['expected_netincome_ratio'], df['Net Income Ratio'])

In [15]:
netincomeratio_calculation(expanded_df)

In [16]:
netincomeratio_calculation(partial_df)

In [17]:
# All rows in Expanded Dataset match
expanded_df[expanded_df['netincomeratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_operatingincome,operatingincome_match


In [18]:
# 10 out of 8093 rows in Partial Dataset still do not match the expected Net Income Ratio formula
partial_df[partial_df['netincomeratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_operatingincome,operatingincome_match
277,AVGOP,2023,35819000000.0,0.078788,16207000000.0,14585000000.0,0.393143,24690000000.0,0.689299,8483000000.0,...,14585000000.0,True,0.407186,False,0.689299,True,24690000000.0,True,16207000000.0,True
315,AXON,2023,1563391000.0,0.313846,154789000.0,147794000.0,0.111442,955382000.0,0.611096,800593000.0,...,147794000.0,True,0.094534,False,0.611096,True,955382000.0,True,154789000.0,True
341,AZO,2023,17457210000.0,0.074142,3473986000.0,3167614000.0,0.144836,9070422000.0,0.51958,5596436000.0,...,3167614000.0,True,0.18145,False,0.51958,True,9070422000.0,True,3473986000.0,True
350,BAC-PL,2022,92407000000.0,0.0,38643000000.0,18540000000.0,0.2979,0.0,0.0,-38643000000.0,...,18540000000.0,True,0.200634,False,0.0,True,92407000000.0,False,38643000000.0,True
591,BRK-A,2023,364482000000.0,0.556352,123196000000.0,118193000000.0,0.263999,123196000000.0,0.338003,198575000000.0,...,118193000000.0,True,0.324277,False,0.338003,True,123196000000.0,True,-75379000000.0,False
1080,DHR-PA,2022,31471000000.0,0.0,8688000000.0,8477000000.0,0.229068,18949000000.0,0.60211,10261000000.0,...,8477000000.0,True,0.269359,False,0.60211,True,18949000000.0,True,8688000000.0,True
1704,FICO,2022,1377270000.0,0.0,542414000.0,473447000.0,0.271218,1075096000.0,0.780599,532682000.0,...,473447000.0,True,0.343758,False,0.780599,True,1075096000.0,True,542414000.0,True
5340,REGN,2022,12172900000.0,0.0,4738900000.0,4679500000.0,0.356398,10612500000.0,0.871814,5618500000.0,...,4679500000.0,True,0.384419,False,0.871814,True,10612500000.0,True,4994000000.0,False
6097,TPL,2023,631595000.0,-0.05368,486053000.0,486053000.0,0.642255,583272000.0,0.923491,97219000.0,...,486053000.0,True,0.769564,False,0.923491,True,583272000.0,True,486053000.0,True
6468,URI,2022,11642000000.0,0.0,3232000000.0,2787000000.0,0.180811,4996000000.0,0.429136,1764000000.0,...,2787000000.0,True,0.239392,False,0.429136,True,4996000000.0,True,3232000000.0,True


#### Checking Gross Profit Ratio (Gross Profit / Revenue)

In [108]:
def grossprofitratio_calculation(df):
    # Calculate expected gross profit ratio
    df['expected_grossprofit_ratio'] = df['Gross Profit'] / df['Revenue']
    
    # Handle the case where Revenue is zero to avoid division by zero
    df['expected_grossprofit_ratio'] = df['expected_grossprofit_ratio'].where(df['Revenue'] != 0, 0)
    
    # Compare expected gross profit ratio with the actual gross profit ratio, considering small floating-point differences
    df['grossprofitratio_match'] = np.isclose(df['expected_grossprofit_ratio'], df['Gross Profit Ratio'])

In [109]:
grossprofitratio_calculation(expanded_df)

In [110]:
grossprofitratio_calculation(partial_df)

In [111]:
# All rows in Expanded Dataset match
expanded_df[expanded_df['grossprofitratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match


In [113]:
# All rows in Partial Dataset match
partial_df[partial_df['grossprofitratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match


#### Checking Revenue (Gross Profit + Cost of Revenue)

In [9]:
def revenue_calculation(df):
    # Calculate expected revenue
    df['expected_revenue'] = df['Gross Profit'] + df['Cost of Revenue']

    # Compare expected revenue with the actual revenue, considering small floating-point differences
    df['revenue_match'] = np.isclose(df['expected_revenue'], df['Revenue'])

In [10]:
revenue_calculation(expanded_df)

In [11]:
revenue_calculation(partial_df)

In [19]:
revenue_calculation(historical_df)

In [14]:
# 4 out of 100213 rows in Expanded Dataset still do not match
expanded_df[expanded_df['revenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match
4273,BAC-PL,2019,85582000000.0,-0.764196,38840000000.0,27430000000.0,0.320511,0.0,0.0,46742000000.0,...,16495000000.0,False,0.320511,True,0.0,True,85582000000.0,False,0.0,False
4274,BAC-PL,2020,74208000000.0,9.220948,25725000000.0,17894000000.0,0.241133,0.0,0.0,48483000000.0,...,17500000000.0,False,0.241133,True,0.0,True,74208000000.0,False,0.0,False
4275,BAC-PL,2021,93707000000.0,-0.894823,40946000000.0,31978000000.0,0.341255,0.0,0.0,52761000000.0,...,36208000000.0,False,0.341255,True,0.0,True,93707000000.0,False,0.0,False
4276,BAC-PL,2022,92407000000.0,0.243336,38643000000.0,27528000000.0,0.2979,0.0,0.0,53764000000.0,...,18540000000.0,False,0.2979,True,0.0,True,92407000000.0,False,0.0,False


In [15]:
# 1 out of 8093 rows in Partial Dataset still does not match
partial_df[partial_df['revenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match
350,BAC-PL,2022,92407000000.0,0.0,38643000000.0,18540000000.0,0.2979,0.0,0.0,-38643000000.0,...,18540000000.0,True,0.200634,False,0.0,True,92407000000.0,False,0.0,False


In [23]:
false_rows = historical_df[historical_df['revenue_match'] == False]

all_nan = false_rows[['Cost of Revenue', 'Gross Profit']].isna().all().all()

In [25]:
# 76 out of 733 rows in Historical Dataset still do not match
# But we keep the 76 rows anyway because the mismatch is actually caused by NaN in the other columns
if all_nan:
    print("All rows of Cost of Revenue & Gross Profit are NaN")
else:
    print("Not all rows are NaN")

All rows of Cost of Revenue & Gross Profit are NaN


#### Checking Revenue Growth (% Change in Revenue from Previous Year)

In [62]:
def revenuegrowth_calculation(df):
    # Sort Stock and Year first
    df = df.sort_values(by=['Stock', 'Year'])
    
    # Calculate expected revenue growth
    df['expected_revenuegrowth'] = df.groupby('Stock')['Revenue'].pct_change()

    # Fill NaN values in expected_revenuegrowth (e.g., first year or missing data)
    df['expected_revenuegrowth'] = df['expected_revenuegrowth'].fillna(0)

    # Compare expected revenue growth with the actual revenue growth, considering small floating-point differences
    df['revenuegrowth_match'] = np.isclose(df['expected_revenuegrowth'], df['Revenue Growth'], atol=1e-2)

    return df

In [63]:
expanded_df = revenuegrowth_calculation(expanded_df)

In [64]:
partial_df = revenuegrowth_calculation(partial_df)

In [67]:
# Almost all rows in Expanded Dataset still do not match
expanded_df[expanded_df['revenuegrowth_match'] == False]

# We keep these rows but replace the original Revenue Growth with the expected numbers
expanded_df['Revenue Growth'] = expanded_df['expected_revenuegrowth']

In [70]:
# All rows in Partial Dataset match
partial_df[partial_df['revenuegrowth_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match


#### Checking Cost of Revenue (Revenue - Gross Profit)

In [71]:
def costofrevenue_calculation(df):
    # Calculate expected cost of revenue
    df['expected_costofrevenue'] = df['Revenue'] - df['Gross Profit']

    # Compare expected cost of revenue with the actual cost of revenue, considering small floating-point differences
    df['costofrevenue_match'] = np.isclose(df['expected_costofrevenue'], df['Cost of Revenue'])

In [72]:
costofrevenue_calculation(expanded_df)

In [73]:
costofrevenue_calculation(partial_df)

In [75]:
# 4 out of 100213 rows in Expanded Dataset still do not match
expanded_df[expanded_df['costofrevenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match
4273,BAC-PL,2019,85582000000.0,0.0,38840000000.0,27430000000.0,0.320511,0.0,0.0,46742000000.0,...,0.0,True,85582000000.0,False,0.0,False,0.0,False,85582000000.0,False
4274,BAC-PL,2020,74208000000.0,-0.132902,25725000000.0,17894000000.0,0.241133,0.0,0.0,48483000000.0,...,0.0,True,74208000000.0,False,0.0,False,-0.132902,False,74208000000.0,False
4275,BAC-PL,2021,93707000000.0,0.262761,40946000000.0,31978000000.0,0.341255,0.0,0.0,52761000000.0,...,0.0,True,93707000000.0,False,0.0,False,0.262761,False,93707000000.0,False
4276,BAC-PL,2022,92407000000.0,-0.013873,38643000000.0,27528000000.0,0.2979,0.0,0.0,53764000000.0,...,0.0,True,92407000000.0,False,0.0,False,-0.013873,False,92407000000.0,False


In [74]:
# 1 out of 8093 rows in Partial Dataset does not match
partial_df[partial_df['costofrevenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match
350,BAC-PL,2022,92407000000.0,0.0,38643000000.0,18540000000.0,0.2979,0.0,0.0,-38643000000.0,...,0.0,True,92407000000.0,False,0.0,False,0.0,True,92407000000.0,False


#### Checking Operating Income (Gross Profit - Operating Expense)

In [76]:
def operatingincome_calculation(df):
    # Calculate expected operating income
    df['expected_operatingincome'] = df['Gross Profit'] - df['Operating Expenses']
    
    # Compare expected operating income with the actual operating income, considering small floating-point differences
    df['operatingincome_match'] = np.isclose(df['expected_operatingincome'], df['Operating Income'])

In [77]:
operatingincome_calculation(expanded_df)

In [78]:
operatingincome_calculation(partial_df)

In [79]:
# 99 out of 100213 rows in Expanded Dataset still do not match
expanded_df[expanded_df['operatingincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit,grossprofit_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match
2699,ARGX,2019,7.811609e+07,0.000000,-1.998759e+08,-1.824254e+08,-2.335311,-1.185681e+08,-1.517845,2.792189e+08,...,-1.185681e+08,True,7.811609e+07,True,0.000000,False,1.966842e+08,True,-3.977870e+08,False
2700,ARGX,2020,4.484817e+07,-0.425878,-5.143769e+08,-6.512349e+08,-14.520879,-2.590780e+08,-5.776780,5.623561e+08,...,-2.590780e+08,True,4.484817e+07,True,-0.425878,False,3.039262e+08,True,-8.214341e+08,False
2701,ARGX,2021,4.972770e+08,10.088010,-3.487460e+08,-4.082650e+08,-0.821001,-1.276787e+07,-0.025676,8.460230e+08,...,-1.276787e+07,True,4.972770e+08,True,10.088010,False,5.100449e+08,True,-8.587909e+08,False
2881,ASML,2021,1.861100e+10,0.331402,6.750100e+09,5.883200e+09,0.316114,9.809000e+09,0.527054,3.272600e+09,...,9.809000e+09,True,1.861100e+10,True,0.331402,False,8.802000e+09,True,6.536400e+09,False
3409,AVGOP,2020,2.388800e+10,0.000000,4.014000e+09,2.960000e+09,0.123912,1.351600e+10,0.565807,9.304000e+09,...,1.351600e+10,True,2.388800e+10,True,0.000000,False,1.037200e+10,True,4.212000e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85312,WFC-PL,2019,8.506300e+10,0.000000,2.596700e+10,1.971500e+10,0.231769,8.506300e+10,1.000000,5.826000e+10,...,8.506300e+10,True,8.506300e+10,True,0.000000,False,0.000000e+00,True,2.680300e+10,False
85313,WFC-PL,2020,7.234000e+10,-0.149571,4.041000e+09,1.786000e+09,0.024689,7.234000e+10,1.000000,5.766700e+10,...,7.234000e+10,True,7.234000e+10,True,-0.149571,False,0.000000e+00,True,1.467300e+10,False
85314,WFC-PL,2021,7.849200e+10,0.085043,2.885100e+10,2.210900e+10,0.281672,7.849200e+10,1.000000,5.379000e+10,...,7.849200e+10,True,7.849200e+10,True,0.085043,False,0.000000e+00,True,2.470200e+10,False
85315,WFC-PL,2022,7.378500e+10,-0.059968,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.727800e+10,...,7.378500e+10,True,7.378500e+10,True,-0.059968,False,0.000000e+00,True,1.650700e+10,False


In [13]:
# 23 out of 8093 rows in Partial Dataset still do not match
partial_df[partial_df['operatingincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_grossprofit,grossprofit_match,expected_operatingincome,operatingincome_match
276,AVGOP,2022,33203000000.0,0.0,14225000000.0,11495000000.0,0.346204,22095000000.0,0.665452,7813000000.0,...,12488000000.0,False,0.346204,True,0.665452,True,22095000000.0,True,14282000000.0,False
591,BRK-A,2023,364482000000.0,0.556352,123196000000.0,118193000000.0,0.263999,123196000000.0,0.338003,198575000000.0,...,118193000000.0,True,0.324277,False,0.338003,True,123196000000.0,True,-75379000000.0,False
1092,DJCO,2022,54009000.0,0.0,7440000.0,-75624000.0,-1.400211,11666000.0,0.216001,8432000.0,...,6331000.0,False,-1.400211,True,0.216001,True,11666000.0,True,3234000.0,False
1476,EQIX,2022,7263105000.0,0.0,1200528000.0,704345000.0,0.096976,3511604000.0,0.483485,2285261000.0,...,844191000.0,False,0.096976,True,0.483485,True,3511604000.0,True,1226343000.0,False
1625,FCNCA,2023,7604000000.0,0.674521,3331000000.0,11466000000.0,1.507891,4080000000.0,0.53656,4273000000.0,...,-348000000.0,False,1.507891,True,0.53656,True,4080000000.0,True,-193000000.0,False
1852,FTV-PA,2022,5825700000.0,0.0,987400000.0,755200000.0,0.129632,3363400000.0,0.577338,2358100000.0,...,889100000.0,False,0.129632,True,0.577338,True,3363400000.0,True,1005300000.0,False
2040,GHC,2022,3924493000.0,0.0,444716000.0,67079000.0,0.017092,1266775000.0,0.322787,3711605000.0,...,390313000.0,False,0.017092,True,0.322787,True,1266775000.0,True,-2444830000.0,False
2152,GS,2022,47365000000.0,0.0,34832000000.0,11261000000.0,0.237749,31191000000.0,0.658524,31164000000.0,...,13486000000.0,False,0.237749,True,0.658524,True,31191000000.0,True,27000000.0,False
2153,GS,2023,46254000000.0,-0.023456,26422000000.0,8516000000.0,0.184114,29702000000.0,0.64215,46254000000.0,...,-35742000000.0,False,0.184114,True,0.64215,True,29702000000.0,True,-16552000000.0,False
2467,HUBS,2023,2170230000.0,0.253766,-208056000.0,-176295000.0,-0.081233,1824741000.0,0.840805,1935954000.0,...,-211857000.0,False,-0.081233,True,0.840805,True,1824741000.0,True,-111213000.0,False


#### Checking Operating Expenses (Gross Profit - Operating Income)

In [80]:
def operatingexpenses_calculation(df):
    # Calculate expected operating expenses
    df['expected_operatingexpenses'] = df['Gross Profit'] - df['Operating Income']
    
    # Compare expected operating expenses with the actual operating expense, considering small floating-point differences
    df['operatingexpenses_match'] = np.isclose(df['expected_operatingexpenses'], df['Operating Expenses'])

In [81]:
operatingexpenses_calculation(expanded_df)

In [82]:
operatingexpenses_calculation(partial_df)

In [83]:
# 99 out of 100213 in Expanded Dataset match
expanded_df[expanded_df['operatingexpenses_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match
2699,ARGX,2019,7.811609e+07,0.000000,-1.998759e+08,-1.824254e+08,-2.335311,-1.185681e+08,-1.517845,2.792189e+08,...,7.811609e+07,True,0.000000,False,1.966842e+08,True,-3.977870e+08,False,8.130778e+07,False
2700,ARGX,2020,4.484817e+07,-0.425878,-5.143769e+08,-6.512349e+08,-14.520879,-2.590780e+08,-5.776780,5.623561e+08,...,4.484817e+07,True,-0.425878,False,3.039262e+08,True,-8.214341e+08,False,2.552988e+08,False
2701,ARGX,2021,4.972770e+08,10.088010,-3.487460e+08,-4.082650e+08,-0.821001,-1.276787e+07,-0.025676,8.460230e+08,...,4.972770e+08,True,10.088010,False,5.100449e+08,True,-8.587909e+08,False,3.359781e+08,False
2881,ASML,2021,1.861100e+10,0.331402,6.750100e+09,5.883200e+09,0.316114,9.809000e+09,0.527054,3.272600e+09,...,1.861100e+10,True,0.331402,False,8.802000e+09,True,6.536400e+09,False,3.058900e+09,False
3409,AVGOP,2020,2.388800e+10,0.000000,4.014000e+09,2.960000e+09,0.123912,1.351600e+10,0.565807,9.304000e+09,...,2.388800e+10,True,0.000000,False,1.037200e+10,True,4.212000e+09,False,9.502000e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85312,WFC-PL,2019,8.506300e+10,0.000000,2.596700e+10,1.971500e+10,0.231769,8.506300e+10,1.000000,5.826000e+10,...,8.506300e+10,True,0.000000,False,0.000000e+00,True,2.680300e+10,False,5.909600e+10,False
85313,WFC-PL,2020,7.234000e+10,-0.149571,4.041000e+09,1.786000e+09,0.024689,7.234000e+10,1.000000,5.766700e+10,...,7.234000e+10,True,-0.149571,False,0.000000e+00,True,1.467300e+10,False,6.829900e+10,False
85314,WFC-PL,2021,7.849200e+10,0.085043,2.885100e+10,2.210900e+10,0.281672,7.849200e+10,1.000000,5.379000e+10,...,7.849200e+10,True,0.085043,False,0.000000e+00,True,2.470200e+10,False,4.964100e+10,False
85315,WFC-PL,2022,7.378500e+10,-0.059968,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.727800e+10,...,7.378500e+10,True,-0.059968,False,0.000000e+00,True,1.650700e+10,False,5.881500e+10,False


In [84]:
# 23 out of 8093 rows in Partial Dataset still do not match
partial_df[partial_df['operatingexpenses_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match
276,AVGOP,2022,33203000000.0,0.0,14225000000.0,11495000000.0,0.346204,22095000000.0,0.665452,7813000000.0,...,33203000000.0,True,0.0,True,11108000000.0,True,14282000000.0,False,7870000000.0,False
591,BRK-A,2023,364482000000.0,0.556352,123196000000.0,118193000000.0,0.263999,123196000000.0,0.338003,198575000000.0,...,364482000000.0,True,0.556352,True,241286000000.0,True,-75379000000.0,False,0.0,False
1092,DJCO,2022,54009000.0,0.0,7440000.0,-75624000.0,-1.400211,11666000.0,0.216001,8432000.0,...,54009000.0,True,0.0,True,42343000.0,True,3234000.0,False,4226000.0,False
1476,EQIX,2022,7263105000.0,0.0,1200528000.0,704345000.0,0.096976,3511604000.0,0.483485,2285261000.0,...,7263105000.0,True,0.0,True,3751501000.0,True,1226343000.0,False,2311076000.0,False
1625,FCNCA,2023,7604000000.0,0.674521,3331000000.0,11466000000.0,1.507891,4080000000.0,0.53656,4273000000.0,...,7604000000.0,True,0.674521,True,3524000000.0,True,-193000000.0,False,749000000.0,False
1852,FTV-PA,2022,5825700000.0,0.0,987400000.0,755200000.0,0.129632,3363400000.0,0.577338,2358100000.0,...,5825700000.0,True,0.0,True,2462300000.0,True,1005300000.0,False,2376000000.0,False
2040,GHC,2022,3924493000.0,0.0,444716000.0,67079000.0,0.017092,1266775000.0,0.322787,3711605000.0,...,3924493000.0,True,0.0,True,2657718000.0,True,-2444830000.0,False,822059000.0,False
2152,GS,2022,47365000000.0,0.0,34832000000.0,11261000000.0,0.237749,31191000000.0,0.658524,31164000000.0,...,47365000000.0,True,0.0,True,16174000000.0,True,27000000.0,False,-3641000000.0,False
2153,GS,2023,46254000000.0,-0.023456,26422000000.0,8516000000.0,0.184114,29702000000.0,0.64215,46254000000.0,...,46254000000.0,True,-0.023456,True,16552000000.0,True,-16552000000.0,False,3280000000.0,False
2467,HUBS,2023,2170230000.0,0.253766,-208056000.0,-176295000.0,-0.081233,1824741000.0,0.840805,1935954000.0,...,2170230000.0,True,0.253766,True,345489000.0,True,-111213000.0,False,2032797000.0,False


#### Checking Depreciation & Amortization (EBITDA - Operating Income)

In [90]:
def depreciationamortization_calculation(df):
    # Calculate expected D&A
    df['expected_depreciationamortization'] = df['EBITDA'] - df['Operating Income']
    
    # Compare expected D&A with the actual D&A, considering small floating-point differences
    df['depreciationamortization_match'] = np.isclose(df['expected_depreciationamortization'], df['Depreciation & Amortization'])

In [86]:
depreciationamortization_calculation(expanded_df)

In [87]:
depreciationamortization_calculation(partial_df)

In [91]:
# 181 rows out of 100213 in Expanded Dataset still do not match
expanded_df[expanded_df['depreciationamortization_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
2699,ARGX,2019,7.811609e+07,0.000000,-1.998759e+08,-1.824254e+08,-2.335311,-1.185681e+08,-1.517845,2.792189e+08,...,0.000000,False,196684206.0,True,-3.977870e+08,False,8.130778e+07,False,2.218027e+07,False
2700,ARGX,2020,4.484817e+07,-0.425878,-5.143769e+08,-6.512349e+08,-14.520879,-2.590780e+08,-5.776780,5.623561e+08,...,-0.425878,False,303926196.0,True,-8.214341e+08,False,2.552988e+08,False,-8.729010e+07,False
2701,ARGX,2021,4.972770e+08,10.088010,-3.487460e+08,-4.082650e+08,-0.821001,-1.276787e+07,-0.025676,8.460230e+08,...,10.088010,False,510044872.0,True,-8.587909e+08,False,3.359781e+08,False,3.432493e+07,False
2702,ARGX,2022,4.107460e+08,-0.174010,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,...,-0.174010,False,29431000.0,True,-7.196630e+08,True,1.100978e+09,True,1.376537e+08,False
2703,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,...,1.985582,False,117835000.0,True,-4.250490e+08,True,1.533530e+09,True,2.090260e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85312,WFC-PL,2019,8.506300e+10,0.000000,2.596700e+10,1.971500e+10,0.231769,8.506300e+10,1.000000,5.826000e+10,...,0.000000,False,0.0,True,2.680300e+10,False,5.909600e+10,False,-2.596700e+10,False
85313,WFC-PL,2020,7.234000e+10,-0.149571,4.041000e+09,1.786000e+09,0.024689,7.234000e+10,1.000000,5.766700e+10,...,-0.149571,False,0.0,True,1.467300e+10,False,6.829900e+10,False,-4.041000e+09,False
85314,WFC-PL,2021,7.849200e+10,0.085043,2.885100e+10,2.210900e+10,0.281672,7.849200e+10,1.000000,5.379000e+10,...,0.085043,False,0.0,True,2.470200e+10,False,4.964100e+10,False,8.459000e+09,False
85315,WFC-PL,2022,7.378500e+10,-0.059968,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.727800e+10,...,-0.059968,False,0.0,True,1.650700e+10,False,5.881500e+10,False,7.308000e+09,False


In [92]:
# 79 rows out of 8093 rows in Partial Dataset still do not match
partial_df[partial_df['depreciationamortization_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
216,ARGX,2022,4.107460e+08,0.000000,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,...,0.000000,True,2.943100e+07,True,-7.196630e+08,True,1.100978e+09,True,1.376537e+08,False
217,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,...,1.985582,True,1.178350e+08,True,-4.250490e+08,True,1.533530e+09,True,2.090260e+08,False
232,ASML,2022,2.117340e+10,0.000000,7.321000e+09,5.624200e+09,0.265626,1.051270e+10,0.496505,3.191700e+09,...,0.000000,True,1.066070e+10,True,7.321000e+09,True,3.191700e+09,True,-3.234000e+08,False
276,AVGOP,2022,3.320300e+10,0.000000,1.422500e+10,1.149500e+10,0.346204,2.209500e+10,0.665452,7.813000e+09,...,0.000000,True,1.110800e+10,True,1.428200e+10,False,7.870000e+09,False,4.930000e+09,False
277,AVGOP,2023,3.581900e+10,0.078788,1.620700e+10,1.458500e+10,0.393143,2.469000e+10,0.689299,8.483000e+09,...,0.078788,True,1.112900e+10,True,1.620700e+10,True,8.483000e+09,True,4.347000e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6468,URI,2022,1.164200e+10,0.000000,3.232000e+09,2.787000e+09,0.180811,4.996000e+09,0.429136,1.764000e+09,...,0.000000,True,6.646000e+09,True,3.232000e+09,True,1.764000e+09,True,2.232000e+09,False
6469,URI,2023,1.433200e+10,0.231060,3.827000e+09,2.424000e+09,0.169132,5.813000e+09,0.405596,2.094000e+09,...,0.231060,True,8.519000e+09,True,3.719000e+09,False,1.986000e+09,False,2.800000e+09,False
6478,USB-PA,2023,2.801300e+10,0.000000,7.846000e+09,5.429000e+09,0.193803,2.801300e+10,1.000000,2.801300e+10,...,0.000000,True,0.000000e+00,True,0.000000e+00,False,2.016700e+10,False,9.000000e+06,False
6899,WFC-PL,2022,7.378500e+10,0.000000,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.881500e+10,...,0.000000,True,0.000000e+00,True,1.497000e+10,True,5.881500e+10,True,7.308000e+09,False


#### Dropping rows with 'False' in any of Match columns

In [97]:
# Expanded Dataset - All match columns except for expected revenue growth match
expanded_df_match_columns = [col for col in expanded_df.columns if 'match' in col and col != 'revenuegrowth_match']

# Filter rows where any of the match columns have False values - 213 rows
expanded_df_rows_with_false = expanded_df[~expanded_df[expanded_df_match_columns].all(axis=1)]

expanded_df_rows_with_false

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
2699,ARGX,2019,7.811609e+07,0.000000,-1.998759e+08,-1.824254e+08,-2.335311,-1.185681e+08,-1.517845,2.792189e+08,...,0.000000,False,196684206.0,True,-3.977870e+08,False,8.130778e+07,False,2.218027e+07,False
2700,ARGX,2020,4.484817e+07,-0.425878,-5.143769e+08,-6.512349e+08,-14.520879,-2.590780e+08,-5.776780,5.623561e+08,...,-0.425878,False,303926196.0,True,-8.214341e+08,False,2.552988e+08,False,-8.729010e+07,False
2701,ARGX,2021,4.972770e+08,10.088010,-3.487460e+08,-4.082650e+08,-0.821001,-1.276787e+07,-0.025676,8.460230e+08,...,10.088010,False,510044872.0,True,-8.587909e+08,False,3.359781e+08,False,3.432493e+07,False
2702,ARGX,2022,4.107460e+08,-0.174010,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,...,-0.174010,False,29431000.0,True,-7.196630e+08,True,1.100978e+09,True,1.376537e+08,False
2703,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,...,1.985582,False,117835000.0,True,-4.250490e+08,True,1.533530e+09,True,2.090260e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85312,WFC-PL,2019,8.506300e+10,0.000000,2.596700e+10,1.971500e+10,0.231769,8.506300e+10,1.000000,5.826000e+10,...,0.000000,False,0.0,True,2.680300e+10,False,5.909600e+10,False,-2.596700e+10,False
85313,WFC-PL,2020,7.234000e+10,-0.149571,4.041000e+09,1.786000e+09,0.024689,7.234000e+10,1.000000,5.766700e+10,...,-0.149571,False,0.0,True,1.467300e+10,False,6.829900e+10,False,-4.041000e+09,False
85314,WFC-PL,2021,7.849200e+10,0.085043,2.885100e+10,2.210900e+10,0.281672,7.849200e+10,1.000000,5.379000e+10,...,0.085043,False,0.0,True,2.470200e+10,False,4.964100e+10,False,8.459000e+09,False
85315,WFC-PL,2022,7.378500e+10,-0.059968,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.727800e+10,...,-0.059968,False,0.0,True,1.650700e+10,False,5.881500e+10,False,7.308000e+09,False


In [98]:
# Drop these rows
expanded_df = expanded_df.drop(index=expanded_df_rows_with_false.index)

In [99]:
expanded_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAFRW,2000,1.103142e+09,0.000000,2.918478e+08,2.915345e+08,0.264276,6.296658e+08,0.570793,3.378181e+08,...,0.000000,False,4.734762e+08,True,2.918478e+08,True,3.378181e+08,True,7821411.0,True
1,AAFRW,2001,3.984834e+07,-0.963877,4.527289e+05,-6.468705e+04,-0.001623,1.213086e+07,0.304426,1.167813e+07,...,-0.963877,False,2.771748e+07,True,4.527289e+05,True,1.167813e+07,True,529931.0,True
2,AAFRW,2002,9.519782e+08,22.890035,1.890946e+08,1.887653e+08,0.198287,3.978879e+08,0.417959,2.087934e+08,...,22.890035,False,5.540903e+08,True,1.890946e+08,True,2.087934e+08,True,4493034.0,True
3,AAFRW,2003,1.283757e+09,0.348515,4.323393e+08,4.314507e+08,0.336084,7.196193e+08,0.560557,2.872800e+08,...,0.348515,False,5.641375e+08,True,4.323393e+08,True,2.872800e+08,True,2228554.0,True
4,AAFRW,2004,1.503795e+08,-0.882860,1.974637e+07,1.931370e+07,0.128433,6.864248e+07,0.456462,4.889611e+07,...,-0.882860,False,8.173707e+07,True,1.974637e+07,True,4.889611e+07,True,5271247.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100208,ZZZBQ,2020,3.781246e+08,1.140794,7.029086e+07,6.974936e+07,0.184461,1.586118e+08,0.419470,8.832091e+07,...,1.140794,False,2.195128e+08,True,7.029086e+07,True,8.832091e+07,True,6679235.0,True
100209,ZZZBQ,2021,1.300227e+09,2.438621,1.191204e+08,1.181734e+08,0.090887,5.562011e+08,0.427772,4.370807e+08,...,2.438621,False,7.440260e+08,True,1.191204e+08,True,4.370807e+08,True,5861457.0,True
100210,ZZZBQ,2022,1.128191e+09,-0.132313,2.433491e+08,2.431163e+08,0.215492,6.427123e+08,0.569684,3.993631e+08,...,-0.132313,False,4.854785e+08,True,2.433491e+08,True,3.993631e+08,True,886906.0,True
100211,ZZZBQ,2023,8.445354e+08,-0.251425,3.927662e+07,3.905789e+07,0.046248,2.832449e+08,0.335386,2.439683e+08,...,-0.251425,False,5.612904e+08,True,3.927662e+07,True,2.439683e+08,True,3587258.0,True


In [100]:
# Partial Dataset - All match columns except for expected revenue growth match
partial_df_match_columns = [col for col in partial_df.columns if 'match' in col and col != 'revenuegrowth_match']

# Filter rows where any of the match columns have False values - 91 rows
partial_df_rows_with_false = partial_df[~partial_df[partial_df_match_columns].all(axis=1)]

partial_df_rows_with_false

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
216,ARGX,2022,4.107460e+08,0.000000,-7.196630e+08,-7.095940e+08,-1.727574,3.813150e+08,0.928347,1.100978e+09,...,0.000000,True,2.943100e+07,True,-7.196630e+08,True,1.100978e+09,True,1.376537e+08,False
217,ARGX,2023,1.226316e+09,1.985582,-4.250490e+08,-2.950530e+08,-0.240601,1.108481e+09,0.903911,1.533530e+09,...,1.985582,True,1.178350e+08,True,-4.250490e+08,True,1.533530e+09,True,2.090260e+08,False
232,ASML,2022,2.117340e+10,0.000000,7.321000e+09,5.624200e+09,0.265626,1.051270e+10,0.496505,3.191700e+09,...,0.000000,True,1.066070e+10,True,7.321000e+09,True,3.191700e+09,True,-3.234000e+08,False
233,ASML,2023,2.755850e+10,0.301562,9.042300e+09,7.839000e+09,0.284449,1.413610e+10,0.512949,5.093800e+09,...,0.301562,True,1.342240e+10,True,9.042300e+09,True,5.093800e+09,True,6.564000e+08,True
276,AVGOP,2022,3.320300e+10,0.000000,1.422500e+10,1.149500e+10,0.346204,2.209500e+10,0.665452,7.813000e+09,...,0.000000,True,1.110800e+10,True,1.428200e+10,False,7.870000e+09,False,4.930000e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6468,URI,2022,1.164200e+10,0.000000,3.232000e+09,2.787000e+09,0.180811,4.996000e+09,0.429136,1.764000e+09,...,0.000000,True,6.646000e+09,True,3.232000e+09,True,1.764000e+09,True,2.232000e+09,False
6469,URI,2023,1.433200e+10,0.231060,3.827000e+09,2.424000e+09,0.169132,5.813000e+09,0.405596,2.094000e+09,...,0.231060,True,8.519000e+09,True,3.719000e+09,False,1.986000e+09,False,2.800000e+09,False
6478,USB-PA,2023,2.801300e+10,0.000000,7.846000e+09,5.429000e+09,0.193803,2.801300e+10,1.000000,2.801300e+10,...,0.000000,True,0.000000e+00,True,0.000000e+00,False,2.016700e+10,False,9.000000e+06,False
6899,WFC-PL,2022,7.378500e+10,0.000000,1.497000e+10,1.367700e+10,0.185363,7.378500e+10,1.000000,5.881500e+10,...,0.000000,True,0.000000e+00,True,1.497000e+10,True,5.881500e+10,True,7.308000e+09,False


In [101]:
# Drop these rows
partial_df = partial_df.drop(index=partial_df_rows_with_false.index)

In [102]:
partial_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAFRW,2022,7.097898e+08,0.000000,4.695101e+07,4.609527e+07,0.064942,2.968603e+08,0.418237,2.499093e+08,...,0.000000,True,4.129296e+08,True,4.695101e+07,True,2.499093e+08,True,8154792.0,True
1,AAFRW,2023,2.947645e+07,-0.958472,8.918110e+06,8.538895e+06,0.289685,1.589946e+07,0.539395,6.981349e+06,...,-0.958472,True,1.357699e+07,True,8.918110e+06,True,6.981349e+06,True,9404991.0,True
2,AAHFBX,2022,1.148219e+09,0.000000,1.709324e+08,1.703448e+08,0.148356,5.159221e+08,0.449324,3.449897e+08,...,0.000000,True,6.322969e+08,True,1.709324e+08,True,3.449897e+08,True,4301712.0,True
3,AAHFBX,2023,1.223013e+09,0.065139,1.857744e+08,1.853291e+08,0.151535,5.378012e+08,0.439735,3.520268e+08,...,0.065139,True,6.852117e+08,True,1.857744e+08,True,3.520268e+08,True,8930222.0,True
4,AALB,2022,1.012524e+09,0.000000,3.690259e+08,3.680859e+08,0.363533,5.820688e+08,0.574869,2.130430e+08,...,0.000000,True,4.304557e+08,True,3.690259e+08,True,2.130430e+08,True,9738310.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8088,ZZWA,2023,1.292421e+09,-0.302745,1.379075e+08,1.374613e+08,0.106360,5.670301e+08,0.438735,4.291226e+08,...,-0.302745,True,7.253911e+08,True,1.379075e+08,True,4.291226e+08,True,2544710.0,True
8089,ZZZBKF,2022,2.321441e+08,0.000000,3.857396e+07,3.766445e+07,0.162246,1.222414e+08,0.526575,8.366739e+07,...,0.000000,True,1.099028e+08,True,3.857396e+07,True,8.366739e+07,True,5241496.0,True
8090,ZZZBKF,2023,3.261039e+08,0.404748,9.026569e+07,8.937578e+07,0.274072,1.924950e+08,0.590287,1.022293e+08,...,0.404748,True,1.336089e+08,True,9.026569e+07,True,1.022293e+08,True,2123737.0,True
8091,ZZZBQ,2022,1.128191e+09,0.000000,2.433491e+08,2.431163e+08,0.215492,6.427123e+08,0.569684,3.993631e+08,...,0.000000,True,4.854785e+08,True,2.433491e+08,True,3.993631e+08,True,886906.0,True


In [103]:
# expanded_df.to_csv('expanded_df_v4.csv', index=False)
# partial_df.to_csv('partial_df_v4.csv', index=False)
# historical_df.to_csv('historical_df_v3.csv', index=False)
# sparse_df.to_csv('sparse_df_v3.csv', index=False)

In [104]:
expanded_df.shape

(100000, 36)

In [105]:
partial_df.shape

(8002, 36)

In [106]:
historical_df.shape

(733, 7)

In [107]:
sparse_df.shape

(443, 3)

# 4. Merge all datasets into one

Now that the datasets have been cleaned, we merge all 4 datasets into one.

We use concat to join since the datasets have the same structure, and we only want to combine rows.

In [108]:
merged_df = pd.concat([expanded_df, partial_df], axis=0, ignore_index=True).drop_duplicates()

In [109]:
merged_df = pd.concat([merged_df, historical_df, sparse_df], axis=0, ignore_index=True).drop_duplicates()

In [110]:
merged_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAFRW,2000,1.103142e+09,0.000000,2.918478e+08,2.915345e+08,0.264276,6.296658e+08,0.570793,3.378181e+08,...,0.000000,False,4.734762e+08,True,2.918478e+08,True,3.378181e+08,True,7821411.0,True
1,AAFRW,2001,3.984834e+07,-0.963877,4.527289e+05,-6.468705e+04,-0.001623,1.213086e+07,0.304426,1.167813e+07,...,-0.963877,False,2.771748e+07,True,4.527289e+05,True,1.167813e+07,True,529931.0,True
2,AAFRW,2002,9.519782e+08,22.890035,1.890946e+08,1.887653e+08,0.198287,3.978879e+08,0.417959,2.087934e+08,...,22.890035,False,5.540903e+08,True,1.890946e+08,True,2.087934e+08,True,4493034.0,True
3,AAFRW,2003,1.283757e+09,0.348515,4.323393e+08,4.314507e+08,0.336084,7.196193e+08,0.560557,2.872800e+08,...,0.348515,False,5.641375e+08,True,4.323393e+08,True,2.872800e+08,True,2228554.0,True
4,AAFRW,2004,1.503795e+08,-0.882860,1.974637e+07,1.931370e+07,0.128433,6.864248e+07,0.456462,4.889611e+07,...,-0.882860,False,8.173707e+07,True,1.974637e+07,True,4.889611e+07,True,5271247.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109157,ZTMXAP,2001,1.360600e+09,,,,,,,,...,,,,,,,,,,
109158,ZTMXAP,2003,2.576235e+08,,,,,,,,...,,,,,,,,,,
109159,ZTMXAP,2005,4.837877e+08,,,,,,,,...,,,,,,,,,,
109160,ZTMXAP,2006,7.930096e+08,,,,,,,,...,,,,,,,,,,


#### Checking Merged Dataset

In [111]:
# Checking shape of merged dataset
merged_df.shape

(109162, 36)

In [112]:
# Ensure no more duplicate rows
duplicate_rows = merged_df[merged_df.duplicated()]
duplicate_rows

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match


There will be NaN values from merging other datasets. 

We need to ensure that there are no completely null rows.

In [113]:
merged_df = merged_df.dropna(how='all')

if merged_df.isnull().all(axis=1).any():
    print("There are rows where all values are null.")
else:
    print("No completely null rows.")

No completely null rows.


In [114]:
# Check data types
merged_df.dtypes

Stock                                 object
Year                                   int64
Revenue                              float64
Revenue Growth                       float64
Operating Income                     float64
Net Income                           float64
Net Income Ratio                     float64
Gross Profit                         float64
Gross Profit Ratio                   float64
Operating Expenses                   float64
Cost of Revenue                      float64
EBITDA                               float64
Interest Expense                     float64
Depreciation & Amortization          float64
expected_EBITDA                      float64
EBITDA_match                          object
expected_netincome                   float64
netincome_match                       object
expected_netincome_ratio             float64
netincomeratio_match                  object
expected_grossprofit_ratio           float64
grossprofitratio_match                object
expected_g

In [115]:
# Summary of merged dataset
merged_df.describe()

Unnamed: 0,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,...,expected_netincome,expected_netincome_ratio,expected_grossprofit_ratio,expected_grossprofit,expected_revenue,expected_revenuegrowth,expected_costofrevenue,expected_operatingincome,expected_operatingexpenses,expected_depreciationamortization
count,109162.0,109162.0,107986.0,107986.0,107986.0,107986.0,108643.0,107986.0,107986.0,108643.0,...,107986.0,107986.0,107986.0,107986.0,108643.0,107986.0,107986.0,107986.0,107986.0,107986.0
mean,2012.755327,1098778000.0,2.717415,165101900.0,164551800.0,0.148117,494668100.0,0.450134,329730100.0,604255200.0,...,164551800.0,0.148117,0.450134,494832000.0,1098923000.0,2.717415,604531700.0,165101900.0,329730100.0,5053486.0
std,7.455022,644446400.0,25.478039,164476900.0,164476500.0,0.105009,310480800.0,0.08667,206861100.0,371450000.0,...,164476500.0,0.105009,0.08667,310407000.0,644546200.0,25.478039,371470800.0,164476900.0,206861100.0,2856925.0
min,2000.0,1023372.0,-0.999527,-231717700.0,-232464700.0,-0.917878,339410.0,0.3,232301.2,478457.3,...,-232464700.0,-0.917878,0.3,339410.0,1023372.0,-0.999527,478457.3,-231717700.0,232301.2,100119.0
25%,2006.0,544870400.0,-0.462931,36945240.0,36396510.0,0.071335,236195800.0,0.3753,157093900.0,291879600.0,...,36396510.0,0.071335,0.3753,236363600.0,544875800.0,-0.462931,292176300.0,36945240.0,157093900.0,2590665.0
50%,2013.0,1090496000.0,0.0,122960500.0,122337500.0,0.148567,471730300.0,0.450051,314863100.0,585913200.0,...,122337500.0,0.148567,0.450051,472061200.0,1090677000.0,0.0,586210000.0,122960500.0,314863100.0,5050248.0
75%,2020.0,1636822000.0,0.865614,257047200.0,256473100.0,0.225439,716641100.0,0.525012,477891600.0,879632300.0,...,256473100.0,0.225439,0.525012,716751800.0,1636859000.0,0.865614,880103400.0,257047200.0,477891600.0,7526053.0
max,2024.0,2551488000.0,1635.022705,952060700.0,951088500.0,0.396542,1486397000.0,0.599999,981900400.0,1769863000.0,...,951088500.0,0.396542,0.599999,1486397000.0,2551488000.0,1635.022705,1769863000.0,952060700.0,981900400.0,9999999.0


**Year**: 

Min year is 2000, Max year is 2024. Median year is 2013. The mean year 2012 suggests a midpoint near the median, indicating that the years are evenly distributed and not heavily skewed.

**Revenue Growth**: 

Need to explore Max Revenue Growth (1635.02) as it might represent outliers.

**Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses**:

The negative min values may need to be investigated for accuracy or treated by replacing or refining.

**Cost of Revenue**: 

The large standard deviation (3.714500e+08) suggests outliers or extreme cases.

In [116]:
# 9468 rows still have rows with impossible negative values
negative_rows = merged_df[
    (merged_df[['Operating Income', 'Net Income', 'Net Income Ratio', 
                'Gross Profit', 'Gross Profit Ratio', 'Operating Expenses']] < 0).any(axis=1)
]

print(negative_rows)

        Stock  Year       Revenue  Revenue Growth  Operating Income  \
1       AAFRW  2001  3.984834e+07       -0.963877      4.527289e+05   
56       AALB  2006  3.147740e+08       -0.783685     -8.011229e+06   
64       AALB  2014  3.983703e+08       -0.622511     -6.636414e+06   
68       AALB  2018  6.185085e+07       -0.920547      5.859314e+04   
91      AANLW  2016  1.413008e+09        1.164323     -3.477968e+07   
...       ...   ...           ...             ...               ...   
107923  ZUTIJ  2023  3.088237e+08       -0.616497     -2.843915e+06   
107929   ZVID  2023  2.281706e+08       -0.874989     -1.362404e+07   
107963  ZYOTO  2023  1.993264e+08       -0.880139     -2.628906e+06   
107969   ZYWP  2023  6.725761e+08       -0.596408     -4.256370e+07   
107972  ZZHJF  2022  1.934900e+09        0.000000     -5.301251e+07   

          Net Income  Net Income Ratio  Gross Profit  Gross Profit Ratio  \
1      -6.468705e+04         -0.001623  1.213086e+07            0.30442

In [117]:
# Remove rows with impossible negative values
merged_df = merged_df[~(
    (merged_df[['Operating Income', 'Net Income', 'Net Income Ratio', 
                'Gross Profit', 'Gross Profit Ratio', 'Operating Expenses']] < 0).any(axis=1)
)]

In [118]:
merged_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAFRW,2000,1.103142e+09,0.000000,2.918478e+08,2.915345e+08,0.264276,6.296658e+08,0.570793,3.378181e+08,...,0.000000,False,4.734762e+08,True,2.918478e+08,True,3.378181e+08,True,7821411.0,True
2,AAFRW,2002,9.519782e+08,22.890035,1.890946e+08,1.887653e+08,0.198287,3.978879e+08,0.417959,2.087934e+08,...,22.890035,False,5.540903e+08,True,1.890946e+08,True,2.087934e+08,True,4493034.0,True
3,AAFRW,2003,1.283757e+09,0.348515,4.323393e+08,4.314507e+08,0.336084,7.196193e+08,0.560557,2.872800e+08,...,0.348515,False,5.641375e+08,True,4.323393e+08,True,2.872800e+08,True,2228554.0,True
4,AAFRW,2004,1.503795e+08,-0.882860,1.974637e+07,1.931370e+07,0.128433,6.864248e+07,0.456462,4.889611e+07,...,-0.882860,False,8.173707e+07,True,1.974637e+07,True,4.889611e+07,True,5271247.0,True
5,AAFRW,2005,2.193423e+09,13.585912,4.247539e+07,4.176383e+07,0.019040,7.088561e+08,0.323173,6.663807e+08,...,13.585912,False,1.484567e+09,True,4.247539e+07,True,6.663807e+08,True,9412834.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109157,ZTMXAP,2001,1.360600e+09,,,,,,,,...,,,,,,,,,,
109158,ZTMXAP,2003,2.576235e+08,,,,,,,,...,,,,,,,,,,
109159,ZTMXAP,2005,4.837877e+08,,,,,,,,...,,,,,,,,,,
109160,ZTMXAP,2006,7.930096e+08,,,,,,,,...,,,,,,,,,,


In [119]:
merged_df.describe()

Unnamed: 0,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,...,expected_netincome,expected_netincome_ratio,expected_grossprofit_ratio,expected_grossprofit,expected_revenue,expected_revenuegrowth,expected_costofrevenue,expected_operatingincome,expected_operatingexpenses,expected_depreciationamortization
count,99684.0,99684.0,98508.0,98508.0,98508.0,98508.0,99165.0,98508.0,98508.0,99165.0,...,98508.0,98508.0,98508.0,98508.0,99165.0,98508.0,98508.0,98508.0,98508.0,98508.0
mean,2012.74998,1102225000.0,2.723434,184364100.0,183814700.0,0.165839,508073400.0,0.461223,323978300.0,594329000.0,...,183814700.0,0.165839,0.461223,508342400.0,1102402000.0,2.723434,594565900.0,184364100.0,323978300.0,5055885.0
std,7.458296,642423900.0,25.186756,159044200.0,159043000.0,0.09163,314289800.0,0.08222,201836200.0,362230000.0,...,159043000.0,0.09163,0.08222,314225500.0,642522100.0,25.186756,362198600.0,159044200.0,201836200.0,2857988.0
min,2000.0,1249005.0,-0.999131,132618.9,441.5663,2e-06,503042.8,0.3,366723.2,548192.1,...,441.5663,2e-06,0.3,503042.8,1249005.0,-0.999131,548192.1,132618.9,366723.2,100119.0
25%,2006.0,550657700.0,-0.456824,56412820.0,55822500.0,0.092728,246239400.0,0.394586,155907700.0,289924500.0,...,55822500.0,0.092728,0.394586,246541600.0,550673700.0,-0.456824,290089500.0,56412820.0,155907700.0,2592128.0
50%,2013.0,1094675000.0,0.0,141992800.0,141429300.0,0.16175,486306200.0,0.463395,309817400.0,577883800.0,...,141429300.0,0.16175,0.463395,486933700.0,1094765000.0,0.0,578147900.0,141992800.0,309817400.0,5055082.0
75%,2020.0,1637873000.0,0.871727,273388600.0,272856300.0,0.233243,738290500.0,0.531653,468207200.0,863343900.0,...,272856300.0,0.233243,0.531653,738551100.0,1637902000.0,0.871727,863634300.0,273388600.0,468207200.0,7529402.0
max,2024.0,2551488000.0,1635.022705,952060700.0,951088500.0,0.396542,1486397000.0,0.599999,978814300.0,1769863000.0,...,951088500.0,0.396542,0.599999,1486397000.0,2551488000.0,1635.022705,1769863000.0,952060700.0,978814300.0,9999999.0


In [120]:
# merged_df.to_csv('merged_df_v2.csv', index=False)