In [1]:
import requests
import random
import numpy as np
import pandas as pd
import string #Required for generating random stock tickers

# 1. Retrieving Real Stock Data with API

To start off, we obtain the Top 200 stocks by market capitalization (named as price in the code) from US-listed exchanges. 

We also include a try-except block to ensure that any errors during requests are caught and reported.

In [2]:
stock_url = "https://financialmodelingprep.com/api/v3/stock/list?apikey=NSD2m35XyjrwOoYdtKbq1JPOHlABl8CW"

try:
    stock_response = requests.get(stock_url)
    stock_response.raise_for_status()
    stock_data = stock_response.json()

    # Check API response structure
    print("Sample API Response:")
    print(stock_data[:5])

    # Filter US-listed stocks only
    us_exchanges = ['NYSE', 'NASDAQ', 'AMEX']

    valid_stocks = [
        stock for stock in stock_data
        if stock.get('exchangeShortName') in us_exchanges  # US-listed exchanges
        and stock.get('type') == 'stock'  # Only include stocks
        and stock.get('price') is not None  # Ensure price is not None
    ]

    print(f"Total valid stocks: {len(valid_stocks)}")
    print("Sample valid stocks:", valid_stocks[:5])

    # Sort by price, handling None values safely
    sorted_stocks = sorted(valid_stocks, key=lambda x: x.get('price', 0), reverse=True)

    # Get the top 200 stocks
    top_200_stocks = sorted_stocks[:200]
    stock_symbols = [stock['symbol'] for stock in top_200_stocks]

    print("Top 200 Stocks by Price (Placeholder for Market Cap):")
    print(stock_symbols)

except Exception as e:
    print(f"Error: {e}")

Sample API Response:
[{'symbol': 'NPOF.ME', 'name': 'FIZIKA Scientific and Production Association Open Joint-Stock Company', 'price': 11850, 'exchange': 'Moscow Stock Exchange', 'exchangeShortName': 'MCX', 'type': 'stock'}, {'symbol': 'TERRAREAL.BO', 'name': 'Terraform Realstate Limited', 'price': 6.5, 'exchange': 'Bombay Stock Exchange', 'exchangeShortName': 'BSE', 'type': 'stock'}, {'symbol': 'PMGOLD.AX', 'name': 'Perth Mint Gold', 'price': 17.94, 'exchange': 'Australian Securities Exchange', 'exchangeShortName': 'ASX', 'type': 'etf'}, {'symbol': 'KZMS.ME', 'name': 'The Open Joint Stock Company Krasnokamsk Metal Mesh Works', 'price': 226, 'exchange': 'Moscow Stock Exchange', 'exchangeShortName': 'MCX', 'type': 'stock'}, {'symbol': 'BAHN-B.ST', 'name': 'Bahnhof AB (publ)', 'price': 53.4, 'exchange': 'Stockholm Stock Exchange', 'exchangeShortName': 'STO', 'type': 'stock'}]
Total valid stocks: 11062
Sample valid stocks: [{'symbol': 'NSC', 'name': 'Norfolk Southern Corporation', 'price':

For each of these Top 200 stocks, we will then retrieve its annual income statement data with an API request.

The API response is stored in a list of dictionary to extract relevant metrics like Year, Revenue and Net Income.

Hence, each record corresponds to a specific stock and year, with detailed financial metrics.

Again, error handling during API requests is included and reported.

In [3]:
data = []

# For each valid stock, fetch income data and store it
for stock in stock_symbols:
    try:
        # Build the API URL for income statement
        income_url = f"https://financialmodelingprep.com/api/v3/income-statement/{stock}?period=annual&apikey=NSD2m35XyjrwOoYdtKbq1JPOHlABl8CW"
        
        # Fetch data from the API
        response = requests.get(income_url, timeout=10)  # Added timeout
        
        # Check for a successful response
        if response.status_code != 200:
            print(f"Failed to fetch data for {stock}: HTTP {response.status_code}")
            continue
        
        # Parse JSON response
        income_response = response.json()

        # Check if the income response is a non-empty list
        if isinstance(income_response, list) and income_response:
            for report in income_response:  # Loop over each year
                # Extract the year safely
                year = report.get('date', '')
                year = year.split('-')[0] if '-' in year else year

                # Create a dictionary for the stock report
                row = {
                    'Stock': stock,
                    'Year': year,
                    'Revenue': report.get('revenue'),
                    'Revenue Growth': report.get('revenueGrowth'),
                    'Operating Income': report.get('operatingIncome'),
                    'Net Income': report.get('netIncome'),
                    'Net Income Ratio': report.get('netIncomeRatio'),
                    'Gross Profit': report.get('grossProfit'),
                    'Gross Profit Ratio': report.get('grossProfitRatio'),
                    'Operating Expenses': report.get('operatingExpenses'),
                    'Cost of Revenue': report.get('costOfRevenue'),
                    'EBITDA': report.get('ebitda'),
                    'Interest Expense': report.get('interestExpense'),
                    'Depreciation & Amortization': report.get('depreciationAndAmortization')
                }
                # Append the row to the data list
                data.append(row)
        else:
            print(f"No income data available for {stock}.")
    except Exception as e:
        print(f"Error fetching data for {stock}: {e}")

In [5]:
# Output the data to check
print(f"Fetched data for {len(data)} records.")

Fetched data for 999 records.


In [6]:
df = pd.DataFrame(data)

In [7]:
df.shape

(999, 14)

In [8]:
# Checking all stocks requested are inside df
df['Stock'].nunique()

200

In [10]:
# df.to_csv('api_stock.csv')

# 2. Obtaining Base, Partial, Historical & Sparse Datasets

### Base Dataset

We create a synthetic dataset of 100,000 rows containing simulated financial metrics for various stocks across year 2000 to 2024.

Using generate_random_tickers, we create unique stocks of random lengths and create stock-year combinations. We then simulate the other financial metrics proportionally based on the revenue. For example, cost of revenue is 40-70% of revenue.

We then verified the data integrity by checking no duplicate stock-year combination exists.

In [11]:
# Define parameters for simulation
n_desired_rows = 100000  # Target number of unique rows
years = list(range(2000, 2025))  # Expand the range of years (e.g., 2000-2024)
n_unique_stocks_needed = n_desired_rows // len(years) + 1  # Number of unique stocks needed

In [12]:
# Generate fake stock tickers
def generate_random_tickers(n, length_range=(4, 6)):
    tickers = set()  # Use a set to ensure uniqueness
    while len(tickers) < n:
        length = random.randint(*length_range)  # Random length between 4 and 6
        ticker = ''.join(random.choices(string.ascii_uppercase, k=length))
        tickers.add(ticker)
    return list(tickers)

# Define the number of unique stock tickers needed
n_unique_stocks_needed = n_desired_rows // len(years) + 1

# Generate random stock tickers
base_stocks = generate_random_tickers(n_unique_stocks_needed)

In [13]:
# Generate all possible stock-year combinations
stock_year_combinations = [(stock, year) for stock in base_stocks for year in years]

# Shuffle the combinations to randomize the order
np.random.seed(42)  # Ensure reproducibility
random.shuffle(stock_year_combinations)

# Select exactly 100,000 rows
stock_year_combinations = stock_year_combinations[:n_desired_rows]

In [14]:
# Initialize an empty list to store simulated data
simulated_data = []

# Simulate data for unique stock-year combinations
for i, (stock, year) in enumerate(stock_year_combinations):
    # Use previous year's revenue for this stock (if available)
    if i > 0 and simulated_data[-1]['Stock'] == stock:
        previous_revenue = simulated_data[-1]['Revenue']
    else:
        previous_revenue = np.random.randint(1e6, 2.1e9)  # Random initial revenue if no previous data

    # Calculate revenue based on growth
    revenue_growth = np.random.uniform(-0.1, 0.2)  # Random growth rate (-10% to 20%)
    revenue = previous_revenue * (1 + revenue_growth)
    
    # Simulate financial metrics
    cost_of_revenue = revenue * np.random.uniform(0.4, 0.7)
    gross_profit = revenue - cost_of_revenue
    operating_expenses = np.random.uniform(0.2, 0.4) * revenue
    operating_income = gross_profit - operating_expenses
    depreciation_and_amortization = np.random.randint(1e5, 1e7)
    ebitda = operating_income + depreciation_and_amortization
    interest_expense = np.random.randint(1e5, 1e6)
    net_income = operating_income - interest_expense
    net_income_ratio = net_income / revenue if revenue != 0 else 0
    gross_profit_ratio = gross_profit / revenue if revenue != 0 else 0

    # Append to simulated data
    simulated_data.append({
        'Stock': stock,
        'Year': year,
        'Revenue': revenue,
        'Revenue Growth': revenue_growth,
        'Operating Income': operating_income,
        'Net Income': net_income,
        'Net Income Ratio': net_income_ratio,
        'Gross Profit': gross_profit,
        'Gross Profit Ratio': gross_profit_ratio,
        'Operating Expenses': operating_expenses,
        'Cost of Revenue': cost_of_revenue,
        'EBITDA': ebitda,
        'Interest Expense': interest_expense,
        'Depreciation & Amortization': depreciation_and_amortization,
    })

In [15]:
# Convert to a DataFrame
simulated_df = pd.DataFrame(simulated_data)

# Verify duplicates
duplicates = simulated_df.duplicated(subset=['Stock', 'Year'], keep=False)
print(f"Number of duplicates: {duplicates.sum()}")

# Display the first rows
print(simulated_df.head())

Number of duplicates: 0
    Stock  Year       Revenue  Revenue Growth  Operating Income    Net Income  \
0    BPZC  2011  1.833317e+09        0.138963      3.465545e+08  3.459331e+08   
1    BTMV  2005  2.599285e+08        0.037775      7.052226e+07  7.035744e+07   
2  SSBABI  2004  2.254691e+09        0.116600      2.666813e+08  2.663455e+08   
3  KTCOVN  2015  5.011669e+08       -0.008727      7.827428e+07  7.767409e+07   
4   HBGGH  2012  4.910126e+08        0.019958      9.390576e+07  9.374003e+07   

   Net Income Ratio  Gross Profit  Gross Profit Ratio  Operating Expenses  \
0          0.188692  9.991022e+08            0.544970        6.525477e+08   
1          0.270680  1.299350e+08            0.499887        5.941274e+07   
2          0.118130  7.179707e+08            0.318434        4.512894e+08   
3          0.154986  2.218030e+08            0.442573        1.435287e+08   
4          0.190912  2.877335e+08            0.586000        1.938277e+08   

   Cost of Revenue        

Importantly, we concatenate the simulated_df with the initial df containing the real stocks to generate the base dataset.

In [16]:
expanded_df = pd.concat([df, simulated_df], ignore_index=True)

  expanded_df = pd.concat([df, simulated_df], ignore_index=True)


In [17]:
expanded_df.to_csv('expanded_dataset_v5.csv', index=False)

In [18]:
print(f"Expanded dataset has {len(expanded_df)} rows.")

Expanded dataset has 100999 rows.


### Partial Dataset

The partial dataset overlaps with the base dataset, but has missing values in some columns.

The year range is limited to 2022 to 2023.

In [19]:
# Create a deep copy of expanded_df, with filters by 2022, 2023
partial_df = expanded_df[expanded_df['Year'].isin([2022,2023])].copy()

#Introduce missing data in 'Revenue', 'Net Income' and 'Operating Expenses'
partial_df.loc[partial_df.sample(frac=0.2).index, 'Revenue'] = np.nan #Within revenue, introduce 20% of missing data
partial_df.loc[partial_df.sample(frac=0.1).index, 'Net Income'] = np.nan #Within net income, introduce 10% of missing data
partial_df.loc[partial_df.sample(frac=0.3).index, 'Operating Expenses'] = np.nan #Within operating expenses, introduce 30% of missing data

In [20]:
# partial_df.to_csv('partial_dataset_v5.csv', index=False)

In [21]:
partial_df.head()

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization
1017,TLSNGU,2023,1377147000.0,0.166164,322826300.0,322542300.0,0.23421,631195600.0,0.458336,308369300.0,745951900.0,328327500.0,284064.0,5501218.0
1078,ALAD,2022,,0.125184,-82214260.0,-82384900.0,-0.040235,732938900.0,0.35795,,1314665000.0,-78144260.0,170640.0,4070001.0
1088,ZTSXEF,2023,319470100.0,0.018995,32940550.0,32321220.0,0.101171,106201700.0,0.332431,,213268400.0,39944390.0,619332.0,7003836.0
1103,BSFLIH,2023,2243174000.0,0.18494,133695700.0,133117700.0,0.059343,857529900.0,0.382284,,1385644000.0,134973700.0,578007.0,1278049.0
1117,XOJED,2022,656593500.0,0.037804,-4209403.0,-5198303.0,-0.007917,228082600.0,0.347373,,428510900.0,4929404.0,988900.0,9138807.0


### Historical Dataset

The historical dataset introduces a larger range of years (but randomized), with mismatched data and duplicate records.

Stocks in this dataset are referenced from the stocks in the base dataset.

In [22]:
historical_data = []
historical_years = range(2010, 2021) # Year 2010 to 2021
historical_stocks = expanded_df['Stock']

In [23]:
# Function to simulate revenue, with missing and invalid values
def simulate_revenue():
    # 20% chance of missing revenue
    revenue = np.random.randint(1e6, 2.1e9) if np.random.rand() > 0.2 else None
    
    # 5% chance for invalid negative revenue
    if np.random.rand() < 0.05:
        revenue = np.random.randint(-1e9, -1e6)
    return revenue

In [24]:
# Function to simulate 'Cost of Revenue', allowing for missing or mismatched values
def simulate_cost_of_revenue(revenue):
    if revenue is None or np.random.rand() < 0.1:  # 10% chance of missing value
        return None
    return revenue * np.random.uniform(0.3, 0.8)

In [25]:
# Function to introduce random mismatches in stock names
def simulate_stock_name(stock):
    if np.random.rand() < 0.05:  # 5% chance for mismatch
        return stock + ' .end'
    return stock

In [26]:
# Function to simulate random years, including errors outside the range
def simulate_year():
    if np.random.rand() > 0.05:  # 95% chance of random error
        return np.random.randint(2130, 2200)
    return np.random.choice(historical_years)

In [27]:
# Function to calculate gross profit if valid inputs are provided
def calculate_gross_profit(revenue, cost_of_revenue):
    if revenue is not None and cost_of_revenue is not None:
        return revenue - cost_of_revenue
    return None

In [28]:
# Simulate 10,000 rows of historical data
for _ in range(10000):
    # Simulate each field using the functions
    stock = simulate_stock_name(np.random.choice(historical_stocks))
    year = simulate_year()
    revenue = simulate_revenue()
    cost_of_revenue = simulate_cost_of_revenue(revenue)
    gross_profit = calculate_gross_profit(revenue, cost_of_revenue)

    historical_data.append({
        'Stock': stock,
        'Year': year,
        'Revenue': revenue,
        'Cost of Revenue': cost_of_revenue,
        'Gross Profit': gross_profit
    })

In [29]:
historical_df = pd.DataFrame(historical_data)

In [30]:
# historical_df.to_csv('historical_dataset_v4.csv', index=False)

### Sparse Dataset

Finally, the sparse dataset only contains 100 random stocks, with year and revenue as their only metrics.

Like the Historical dataset, stocks are refrenced from those in the base dataset.

50% of the revenue column will be empty.

In [31]:
sparse_data = []
sparse_years = range(2000, 2009)
sparse_stocks = expanded_df['Stock']

In [32]:
for stock in np.random.choice(sparse_stocks, size=100, replace=False): #Select 100 random stocks
    for year in sparse_years:
        if np.random.rand() > 0.5: #For 50% of the year column, assign a random revenue. The other 50% will be uninitialized
            revenue = np.random.randint(1e6, 2.1e9)
            
            sparse_data.append({
                'Stock': stock,
                'Year': year,
                'Revenue': revenue
            })

In [33]:
sparse_df = pd.DataFrame(sparse_data)

In [34]:
# sparse_df.to_csv('sparse_dataset_v4.csv', index=False)

# 3. Data Preprocessing

### Check whether every stock-year pair is unique

In [35]:
def check_unique_stock_year_pairs(df, stock_col='Stock', year_col='Year'):
    # Create a combined column to check for duplicates
    unique_combinations = df[[stock_col, year_col]].drop_duplicates()

    # Compare the length of unique combinations with the original DataFrame
    if len(unique_combinations) == len(df):
        print("All stock-year pairs are unique.")
        return True
    else:
        print("There are duplicate stock-year pairs.")
        # Keep no duplicates to ensure only rows with unique Stock-Year pairs remain
        duplicates = df[df.duplicated(subset=[stock_col, year_col], keep=False)]
        print("Duplicate entries:")
        print(duplicates)
        print(duplicates.shape)
        return False

In [36]:
# 10 duplicate pairs
check_unique_stock_year_pairs(expanded_df)

There are duplicate stock-year pairs.
Duplicate entries:
    Stock  Year       Revenue  Revenue Growth  Operating Income   Net Income  \
305   TDY  2023  5.635500e+09             NaN      1.046800e+09  885700000.0   
306   TDY  2023  5.458600e+09             NaN      9.720000e+08  788600000.0   
420   DPZ  2023  4.479358e+09             NaN      8.195190e+08  519118000.0   
421   DPZ  2023  4.537158e+09             NaN      7.679250e+08  452263000.0   
560   KAI  2022  9.047390e+08             NaN      1.712820e+08  120928000.0   
561   KAI  2022  7.865790e+08             NaN      1.167100e+08   84043000.0   
670   SNA  2022  4.492800e+09             NaN      1.207200e+09  911700000.0   
671   SNA  2022  4.252000e+09             NaN      1.123500e+09  820500000.0   
765  CDNS  2022  3.561718e+09             NaN      1.073686e+09  848952000.0   
766  CDNS  2022  2.988244e+09             NaN      7.790890e+08  695955000.0   

     Net Income Ratio  Gross Profit  Gross Profit Ratio  Opera

False

In [37]:
check_unique_stock_year_pairs(partial_df)

All stock-year pairs are unique.


True

In [38]:
# 298 duplicate pairs
check_unique_stock_year_pairs(historical_df)

There are duplicate stock-year pairs.
Duplicate entries:
       Stock  Year       Revenue  Cost of Revenue  Gross Profit
21     QCIBQ  2173 -7.091537e+08    -2.220766e+08 -4.870771e+08
35    ZDQBNG  2198  1.810861e+09     1.054031e+09  7.568299e+08
90    UAHQYG  2161  1.119969e+09     5.023274e+08  6.176421e+08
107   IOPWJI  2185           NaN              NaN           NaN
112   MNQKWG  2174  2.173614e+07     9.765953e+06  1.197018e+07
...      ...   ...           ...              ...           ...
9826    GAUD  2195           NaN              NaN           NaN
9852    SAAZ  2193  1.757162e+09     1.375122e+09  3.820405e+08
9862    SSFY  2199  1.954715e+09     1.181685e+09  7.730297e+08
9881  FWNPUT  2194 -1.812004e+08    -7.539939e+07 -1.058010e+08
9980    PRCQ  2143  1.402420e+09     9.505610e+08  4.518590e+08

[298 rows x 5 columns]
(298, 5)


False

In [39]:
# 2 duplicate pairs
check_unique_stock_year_pairs(sparse_df)

There are duplicate stock-year pairs.
Duplicate entries:
     Stock  Year     Revenue
44   INPCQ  2008  1989476413
200  INPCQ  2008  1040112774
(2, 3)


False

### Dropping duplicate stock-year pairs

In [40]:
def drop_duplicate_stock_year_pairs(df, stock_col='Stock', year_col='Year'):
    print("DataFrame after dropping all duplicates:")
    return df[~df.duplicated(subset=['Stock', 'Year'], keep=False)]

In [41]:
expanded_df = drop_duplicate_stock_year_pairs(expanded_df)
print(expanded_df)

DataFrame after dropping all duplicates:
         Stock  Year       Revenue  Revenue Growth  Operating Income  \
0        BRK-A  2023  3.644820e+11             NaN      1.231960e+11   
1        BRK-A  2022  2.341900e+11             NaN     -2.698500e+10   
2        BRK-A  2021  3.546360e+11             NaN     -8.242500e+10   
3        BRK-A  2020  2.862560e+11             NaN     -1.135820e+11   
4        BRK-A  2019  3.272230e+11             NaN     -7.723400e+10   
...        ...   ...           ...             ...               ...   
100994  PTLEBY  2005  1.333766e+09       -0.082426      2.334087e+08   
100995  PGQDKT  2024  1.548388e+09        0.145031      2.106748e+08   
100996    BPCR  2020  4.912654e+08        0.178375      6.713231e+07   
100997  FOXTWC  2019  2.127840e+09        0.034832      1.880026e+08   
100998    VFOK  2021  7.101194e+08        0.051971      1.873849e+08   

          Net Income  Net Income Ratio  Gross Profit  Gross Profit Ratio  \
0       9.622300e+

In [42]:
historical_df = drop_duplicate_stock_year_pairs(historical_df)
print(historical_df)

DataFrame after dropping all duplicates:
       Stock  Year       Revenue  Cost of Revenue  Gross Profit
0     IVMAGX  2161           NaN              NaN           NaN
1       HIJQ  2149  1.839813e+08     1.345906e+08  4.939068e+07
2      WAKFT  2151           NaN              NaN           NaN
3       QHIN  2133  1.891520e+09     8.866202e+08  1.004900e+09
4     IABVRF  2148  1.850278e+09     8.869205e+08  9.633574e+08
...      ...   ...           ...              ...           ...
9995    OXVC  2154  8.033017e+08              NaN           NaN
9996    JZDY  2142  6.132366e+08              NaN           NaN
9997    PDCX  2135  1.120415e+09     7.403297e+08  3.800858e+08
9998    YPOT  2182  1.136122e+08     7.777145e+07  3.584073e+07
9999  KBNXVP  2160  1.025777e+09     7.303527e+08  2.954241e+08

[9702 rows x 5 columns]


In [43]:
sparse_df = drop_duplicate_stock_year_pairs(sparse_df)
print(sparse_df)

DataFrame after dropping all duplicates:
    Stock  Year     Revenue
0    OBWA  2000  1709725917
1    OBWA  2001   675481498
2    OBWA  2002   783243374
3    OBWA  2004  1976308486
4    OBWA  2005  1756793979
..    ...   ...         ...
475  RXDK  2003   718562045
476  RXDK  2004  1337162477
477  RXDK  2005    94388513
478  RXDK  2006   743663602
479  RXDK  2008    48647460

[478 rows x 3 columns]


### Check for duplicates

In [44]:
# Base dataset
print(f"Duplicate entries (Base Dataset): {expanded_df.duplicated().sum()}")
print(f"{round((expanded_df.duplicated().sum() / partial_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Partial dataset
print(f"Duplicate entries (Partial Dataset): {partial_df.duplicated().sum()}")
print(f"{round((partial_df.duplicated().sum() / partial_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Historical dataset
print(f"Duplicate entries (Historical Dataset): {historical_df.duplicated().sum()}")
print(f"{round((historical_df.duplicated().sum() / historical_df.shape[0]) * 100, 2)}% rows are duplicate.")

# Sparse dataset
print(f"Duplicate entries (Sparse Dataset): {sparse_df.duplicated().sum()}")
print(f"{round((sparse_df.duplicated().sum() / sparse_df.shape[0]) * 100, 2)}% rows are duplicate.")

Duplicate entries (Base Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Partial Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Historical Dataset): 0
0.0% rows are duplicate.
Duplicate entries (Sparse Dataset): 0
0.0% rows are duplicate.


### Check for inconsistencies

This function prints the number of unique values per column. From the unique values, we can also tell any inconsistencies such as impossible negatives in revenue that we need to manage.

In [45]:
# Loop through all columns and print unique values
def no_of_unique_values(df):
    for col in df.columns:
        print(f"Column Name (Base Dataset): {col}")
        print(f"Unique Values: {df[col].unique()}")
        print(f"Number of Unique Values: {df[col].nunique()}")
        print('-' * 60) # To demarcate columns

In [46]:
no_of_unique_values(expanded_df)

Column Name (Base Dataset): Stock
Unique Values: ['BRK-A' 'NVR' 'BKNG' ... 'GAMTND' 'JTLKYV' 'INPCQ']
Number of Unique Values: 4201
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: ['2023' '2022' '2021' '2020' '2019' '2024' '2018' '2017' '2016' 2011 2005
 2004 2015 2012 2003 2002 2019 2010 2008 2020 2017 2018 2007 2023 2014
 2021 2000 2006 2001 2009 2013 2024 2022 2016]
Number of Unique Values: 34
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [3.64482000e+11 2.34190000e+11 3.54636000e+11 ... 4.91265446e+08
 2.12784021e+09 7.10119385e+08]
Number of Unique Values: 100947
------------------------------------------------------------
Column Name (Base Dataset): Revenue Growth
Unique Values: [       nan 0.1389629  0.03777467 ... 0.17837523 0.03483163 0.05197081]
Number of Unique Values: 100000
------------------------------------------------------------
Column Name (B

In [47]:
no_of_unique_values(partial_df)

Column Name (Base Dataset): Stock
Unique Values: ['TLSNGU' 'ALAD' 'ZTSXEF' ... 'CSVQR' 'GNNM' 'BYKC']
Number of Unique Values: 4001
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2023 2022]
Number of Unique Values: 2
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [1.37714747e+09            nan 3.19470092e+08 ... 2.82095823e+08
 6.53232569e+08 1.66379394e+09]
Number of Unique Values: 6398
------------------------------------------------------------
Column Name (Base Dataset): Revenue Growth
Unique Values: [ 0.16616382  0.12518443  0.01899548 ...  0.18353498 -0.00522837
 -0.05541769]
Number of Unique Values: 7997
------------------------------------------------------------
Column Name (Base Dataset): Operating Income
Unique Values: [ 3.22826330e+08 -8.22142569e+07  3.29405494e+07 ...  8.11277342e+07
  2.77802526e+08  9.51786557e+07]
Number of Unique Values: 7997

Significant inconsistencies in the Historical dataset include:
1. Impossible negatives
2. Impossible years (e.g. 2177)
3. Stock names with '.end'

In [49]:
no_of_unique_values(historical_df)

Column Name (Base Dataset): Stock
Unique Values: ['IVMAGX' 'HIJQ' 'WAKFT' ... 'MAVKN .end' 'KXAKD' 'KPGH .end']
Number of Unique Values: 4181
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2161 2149 2151 2133 2148 2187 2011 2183 2155 2134 2140 2180 2172 2142
 2164 2136 2144 2163 2165 2170 2135 2182 2158 2153 2154 2012 2198 2190
 2141 2145 2159 2179 2177 2185 2196 2150 2162 2132 2143 2166 2169 2191
 2010 2194 2138 2013 2173 2199 2018 2193 2178 2167 2014 2188 2147 2192
 2137 2186 2168 2181 2197 2156 2152 2020 2176 2157 2195 2146 2131 2184
 2174 2171 2139 2017 2016 2130 2175 2160 2019 2015 2189]
Number of Unique Values: 81
------------------------------------------------------------
Column Name (Base Dataset): Revenue
Unique Values: [           nan 1.83981263e+08 1.89151975e+09 ... 1.12041550e+09
 1.13612172e+08 1.02577676e+09]
Number of Unique Values: 7886
------------------------------------------------------------
Column Na

In [50]:
no_of_unique_values(sparse_df)

Column Name (Base Dataset): Stock
Unique Values: ['OBWA' 'WQCQ' 'DWUEJW' 'KUPZ' 'XGSQ' 'KTZQO' 'IFUPUP' 'KZQQE' 'INPCQ'
 'UQWMUJ' 'WXINV' 'XCTO' 'WHMRTX' 'SQBVA' 'MPLIF' 'FHZHQ' 'UBFU' 'QOXAP'
 'ECOLG' 'BYGJ' 'WKJDZN' 'RBLMC' 'FZMC' 'XTHSP' 'EXPWC' 'AATAD' 'MJMFZ'
 'KNEKVS' 'QZKDW' 'VJPB' 'NGLV' 'GPHJ' 'BPCR' 'SYHQG' 'XNUMQ' 'PDHXAO'
 'HIMX' 'KEYA' 'KQTXYR' 'WHQKOA' 'GAMTND' 'ONYIVM' 'KELYBC' 'TRHRT'
 'UQGORK' 'IWZNGS' 'JZHV' 'AWKKL' 'UELONW' 'FFOS' 'EILE' 'BZLFY' 'MDXYBL'
 'BTMV' 'QOSVLN' 'CPQFGQ' 'XUHOLB' 'RGEBQC' 'AXDPOS' 'BLMPI' 'AFTEH'
 'MMPGMU' 'LII' 'WZON' 'RYEVSY' 'MMZWN' 'ADPO' 'TVGS' 'WHCV' 'EMHU'
 'ZKGDWZ' 'LQGH' 'QTVEER' 'MLOR' 'LEODET' 'SSFL' 'YRXYX' 'HBVHWO' 'ZWEBIC'
 'YGOLQ' 'IVPR' 'ILNAE' 'BVCW' 'FISUI' 'TWRC' 'XXTIXH' 'YIXPOS' 'AQCW'
 'GXOT' 'TNWI' 'HIJQ' 'YDEGUS' 'OGTRVU' 'HGBQ' 'MVAMHL' 'NAMUF' 'DTWOH'
 'KXEBOL' 'RXDK']
Number of Unique Values: 99
------------------------------------------------------------
Column Name (Base Dataset): Year
Unique Values: [2000 2001 2

We also check the dtypes and basic info of each column for any null columns.

In [51]:
print('Base Dataset')
print(expanded_df.dtypes)
print(expanded_df.info())
print('-' * 60)

print('Partial Dataset')
print(partial_df.dtypes)
print(partial_df.info())
print('-' * 60)

print('Historical Dataset')
print(historical_df.dtypes) 
print(historical_df.info())
print('-' * 60)

print('Sparse Dataset')
print(sparse_df.dtypes)
print(sparse_df.info())
print('-' * 60)

Base Dataset
Stock                           object
Year                            object
Revenue                        float64
Revenue Growth                 float64
Operating Income               float64
Net Income                     float64
Net Income Ratio               float64
Gross Profit                   float64
Gross Profit Ratio             float64
Operating Expenses             float64
Cost of Revenue                float64
EBITDA                         float64
Interest Expense               float64
Depreciation & Amortization    float64
dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 100989 entries, 0 to 100998
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Stock                        100989 non-null  object 
 1   Year                         100989 non-null  object 
 2   Revenue                      100989 non-null  float64
 3   Revenue Growth       

### Check and filter out impossible years

This function checks whether years in the 'year' column are greater than a maximum year.

The 'year_column' is a keyword argument that fits the 'year' column. 'max_year' is also a keyword argument that represents the max year.

In this case, we find out whether any column exceeds year 2024.

In [54]:
def check_years(df, year_column, max_year):
    # Ensure that 'Year' column is numeric
    df[year_column] = pd.to_numeric(df[year_column])
    
    # Flag rows where the year is out of range
    df['is_out_of_range'] = df[year_column] > max_year

    # out_of_range represents years that exceed max_year
    # valid_rows represents years before year max_year
    out_of_range = df[df['is_out_of_range']]
    valid_rows = df[~df['is_out_of_range']]
    
    return {
        "out_of_range": out_of_range,
        "valid_rows": valid_rows,
        "flagged_df": df
    }

In [55]:
# Base dataset has no impossible years
expanded_df_years = check_years(expanded_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(expanded_df_years["out_of_range"])

print("\nValid rows:")
print(expanded_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(expanded_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, Revenue Growth, Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses, Cost of Revenue, EBITDA, Interest Expense, Depreciation & Amortization, is_out_of_range]
Index: []

Valid rows:
         Stock  Year       Revenue  Revenue Growth  Operating Income  \
0        BRK-A  2023  3.644820e+11             NaN      1.231960e+11   
1        BRK-A  2022  2.341900e+11             NaN     -2.698500e+10   
2        BRK-A  2021  3.546360e+11             NaN     -8.242500e+10   
3        BRK-A  2020  2.862560e+11             NaN     -1.135820e+11   
4        BRK-A  2019  3.272230e+11             NaN     -7.723400e+10   
...        ...   ...           ...             ...               ...   
100994  PTLEBY  2005  1.333766e+09       -0.082426      2.334087e+08   
100995  PGQDKT  2024  1.548388e+09        0.145031      2.106748e+08   
100996    BPCR  2020  4.912654e+08        0.178375     

In [56]:
# Partial dataset has no impossible years
partial_df_years = check_years(partial_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(partial_df_years["out_of_range"])

print("\nValid rows:")
print(partial_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(partial_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, Revenue Growth, Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses, Cost of Revenue, EBITDA, Interest Expense, Depreciation & Amortization, is_out_of_range]
Index: []

Valid rows:
         Stock  Year       Revenue  Revenue Growth  Operating Income  \
1017    TLSNGU  2023  1.377147e+09        0.166164      3.228263e+08   
1078      ALAD  2022           NaN        0.125184     -8.221426e+07   
1088    ZTSXEF  2023  3.194701e+08        0.018995      3.294055e+07   
1103    BSFLIH  2023  2.243174e+09        0.184940      1.336957e+08   
1117     XOJED  2022  6.565935e+08        0.037804     -4.209403e+06   
...        ...   ...           ...             ...               ...   
100931    SBQP  2023           NaN        0.006929      3.567775e+08   
100942  AOBPTS  2023           NaN        0.190997      6.115073e+07   
100945    QCAD  2023  6.532326e+08        0.183535     

In [57]:
# 9178 rows in Historical dataset are out of range
historical_df_years = check_years(historical_df, year_column='Year', max_year=2024)
print("Out-of-range rows:")
print(historical_df_years["out_of_range"])

print("\nValid rows:")
print(historical_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(historical_df_years["flagged_df"])

Out-of-range rows:
       Stock  Year       Revenue  Cost of Revenue  Gross Profit  \
0     IVMAGX  2161           NaN              NaN           NaN   
1       HIJQ  2149  1.839813e+08     1.345906e+08  4.939068e+07   
2      WAKFT  2151           NaN              NaN           NaN   
3       QHIN  2133  1.891520e+09     8.866202e+08  1.004900e+09   
4     IABVRF  2148  1.850278e+09     8.869205e+08  9.633574e+08   
...      ...   ...           ...              ...           ...   
9995    OXVC  2154  8.033017e+08              NaN           NaN   
9996    JZDY  2142  6.132366e+08              NaN           NaN   
9997    PDCX  2135  1.120415e+09     7.403297e+08  3.800858e+08   
9998    YPOT  2182  1.136122e+08     7.777145e+07  3.584073e+07   
9999  KBNXVP  2160  1.025777e+09     7.303527e+08  2.954241e+08   

      is_out_of_range  
0                True  
1                True  
2                True  
3                True  
4                True  
...               ...  
9995    

We filter out rows that are out of range, and keep only rows within the valid range of until 2024.

Unique values in the 'Year' column are checked again.

In [58]:
# Keep only rows that have no impossible years
historical_df = historical_df.loc[historical_df_years['valid_rows'].index]

In [59]:
historical_df['Year'].unique()

array([2011, 2012, 2010, 2013, 2018, 2014, 2020, 2017, 2016, 2019, 2015],
      dtype=int64)

In [60]:
# Sparse dataset has no impossible years
sparse_df_years = check_years(sparse_df, year_column='Year', max_year=2099)
print("Out-of-range rows:")
print(sparse_df_years["out_of_range"])

print("\nValid rows:")
print(sparse_df_years["valid_rows"])

print("\nDataframe with flagged column:")
print(sparse_df_years["flagged_df"])

Out-of-range rows:
Empty DataFrame
Columns: [Stock, Year, Revenue, is_out_of_range]
Index: []

Valid rows:
    Stock  Year     Revenue  is_out_of_range
0    OBWA  2000  1709725917            False
1    OBWA  2001   675481498            False
2    OBWA  2002   783243374            False
3    OBWA  2004  1976308486            False
4    OBWA  2005  1756793979            False
..    ...   ...         ...              ...
475  RXDK  2003   718562045            False
476  RXDK  2004  1337162477            False
477  RXDK  2005    94388513            False
478  RXDK  2006   743663602            False
479  RXDK  2008    48647460            False

[478 rows x 4 columns]

Dataframe with flagged column:
    Stock  Year     Revenue  is_out_of_range
0    OBWA  2000  1709725917            False
1    OBWA  2001   675481498            False
2    OBWA  2002   783243374            False
3    OBWA  2004  1976308486            False
4    OBWA  2005  1756793979            False
..    ...   ...         ...

### Rename values in 'Stock' column that end with ' .end' in Historical Dataset

We replace both '.end' and ' .end' at the end of the string using regex.

After that, we print out the stock names to see whether they are all named correctly.

In [61]:
historical_df['Stock'] = historical_df['Stock'].str.replace(r'\s?\.end$', '', regex=True)

In [62]:
historical_df['Stock'].unique()

array(['SQVE', 'JMBXBF', 'AOSYFW', 'HQIMS', 'LPFFOX', 'HQCU', 'YLVJB',
       'SYKZY', 'SCAXI', 'UOSVNX', 'VCXQI', 'SLJLVN', 'ERYZFK', 'PFNWH',
       'CLAIHO', 'OTUJVZ', 'BTMV', 'PEQY', 'AMEID', 'QCIBQ', 'PZVSHX',
       'CPCK', 'FFWMSO', 'NIUK', 'GALM', 'GRXYDL', 'SHET', 'SFIV', 'TTTB',
       'QATG', 'ERZIRJ', 'NDBSR', 'MDTE', 'FSRMGP', 'YQZUN', 'ORAH',
       'TUTT', 'ZNYWF', 'XEUZ', 'UHDNAN', 'RKMTI', 'DHVAK', 'VCPQBY',
       'HKEPUZ', 'INRMC', 'AMPO', 'XLZUD', 'CEBIIR', 'VJAF', 'UANYAV',
       'FYDJQ', 'QCBIW', 'EDIZQ', 'IPUWKK', 'KGEJ', 'JBTS', 'EQREQ',
       'RLRY', 'NUVPP', 'ZACVOO', 'CBMGBV', 'XDGOZD', 'OFVZT', 'AAZLH',
       'UZLPI', 'FVMUD', 'IORX', 'XCJMZ', 'AZTQHC', 'XFHP', 'FZMC',
       'TMHMYR', 'OXDRI', 'PTULN', 'EXLT', 'BYFR', 'KHEBQ', 'VLZBKQ',
       'YRMTOC', 'LUTNZ', 'KEYA', 'EZAIGH', 'RHGYB', 'UQGORK', 'KKAEMI',
       'DIJA', 'RSKYN', 'JICHI', 'UQWMUJ', 'AXKU', 'CLDTV', 'LFCFVC',
       'PNIJNW', 'PPZO', 'YCPX', 'RFYDEH', 'YTXINO', 'GIADHZ', 'MAJKA',
      

### Check for impossible negative values

We know that the following columns cannot have negative values:
**Revenue, Operating Expenses, Cost of Revenue, Interest Expense, Depreciation & Amortization**

We iterate this function over each column to check for any impossible negative values.

We then create a valid_columns list to only keep rows with no impossible negative values.

In [63]:
def drop_rows_with_negatives(df):
    columns_to_check = ['Revenue', 'Operating Expenses', 'Cost of Revenue', 'Interest Expense', 'Depreciation & Amortization']

    # For each column within columns that are not supposed to have negative values
    for column in columns_to_check:
        if column in df.columns: # Check first if the column exist in the datasset
            # Check for negative values in the column
            has_negative = df[column].lt(0).any()
            if has_negative:
                print(f"Column '{column}' contains negative values.")
            else:
                print(f"Column '{column} does not contain any negative values.'")
        else: # If the column is not present in the dataset, we skip it
            print(f"Column '{column}' is not present in the DataFrame. Skipping.")
            
    # Only keep rows with no impossible negative values
    valid_columns = [col for col in columns_to_check if col in df.columns]
    rows_with_negatives = df[df[valid_columns].lt(0).any(axis=1)]
    return df.drop(rows_with_negatives.index)

In [64]:
expanded_df = drop_rows_with_negatives(expanded_df)

Column 'Revenue' contains negative values.
Column 'Operating Expenses' contains negative values.
Column 'Cost of Revenue' contains negative values.
Column 'Interest Expense' contains negative values.
Column 'Depreciation & Amortization' contains negative values.


In [65]:
partial_df = drop_rows_with_negatives(partial_df)

Column 'Revenue does not contain any negative values.'
Column 'Operating Expenses does not contain any negative values.'
Column 'Cost of Revenue does not contain any negative values.'
Column 'Interest Expense does not contain any negative values.'
Column 'Depreciation & Amortization does not contain any negative values.'


In [66]:
historical_df = drop_rows_with_negatives(historical_df)

Column 'Revenue' contains negative values.
Column 'Operating Expenses' is not present in the DataFrame. Skipping.
Column 'Cost of Revenue' contains negative values.
Column 'Interest Expense' is not present in the DataFrame. Skipping.
Column 'Depreciation & Amortization' is not present in the DataFrame. Skipping.


In [67]:
sparse_df = drop_rows_with_negatives(sparse_df)

Column 'Revenue does not contain any negative values.'
Column 'Operating Expenses' is not present in the DataFrame. Skipping.
Column 'Cost of Revenue' is not present in the DataFrame. Skipping.
Column 'Interest Expense' is not present in the DataFrame. Skipping.
Column 'Depreciation & Amortization' is not present in the DataFrame. Skipping.


### Dealing with missing values

In [68]:
# Calculate the percentage of missing values
def missing_values_check(df):
    missing_values = df.isnull().sum()
    percent = (missing_values * 100) / len(df)
    
    # Create a DataFrame to display missing values information
    missing_df = pd.DataFrame({
        'Column': df.columns,
        'No. of Missing Values': missing_values.values,
        'Percentage': percent.values
    }).sort_values(by='Percentage', ascending=False)

    return missing_df

In [69]:
print(missing_values_check(expanded_df))
print('-'*80)
print(missing_values_check(partial_df))
print('-'*80)
print(missing_values_check(historical_df))
print('-'*80)
print(missing_values_check(sparse_df))

                         Column  No. of Missing Values  Percentage
3                Revenue Growth                    905    0.896883
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
2                       Revenue                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

The number of missing values in the Partial and Historical dataset hold a significant percentage of the datasets, which affects analysis later on.

We will try to fill in the missing values as best as we can, based on previous year revenues or calculations between columns.

### Managing Missing Values

### Expanded Dataset: Revenue Growth

In [70]:
# For each stock, fill in revenue growth NaN based on revenue in the previous year
# Ensure 'Year' column is in integer format
expanded_df['Year'] = expanded_df['Year'].astype(int)

# Sort by stock and year
expanded_df = expanded_df.sort_values(by=['Stock', 'Year'])

# Calculate revenue growth where it is NaN
expanded_df['Revenue Growth'] = expanded_df.groupby('Stock')['Revenue'].apply(
    lambda x: x.pct_change(fill_method=None).fillna(0)
).reset_index(drop=True)

In [71]:
# Confirm that the only NaN left are stocks with no prior revenues
expanded_df_NaN = expanded_df[expanded_df['Revenue Growth'].isna()]

In [72]:
expanded_df_NaN

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,is_out_of_range
100942,AOBPTS,2023,6.097996e+08,,6.115073e+07,6.073633e+07,0.099600,2.548950e+08,0.417998,1.937443e+08,3.549046e+08,6.768917e+07,414404.0,6538445.0,False
100987,BDRE,2003,6.815540e+08,,9.123404e+07,9.055099e+07,0.132860,2.525246e+08,0.370513,1.612906e+08,4.290293e+08,9.950987e+07,683050.0,8275823.0,False
100976,BIPOE,2024,2.108123e+09,,5.114961e+08,5.111623e+08,0.242473,1.000883e+09,0.474774,4.893866e+08,1.107240e+09,5.187035e+08,333708.0,7207447.0,False
100985,BNDGN,2024,1.789047e+09,,1.230134e+08,1.228087e+08,0.068645,6.007443e+08,0.335790,4.777309e+08,1.188303e+09,1.263938e+08,204687.0,3380443.0,False
100996,BPCR,2020,4.912654e+08,,6.713231e+07,6.666666e+07,0.135704,2.549765e+08,0.519020,1.878442e+08,2.362889e+08,6.861804e+07,465648.0,1485732.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100975,YRZOUH,2010,2.193145e+09,,6.433787e+08,6.427152e+08,0.293056,1.254352e+09,0.571942,6.109732e+08,9.387930e+08,6.486510e+08,663444.0,5272380.0,False
100948,ZDCJZM,2010,1.406831e+09,,8.421765e+07,8.370352e+07,0.059498,6.295050e+08,0.447463,5.452873e+08,7.773265e+08,8.884737e+07,514136.0,4629713.0,False
100912,ZRKOOR,2002,1.161208e+09,,9.529671e+07,9.461051e+07,0.081476,5.589661e+08,0.481366,4.636693e+08,6.022419e+08,1.045150e+08,686195.0,9218252.0,False
100920,ZRKOOR,2023,1.037629e+09,,1.823199e+08,1.819399e+08,0.175342,4.993368e+08,0.481229,3.170169e+08,5.382918e+08,1.918209e+08,379945.0,9501007.0,False


### Partial Dataset: Operating Expenses, Revenue, Net Income, Revenue Growth

In [73]:
partial_df.columns

Index(['Stock', 'Year', 'Revenue', 'Revenue Growth', 'Operating Income',
       'Net Income', 'Net Income Ratio', 'Gross Profit', 'Gross Profit Ratio',
       'Operating Expenses', 'Cost of Revenue', 'EBITDA', 'Interest Expense',
       'Depreciation & Amortization', 'is_out_of_range'],
      dtype='object')

In [74]:
partial_df[['Operating Expenses', 'Interest Expense', 'Net Income', 'Revenue Growth']]

Unnamed: 0,Operating Expenses,Interest Expense,Net Income,Revenue Growth
1017,3.083693e+08,284064.0,3.225423e+08,0.166164
1078,,170640.0,-8.238490e+07,0.125184
1088,,619332.0,3.232122e+07,0.018995
1103,,578007.0,1.331177e+08,0.184940
1117,,988900.0,-5.198303e+06,0.037804
...,...,...,...,...
100931,,482545.0,3.562950e+08,0.006929
100942,1.937443e+08,414404.0,6.073633e+07,0.190997
100945,1.869542e+08,768101.0,,0.183535
100961,3.630307e+08,195397.0,2.776071e+08,-0.005228


#### Operating Expenses = Gross Profit - Operating Income

In [75]:
# Fill NaN in 'Operating Expenses' only if both 'Gross Profit' and 'Operating Income' are available
partial_df['Operating Expenses'] = partial_df.apply(
    lambda row: row['Gross Profit'] - row['Operating Income']
    if pd.isna(row['Operating Expenses']) and pd.notna(row['Gross Profit']) and pd.notna(row['Operating Income'])
    else row['Operating Expenses'], axis=1
)

In [76]:
partial_df[['Operating Expenses', 'Gross Profit', 'Operating Income']]

Unnamed: 0,Operating Expenses,Gross Profit,Operating Income
1017,3.083693e+08,6.311956e+08,3.228263e+08
1078,8.151531e+08,7.329389e+08,-8.221426e+07
1088,7.326119e+07,1.062017e+08,3.294055e+07
1103,7.238342e+08,8.575299e+08,1.336957e+08
1117,2.322920e+08,2.280826e+08,-4.209403e+06
...,...,...,...
100931,4.963250e+08,8.531026e+08,3.567775e+08
100942,1.937443e+08,2.548950e+08,6.115073e+07
100945,1.869542e+08,2.680819e+08,8.112773e+07
100961,3.630307e+08,6.408332e+08,2.778025e+08


In [77]:
# Check if Operating Expenses still has NaN
if partial_df['Operating Expenses'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Operating Expenses'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [78]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
2                       Revenue                   1599   19.994998
5                    Net Income                    800   10.003751
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
3                Revenue Growth                      0    0.000000
4              Operating Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Net Income = Operating Income - Interest Expense

In [79]:
# Fill NaN in 'Net Income' only when Operating Income, Interest Expense are available
partial_df['Net Income'] = partial_df.apply(
    lambda row: row['Operating Income'] - row['Interest Expense']
    if pd.isna(row['Net Income']) and pd.notna(row['Operating Income']) and pd.notna(row['Interest Expense'])
    else row['Net Income'],
    axis=1
)

In [80]:
partial_df[['Net Income', 'Operating Income', 'Interest Expense']]

Unnamed: 0,Net Income,Operating Income,Interest Expense
1017,3.225423e+08,3.228263e+08,284064.0
1078,-8.238490e+07,-8.221426e+07,170640.0
1088,3.232122e+07,3.294055e+07,619332.0
1103,1.331177e+08,1.336957e+08,578007.0
1117,-5.198303e+06,-4.209403e+06,988900.0
...,...,...,...
100931,3.562950e+08,3.567775e+08,482545.0
100942,6.073633e+07,6.115073e+07,414404.0
100945,8.035963e+07,8.112773e+07,768101.0
100961,2.776071e+08,2.778025e+08,195397.0


In [81]:
# Check if Net Income still has NaN
if partial_df['Net Income'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Net Income'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [82]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
2                       Revenue                   1599   19.994998
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
3                Revenue Growth                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Revenue = Gross Profit + Cost of Revenue

In [83]:
# Fill NaN in 'Revenue' only when Gross Profit and Cost of Revenue are available
partial_df['Revenue'] = partial_df.apply(
    lambda row: row['Gross Profit'] + row['Cost of Revenue']
    if pd.isna(row['Revenue']) and pd.notna(row['Gross Profit']) and pd.notna(row['Cost of Revenue'])
    else row['Revenue'],
    axis=1
)

In [84]:
partial_df[['Revenue', 'Gross Profit', 'Cost of Revenue']]

Unnamed: 0,Revenue,Gross Profit,Cost of Revenue
1017,1.377147e+09,6.311956e+08,7.459519e+08
1078,2.047604e+09,7.329389e+08,1.314665e+09
1088,3.194701e+08,1.062017e+08,2.132684e+08
1103,2.243174e+09,8.575299e+08,1.385644e+09
1117,6.565935e+08,2.280826e+08,4.285109e+08
...,...,...,...
100931,1.540178e+09,8.531026e+08,6.870754e+08
100942,6.097996e+08,2.548950e+08,3.549046e+08
100945,6.532326e+08,2.680819e+08,3.851506e+08
100961,1.663794e+09,6.408332e+08,1.022961e+09


In [85]:
# Check if Revenue still has NaN values
if partial_df['Revenue'].isna().any():
    print("Yes, there are still NaN values.")
    print(partial_df[partial_df['Revenue'].isna()])
else:
    print("No more NaN values")

No more NaN values


In [64]:
print(missing_values_check(partial_df))

                         Column  No. of Missing Values  Percentage
3                Revenue Growth                     91    1.124429
0                         Stock                      0    0.000000
1                          Year                      0    0.000000
2                       Revenue                      0    0.000000
4              Operating Income                      0    0.000000
5                    Net Income                      0    0.000000
6              Net Income Ratio                      0    0.000000
7                  Gross Profit                      0    0.000000
8            Gross Profit Ratio                      0    0.000000
9            Operating Expenses                      0    0.000000
10              Cost of Revenue                      0    0.000000
11                       EBITDA                      0    0.000000
12             Interest Expense                      0    0.000000
13  Depreciation & Amortization                      0    0.00

#### Revenue Growth = % Change in Revenue from Previous Year

In [86]:
# For each stock, fill in revenue growth NaN based on revenue in the previous year
# Ensure 'Year' column is in integer format
partial_df['Year'] = partial_df['Year'].astype(int)

# Sort by stock and year
partial_df = partial_df.sort_values(by=['Stock', 'Year'])

# Calculate revenue growth where it is NaN
partial_df['Revenue Growth'] = partial_df.groupby('Stock')['Revenue'].apply(
    lambda x: x.pct_change().fillna(0)
).reset_index(level=0, drop=True)  # Reset the group index, drop the group level

In [87]:
partial_df[['Stock', 'Year', 'Revenue', 'Revenue Growth']]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth
60034,AAAR,2022,1.966766e+09,0.000000
52351,AAAR,2023,1.511845e+09,-0.231304
83513,AAGYLB,2022,2.604578e+07,0.000000
88180,AAGYLB,2023,1.729867e+09,65.416389
17404,AAHKPV,2022,6.635320e+08,0.000000
...,...,...,...,...
52984,ZZNTAM,2023,4.088261e+08,0.062723
76589,ZZYCDC,2022,9.501569e+08,0.000000
92549,ZZYCDC,2023,2.493206e+09,1.623994
70614,ZZZC,2022,2.231377e+07,0.000000


### Historical Dataset: Gross Profit, Cost of Revenue, Revenue

In [88]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    129   26.166329
4     Gross Profit                    129   26.166329
2          Revenue                     86   17.444219
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


#### Gross Profit = Revenue - Cost of Revenue

In [90]:
# Fill NaN in 'Gross Profit' only when Revenue and Cost of Revenue are available
historical_df['Gross Profit'] = historical_df.apply(
    lambda row: row['Revenue'] - row['Cost of Revenue']
    if pd.isna(row['Gross Profit']) and pd.notna(row['Revenue']) and pd.notna(row['Cost of Revenue'])
    else row['Gross Profit'],
    axis=1
)

In [91]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    129   26.166329
4     Gross Profit                    129   26.166329
2          Revenue                     86   17.444219
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


#### Cost of Revenue = Revenue - Gross Profit

In [92]:
# Fill NaN in 'Cost of Revenue' only when Revenue and Gross Profit are available
historical_df['Cost of Revenue'] = historical_df.apply(
    lambda row: row['Revenue'] - row['Gross Profit']
    if pd.isna(row['Cost of Revenue']) and pd.notna(row['Revenue']) and pd.notna(row['Gross Profit'])
    else row['Cost of Revenue'],
    axis=1
)

In [93]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    129   26.166329
4     Gross Profit                    129   26.166329
2          Revenue                     86   17.444219
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


#### Revenue = Cost of Revenue + Gross Profit

In [94]:
# Fill NaN in 'Revenue' only when Cost of Revenue and Gross Profit are available
historical_df['Revenue'] = historical_df.apply(
    lambda row: row['Cost of Revenue'] + row['Gross Profit']
    if pd.isna(row['Revenue']) and pd.notna(row['Cost of Revenue']) and pd.notna(row['Gross Profit'])
    else row['Revenue'],
    axis=1
)

In [95]:
print(missing_values_check(historical_df))

            Column  No. of Missing Values  Percentage
3  Cost of Revenue                    129   26.166329
4     Gross Profit                    129   26.166329
2          Revenue                     86   17.444219
0            Stock                      0    0.000000
1             Year                      0    0.000000
5  is_out_of_range                      0    0.000000


In [96]:
# Check the shape of Historical Dataset
historical_df.shape

(493, 6)

In [97]:
# Remove rows with no Revenue
historical_df = historical_df.drop(historical_df[historical_df['Revenue'].isna()].index)

#### Drop helper columns

In [98]:
expanded_df = expanded_df.drop(columns=['is_out_of_range'])

In [99]:
partial_df = partial_df.drop(columns=['is_out_of_range'])

In [100]:
historical_df = historical_df.drop(columns=['is_out_of_range'])

In [101]:
sparse_df = sparse_df.drop(columns=['is_out_of_range'])

### Checking Actual vs Expected value of columns

For each financial metric, the actual values might not follow the expected values.

For example, if EBITDA = Operating Income - Depreciation & Amortization, the actual value might not follow this formula. 

This could be because there are adjustments in real-world reporting or human error from inputting the incorrect value, which can result in them to be there by default.

Depending on how many rows are mismatched for each financial metric, we will decide whether to retain these mismatched rows for analysis later on.

#### Checking EBITDA (Operating Income - D&A)

In [103]:
def ebitda_calculation(df):
    # Calculate expected EBITDA
    df['expected_EBITDA'] = df['Operating Income'] + df['Depreciation & Amortization']
    # Compare expected EBITDA with the actual EBITDA, also considering small floating-point differences
    df['EBITDA_match'] = np.isclose(df['expected_EBITDA'], df['EBITDA'])

In [104]:
ebitda_calculation(expanded_df)

In [105]:
ebitda_calculation(partial_df)

In [106]:
# 768 out of 100905 rows in Expanded Dataset still do not match the expected EBITDA formula.
expanded_df[expanded_df['EBITDA_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match
553,ABMD,2018,5.937490e+08,0.039003,1.571370e+08,1.121700e+08,0.188918,4.951680e+08,0.833969,3.380310e+08,9.858100e+07,1.644540e+08,0.0,1.100500e+07,1.681420e+08,False
552,ABMD,2019,7.694320e+08,0.316652,2.248120e+08,2.590160e+08,0.336633,6.398650e+08,0.831607,4.150530e+08,1.295670e+08,2.307670e+08,0.0,1.412100e+07,2.389330e+08,False
608,ACN,2020,4.432704e+10,0.595719,6.513644e+09,5.107839e+09,0.115231,1.397616e+10,0.315296,7.462514e+09,3.035088e+10,7.986088e+09,33071000.0,1.773124e+09,8.286768e+09,False
607,ACN,2021,5.053339e+10,-0.024285,7.621529e+09,5.906809e+09,0.116889,1.636413e+10,0.323828,8.742599e+09,3.416926e+10,7.621529e+09,59492000.0,1.891242e+09,9.512771e+09,False
606,ACN,2022,6.159430e+10,0.161534,9.367181e+09,6.877169e+09,0.111653,1.970154e+10,0.319860,1.033436e+10,4.189277e+10,1.055422e+10,47320000.0,1.310738e+09,1.067792e+10,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,ZBRA,2019,4.485000e+09,0.062035,6.920000e+08,5.440000e+08,0.121293,2.100000e+09,0.468227,1.376000e+09,2.385000e+09,8.430000e+08,89000000.0,1.750000e+08,8.670000e+08,False
482,ZBRA,2020,4.448000e+09,-0.231833,6.510000e+08,5.040000e+08,0.113309,2.003000e+09,0.450315,1.318000e+09,2.445000e+09,7.360000e+08,76000000.0,1.460000e+08,7.970000e+08,False
481,ZBRA,2021,5.627000e+09,4.777891,9.790000e+08,8.370000e+08,0.148747,2.628000e+09,0.467034,1.617000e+09,2.999000e+09,1.173000e+09,5000000.0,1.870000e+08,1.166000e+09,False
480,ZBRA,2022,5.781000e+09,-0.838209,1.346000e+09,4.630000e+08,0.080090,2.624000e+09,0.453901,1.688000e+09,3.157000e+09,1.140000e+09,23000000.0,2.040000e+08,1.550000e+09,False


In [109]:
# All rows in Partial Dataset match the expected EBITDA formula
partial_df[partial_df['EBITDA_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match


#### Checking Net Income (Operating Income - Interest Expense)

In [110]:
def netincome_calculation(df):
    # Calculate expected net income
    df['expected_netincome'] = df['Operating Income'] - df['Interest Expense']
    # Compare expected net income with the actual net income, also considering small floating-point differences
    df['netincome_match'] = np.isclose(df['expected_netincome'], df['Net Income'])

In [111]:
netincome_calculation(expanded_df)

In [113]:
netincome_calculation(partial_df)

In [114]:
# 905 out of 100905 rows in Expanded Dataset still do not match the expected Net Income formula
expanded_df[expanded_df['netincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match
553,ABMD,2018,5.937490e+08,0.039003,1.571370e+08,112170000.0,0.188918,4.951680e+08,0.833969,3.380310e+08,9.858100e+07,1.644540e+08,0.0,11005000.0,1.681420e+08,False,1.571370e+08,False
552,ABMD,2019,7.694320e+08,0.316652,2.248120e+08,259016000.0,0.336633,6.398650e+08,0.831607,4.150530e+08,1.295670e+08,2.307670e+08,0.0,14121000.0,2.389330e+08,False,2.248120e+08,False
551,ABMD,2020,8.408830e+08,-0.234162,2.492190e+08,203009000.0,0.241424,6.895780e+08,0.820064,4.403590e+08,1.513050e+08,2.574820e+08,0.0,8263000.0,2.574820e+08,True,2.492190e+08,False
550,ABMD,2021,8.475220e+08,0.000000,2.295570e+08,225525000.0,0.266099,6.856150e+08,0.808964,4.560580e+08,1.619070e+08,2.469370e+08,0.0,17380000.0,2.469370e+08,True,2.295570e+08,False
549,ABMD,2022,1.031753e+09,3.131401,3.726920e+08,136505000.0,0.132304,8.435950e+08,0.817633,5.868890e+08,1.881580e+08,4.007810e+08,49840000.0,28089000.0,4.007810e+08,True,3.228520e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,ZBRA,2019,4.485000e+09,0.062035,6.920000e+08,544000000.0,0.121293,2.100000e+09,0.468227,1.376000e+09,2.385000e+09,8.430000e+08,89000000.0,175000000.0,8.670000e+08,False,6.030000e+08,False
482,ZBRA,2020,4.448000e+09,-0.231833,6.510000e+08,504000000.0,0.113309,2.003000e+09,0.450315,1.318000e+09,2.445000e+09,7.360000e+08,76000000.0,146000000.0,7.970000e+08,False,5.750000e+08,False
481,ZBRA,2021,5.627000e+09,4.777891,9.790000e+08,837000000.0,0.148747,2.628000e+09,0.467034,1.617000e+09,2.999000e+09,1.173000e+09,5000000.0,187000000.0,1.166000e+09,False,9.740000e+08,False
480,ZBRA,2022,5.781000e+09,-0.838209,1.346000e+09,463000000.0,0.080090,2.624000e+09,0.453901,1.688000e+09,3.157000e+09,1.140000e+09,23000000.0,204000000.0,1.550000e+09,False,1.323000e+09,False


In [115]:
# All rows in Partial Dataset match the expected Net Income formula
partial_df[partial_df['netincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match


#### Checking Gross Profit (Revenue - Cost of Revenue)

In [116]:
def grossprofit_calculation(df):
    # Calculate expected gross profit
    df['expected_grossprofit'] = df['Revenue'] - df['Cost of Revenue']
    
    # Compare expected gross profit with the actual gross profit, considering small floating-point differences
    df['grossprofit_match'] = np.isclose(df['expected_grossprofit'], df['Gross Profit'])

In [117]:
grossprofit_calculation(expanded_df)

In [118]:
grossprofit_calculation(partial_df)

In [119]:
# 5 out of 100905 rows in Expanded Dataset still do not match the expected Gross Profit formula
expanded_df[expanded_df['grossprofit_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match
833,CI,2019,153743000000.0,-0.356192,8077000000.0,5104000000.0,0.033198,0.0,0.799542,145666000000.0,0.0,10916000000.0,1682000000.0,3651000000.0,11728000000.0,False,6395000000.0,False,153743000000.0,False
832,CI,2020,160550000000.0,3.946164,8153000000.0,8458000000.0,0.052673,0.0,0.796297,152397000000.0,0.0,14477000000.0,1438000000.0,2802000000.0,10955000000.0,False,6715000000.0,False,160550000000.0,False
831,CI,2021,174267000000.0,-0.255454,7941000000.0,5370000000.0,0.030814,0.0,0.807416,166326000000.0,0.0,10413000000.0,1208000000.0,2923000000.0,10864000000.0,False,6733000000.0,False,174267000000.0,False
830,CI,2022,180031000000.0,-0.043143,8450000000.0,6704000000.0,0.037112,0.0,0.821714,171581000000.0,0.0,12908000000.0,1228000000.0,2925000000.0,11375000000.0,False,7222000000.0,False,180031000000.0,False
829,CI,2023,195187000000.0,-0.604405,8536000000.0,5164000000.0,0.026457,0.0,0.814491,186651000000.0,0.0,9574000000.0,1446000000.0,3035000000.0,11571000000.0,False,7090000000.0,False,195187000000.0,False


In [120]:
# All rows in Partial Dataset does not match
partial_df[partial_df['grossprofit_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,EBITDA,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match


#### Checking Net Income Ratio (Net Income / Revenue)

In [121]:
def netincomeratio_calculation(df):
    # Calculate expected net income ratio
    df['expected_netincome_ratio'] = df['Net Income'] / df['Revenue']
    
    # Handle the case where Revenue is zero to avoid division by zero
    df['expected_netincome_ratio'] = df['expected_netincome_ratio'].where(df['Revenue'] != 0, 0)
    
    # Compare expected net income ratio with the actual net income ratio, considering small floating-point differences
    df['netincomeratio_match'] = np.isclose(df['expected_netincome_ratio'], df['Net Income Ratio'])

In [122]:
netincomeratio_calculation(expanded_df)

In [123]:
netincomeratio_calculation(partial_df)

In [124]:
# 10 rows out of 100905 in Expanded Dataset still do not match the expected Net Income Ratio
expanded_df[expanded_df['netincomeratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match
91,BAC-PL,2023,95787000000.0,-0.437592,39500000000.0,26515000000.0,0.268967,95787000000.0,1.0,56287000000.0,...,73331000000.0,2057000000.0,41557000000.0,False,-33831000000.0,False,95787000000.0,True,0.276812,False
832,CI,2020,160550000000.0,3.946164,8153000000.0,8458000000.0,0.052673,0.0,0.796297,152397000000.0,...,1438000000.0,2802000000.0,10955000000.0,False,6715000000.0,False,160550000000.0,False,0.052681,False
831,CI,2021,174267000000.0,-0.255454,7941000000.0,5370000000.0,0.030814,0.0,0.807416,166326000000.0,...,1208000000.0,2923000000.0,10864000000.0,False,6733000000.0,False,174267000000.0,False,0.030815,False
830,CI,2022,180031000000.0,-0.043143,8450000000.0,6704000000.0,0.037112,0.0,0.821714,171581000000.0,...,1228000000.0,2925000000.0,11375000000.0,False,7222000000.0,False,180031000000.0,False,0.037238,False
998,JPM,2020,119883000000.0,-0.881334,41756000000.0,29131000000.0,0.284285,119883000000.0,1.0,66656000000.0,...,9960000000.0,8614000000.0,50370000000.0,True,31796000000.0,False,119883000000.0,True,0.242995,False
997,JPM,2021,121685000000.0,17.462388,65024000000.0,48334000000.0,0.369249,121685000000.0,1.0,71336000000.0,...,5553000000.0,7932000000.0,72956000000.0,True,59471000000.0,False,121685000000.0,True,0.397206,False
996,JPM,2022,128641000000.0,-0.929791,52523000000.0,37676000000.0,0.308067,128641000000.0,1.0,76140000000.0,...,26097000000.0,7051000000.0,59574000000.0,True,26426000000.0,False,128641000000.0,True,0.292877,False
860,ROK,2023,9058000000.0,0.096858,1622000000.0,1387400000.0,0.141091,3717000000.0,0.410355,2023700000.0,...,135300000.0,250400000.0,1872400000.0,False,1486700000.0,False,3717000000.0,True,0.153168,False
941,SAP,2021,27840000000.0,0.759336,2885000000.0,5256000000.0,0.195006,19894000000.0,0.714583,15083000000.0,...,949000000.0,1775000000.0,4660000000.0,False,1936000000.0,False,19894000000.0,True,0.188793,False
940,SAP,2022,30872000000.0,-0.457514,4672000000.0,2284000000.0,0.077374,21936000000.0,0.710547,17125000000.0,...,2205000000.0,1569000000.0,6241000000.0,False,2467000000.0,False,21936000000.0,True,0.073983,False


In [125]:
# All rows in Partial Dataset match the expected Net Income Ratio formula
partial_df[partial_df['netincomeratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,Interest Expense,Depreciation & Amortization,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match


#### Checking Gross Profit Ratio (Gross Profit / Revenue)

In [126]:
def grossprofitratio_calculation(df):
    # Calculate expected gross profit ratio
    df['expected_grossprofit_ratio'] = df['Gross Profit'] / df['Revenue']
    
    # Handle the case where Revenue is zero to avoid division by zero
    df['expected_grossprofit_ratio'] = df['expected_grossprofit_ratio'].where(df['Revenue'] != 0, 0)
    
    # Compare expected gross profit ratio with the actual gross profit ratio, considering small floating-point differences
    df['grossprofitratio_match'] = np.isclose(df['expected_grossprofit_ratio'], df['Gross Profit Ratio'])

In [127]:
grossprofitratio_calculation(expanded_df)

In [128]:
grossprofitratio_calculation(partial_df)

In [129]:
# 15 rows out of 100905 in Expanded Dataset still do not match the expected Gross Profit Ratio
expanded_df[expanded_df['grossprofitratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match
359,ANTM,2020,121867000000.0,0.01224,7711000000.0,4572000000.0,0.037516,33822000000.0,0.0,26111000000.0,...,8865000000.0,False,6927000000.0,False,33822000000.0,True,0.037516,True,0.277532,False
358,ANTM,2021,138639000000.0,0.040871,8946000000.0,6104000000.0,0.044028,35994000000.0,0.0,27048000000.0,...,10248000000.0,False,8148000000.0,False,35994000000.0,True,0.044028,True,0.259624,False
357,ANTM,2022,156595000000.0,0.218883,8451000000.0,5894000000.0,0.037638,39953000000.0,0.0,31502000000.0,...,10126000000.0,True,7600000000.0,False,39953000000.0,True,0.037638,True,0.255136,False
356,ANTM,2023,171340000000.0,0.140013,8745000000.0,5987000000.0,0.034942,47010000000.0,0.899072,38265000000.0,...,10490000000.0,True,7715000000.0,False,47010000000.0,True,0.034942,True,0.274367,False
833,CI,2019,153743000000.0,-0.356192,8077000000.0,5104000000.0,0.033198,0.0,0.799542,145666000000.0,...,11728000000.0,False,6395000000.0,False,153743000000.0,False,0.033198,True,0.0,False
832,CI,2020,160550000000.0,3.946164,8153000000.0,8458000000.0,0.052673,0.0,0.796297,152397000000.0,...,10955000000.0,False,6715000000.0,False,160550000000.0,False,0.052681,False,0.0,False
831,CI,2021,174267000000.0,-0.255454,7941000000.0,5370000000.0,0.030814,0.0,0.807416,166326000000.0,...,10864000000.0,False,6733000000.0,False,174267000000.0,False,0.030815,False,0.0,False
830,CI,2022,180031000000.0,-0.043143,8450000000.0,6704000000.0,0.037112,0.0,0.821714,171581000000.0,...,11375000000.0,False,7222000000.0,False,180031000000.0,False,0.037238,False,0.0,False
829,CI,2023,195187000000.0,-0.604405,8536000000.0,5164000000.0,0.026457,0.0,0.814491,186651000000.0,...,11571000000.0,False,7090000000.0,False,195187000000.0,False,0.026457,True,0.0,False
523,ELV,2020,121867000000.0,0.263707,7711000000.0,4572000000.0,0.037516,33822000000.0,0.0,26111000000.0,...,8865000000.0,False,6927000000.0,False,33822000000.0,True,0.037516,True,0.277532,False


In [130]:
# All rows in Partial Dataset match
partial_df[partial_df['grossprofitratio_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_EBITDA,EBITDA_match,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match


#### Checking Revenue (Gross Profit + Cost of Revenue)

In [131]:
def revenue_calculation(df):
    # Calculate expected revenue
    df['expected_revenue'] = df['Gross Profit'] + df['Cost of Revenue']

    # Compare expected revenue with the actual revenue, considering small floating-point differences
    df['revenue_match'] = np.isclose(df['expected_revenue'], df['Revenue'])

In [132]:
revenue_calculation(expanded_df)

In [133]:
revenue_calculation(partial_df)

In [134]:
revenue_calculation(historical_df)

In [135]:
# 5 out of 100905 rows in Expanded Dataset still do not match
expanded_df[expanded_df['revenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match
833,CI,2019,153743000000.0,-0.356192,8077000000.0,5104000000.0,0.033198,0.0,0.799542,145666000000.0,...,6395000000.0,False,153743000000.0,False,0.033198,True,0.0,False,0.0,False
832,CI,2020,160550000000.0,3.946164,8153000000.0,8458000000.0,0.052673,0.0,0.796297,152397000000.0,...,6715000000.0,False,160550000000.0,False,0.052681,False,0.0,False,0.0,False
831,CI,2021,174267000000.0,-0.255454,7941000000.0,5370000000.0,0.030814,0.0,0.807416,166326000000.0,...,6733000000.0,False,174267000000.0,False,0.030815,False,0.0,False,0.0,False
830,CI,2022,180031000000.0,-0.043143,8450000000.0,6704000000.0,0.037112,0.0,0.821714,171581000000.0,...,7222000000.0,False,180031000000.0,False,0.037238,False,0.0,False,0.0,False
829,CI,2023,195187000000.0,-0.604405,8536000000.0,5164000000.0,0.026457,0.0,0.814491,186651000000.0,...,7090000000.0,False,195187000000.0,False,0.026457,True,0.0,False,0.0,False


In [136]:
# All rows in Partial Dataset match
partial_df[partial_df['revenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome,netincome_match,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match


In [141]:
# 43 out of 407 rows in Historical Dataset still do not match
# But we keep the 43 rows anyway because the mismatch is actually caused by NaN in the other columns
historical_df[historical_df['revenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Cost of Revenue,Gross Profit,expected_revenue,revenue_match
121,YLVJB,2020,1064303000.0,,,,False
228,UOSVNX,2014,1709739000.0,,,,False
1332,OFVZT,2010,472967800.0,,,,False
1355,AAZLH,2019,1205587000.0,,,,False
1383,UZLPI,2011,15564450.0,,,,False
1562,OXDRI,2019,1951711000.0,,,,False
1623,YRMTOC,2015,986099300.0,,,,False
2027,PTFPE,2012,275783000.0,,,,False
2476,SKRHFG,2012,1451122000.0,,,,False
2509,SHDBKH,2018,440356000.0,,,,False


#### Checking Revenue Growth (% Change in Revenue from Previous Year)

In [142]:
def revenuegrowth_calculation(df):
    # Sort Stock and Year first
    df = df.sort_values(by=['Stock', 'Year'])
    
    # Calculate expected revenue growth
    df['expected_revenuegrowth'] = df.groupby('Stock')['Revenue'].pct_change()

    # Fill NaN values in expected_revenuegrowth (e.g., first year or missing data)
    df['expected_revenuegrowth'] = df['expected_revenuegrowth'].fillna(0)

    # Compare expected revenue growth with the actual revenue growth, considering small floating-point differences
    df['revenuegrowth_match'] = np.isclose(df['expected_revenuegrowth'], df['Revenue Growth'], atol=1e-2)

    return df

In [143]:
expanded_df = revenuegrowth_calculation(expanded_df)

In [144]:
partial_df = revenuegrowth_calculation(partial_df)

In [145]:
# 100042 out of 100905 rows in Expanded Dataset still do not match the expected Revenue Growth
expanded_df[expanded_df['revenuegrowth_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match
74474,AAAR,2000,1.226394e+09,0.515833,3.822607e+08,3.821538e+08,0.311608,7.326731e+08,0.597421,3.504124e+08,...,7.326731e+08,True,0.311608,True,0.597421,True,1.226394e+09,True,0.000000,False
30360,AAAR,2001,1.363043e+09,0.251834,2.826937e+08,2.823476e+08,0.207145,5.723752e+08,0.419924,2.896814e+08,...,5.723752e+08,True,0.207145,True,0.419924,True,1.363043e+09,True,0.111424,False
57233,AAAR,2002,9.680363e+08,1.469854,1.493846e+08,1.489654e+08,0.153884,4.224707e+08,0.436420,2.730861e+08,...,4.224707e+08,True,0.153884,True,0.436420,True,9.680363e+08,True,-0.289798,False
38752,AAAR,2003,7.725786e+08,0.026633,8.830040e+07,8.753229e+07,0.113299,3.959663e+08,0.512526,3.076659e+08,...,3.959663e+08,True,0.113299,True,0.512526,True,7.725786e+08,True,-0.201912,False
87263,AAAR,2004,9.481924e+07,2.047161,1.385745e+07,1.312690e+07,0.138441,4.645290e+07,0.489910,3.259545e+07,...,4.645290e+07,True,0.138441,True,0.489910,True,9.481924e+07,True,-0.877269,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25480,ZZZC,2020,2.068409e+09,0.236733,6.084053e+08,6.074186e+08,0.293665,1.172125e+09,0.566679,5.637193e+08,...,1.172125e+09,True,0.293665,True,0.566679,True,2.068409e+09,True,0.876464,False
51748,ZZZC,2021,1.044180e+09,4.249838,1.926045e+08,1.923814e+08,0.184242,4.346634e+08,0.416272,2.420589e+08,...,4.346634e+08,True,0.184242,True,0.416272,True,1.044180e+09,True,-0.495177,False
70614,ZZZC,2022,2.231377e+07,0.000000,6.756885e+06,5.839832e+06,0.261714,1.311786e+07,0.587882,6.360976e+06,...,1.311786e+07,True,0.261714,True,0.587882,True,2.231377e+07,True,-0.978630,False
38712,ZZZC,2023,1.388236e+09,0.635764,2.644288e+08,2.641767e+08,0.190297,6.771341e+08,0.487766,4.127053e+08,...,6.771341e+08,True,0.190297,True,0.487766,True,1.388236e+09,True,61.214298,False


In [146]:
# We keep these rows but replace the original Revenue Growth with the expected numbers
expanded_df['Revenue Growth'] = expanded_df['expected_revenuegrowth']

In [147]:
# All rows in Partial Dataset match
partial_df[partial_df['revenuegrowth_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit,grossprofit_match,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match


#### Checking Cost of Revenue (Revenue - Gross Profit)

In [148]:
def costofrevenue_calculation(df):
    # Calculate expected cost of revenue
    df['expected_costofrevenue'] = df['Revenue'] - df['Gross Profit']

    # Compare expected cost of revenue with the actual cost of revenue, considering small floating-point differences
    df['costofrevenue_match'] = np.isclose(df['expected_costofrevenue'], df['Cost of Revenue'])

In [149]:
costofrevenue_calculation(expanded_df)

In [150]:
costofrevenue_calculation(partial_df)

In [151]:
# 5 out of 100905 rows in Expanded Dataset still do not match
expanded_df[expanded_df['costofrevenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match
833,CI,2019,153743000000.0,0.0,8077000000.0,5104000000.0,0.033198,0.0,0.799542,145666000000.0,...,0.033198,True,0.0,False,0.0,False,0.0,False,153743000000.0,False
832,CI,2020,160550000000.0,0.044275,8153000000.0,8458000000.0,0.052673,0.0,0.796297,152397000000.0,...,0.052681,False,0.0,False,0.0,False,0.044275,False,160550000000.0,False
831,CI,2021,174267000000.0,0.085438,7941000000.0,5370000000.0,0.030814,0.0,0.807416,166326000000.0,...,0.030815,False,0.0,False,0.0,False,0.085438,False,174267000000.0,False
830,CI,2022,180031000000.0,0.033076,8450000000.0,6704000000.0,0.037112,0.0,0.821714,171581000000.0,...,0.037238,False,0.0,False,0.0,False,0.033076,False,180031000000.0,False
829,CI,2023,195187000000.0,0.084186,8536000000.0,5164000000.0,0.026457,0.0,0.814491,186651000000.0,...,0.026457,True,0.0,False,0.0,False,0.084186,False,195187000000.0,False


In [152]:
# All rows in Partial Dataset match
partial_df[partial_df['costofrevenue_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_netincome_ratio,netincomeratio_match,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match


#### Checking Operating Income (Gross Profit - Operating Expense)

In [153]:
def operatingincome_calculation(df):
    # Calculate expected operating income
    df['expected_operatingincome'] = df['Gross Profit'] - df['Operating Expenses']
    
    # Compare expected operating income with the actual operating income, considering small floating-point differences
    df['operatingincome_match'] = np.isclose(df['expected_operatingincome'], df['Operating Income'])

In [154]:
operatingincome_calculation(expanded_df)

In [155]:
operatingincome_calculation(partial_df)

In [156]:
# 440 out of 100905 rows in Expanded Dataset still do not match
expanded_df[expanded_df['operatingincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match
549,ABMD,2022,1.031753e+09,0.217376,3.726920e+08,1.365050e+08,0.132304,8.435950e+08,0.817633,5.868890e+08,...,0.817633,True,1.031753e+09,True,0.217376,False,1.881580e+08,True,2.567060e+08,False
824,ADP,2024,1.920260e+10,0.066089,4.977200e+09,3.752000e+09,0.195390,8.725900e+09,0.454412,4.204400e+09,...,0.454412,True,1.920260e+10,True,0.066089,False,1.047670e+10,True,4.521500e+09,False
813,ADSK,2020,3.274300e+09,0.000000,3.430000e+08,2.145000e+08,0.065510,2.949400e+09,0.900773,2.605900e+09,...,0.900773,True,3.274300e+09,True,0.000000,False,3.249000e+08,True,3.435000e+08,False
809,ADSK,2024,5.440000e+09,0.086913,1.128000e+09,9.060000e+08,0.166544,4.929000e+09,0.906066,3.816000e+09,...,0.906066,True,5.440000e+09,True,0.086913,False,5.110000e+08,True,1.113000e+09,False
883,AJG,2019,7.195000e+09,0.000000,8.365000e+08,6.688000e+08,0.092953,2.364100e+09,0.328575,1.542900e+09,...,0.328575,True,7.195000e+09,True,0.000000,False,4.830900e+09,True,8.212000e+08,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,WTW,2022,8.866000e+09,-0.014670,1.178000e+09,1.009000e+09,0.113806,3.620000e+09,0.408301,2.343000e+09,...,0.408301,True,8.866000e+09,True,-0.014670,False,5.246000e+09,True,1.277000e+09,False
483,ZBRA,2019,4.485000e+09,0.000000,6.920000e+08,5.440000e+08,0.121293,2.100000e+09,0.468227,1.376000e+09,...,0.468227,True,4.485000e+09,True,0.000000,False,2.385000e+09,True,7.240000e+08,False
482,ZBRA,2020,4.448000e+09,-0.008250,6.510000e+08,5.040000e+08,0.113309,2.003000e+09,0.450315,1.318000e+09,...,0.450315,True,4.448000e+09,True,-0.008250,False,2.445000e+09,True,6.850000e+08,False
481,ZBRA,2021,5.627000e+09,0.265063,9.790000e+08,8.370000e+08,0.148747,2.628000e+09,0.467034,1.617000e+09,...,0.467034,True,5.627000e+09,True,0.265063,False,2.999000e+09,True,1.011000e+09,False


In [157]:
# All rows in Partial Dataset match
partial_df[partial_df['operatingincome_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_grossprofit_ratio,grossprofitratio_match,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match


#### Checking Operating Expenses (Gross Profit - Operating Income)

In [158]:
def operatingexpenses_calculation(df):
    # Calculate expected operating expenses
    df['expected_operatingexpenses'] = df['Gross Profit'] - df['Operating Income']
    
    # Compare expected operating expenses with the actual operating expense, considering small floating-point differences
    df['operatingexpenses_match'] = np.isclose(df['expected_operatingexpenses'], df['Operating Expenses'])

In [159]:
operatingexpenses_calculation(expanded_df)

In [160]:
operatingexpenses_calculation(partial_df)

In [161]:
# 440 out of 100905 in Expanded Dataset still do not match the expected Operating Expenses
expanded_df[expanded_df['operatingexpenses_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match
549,ABMD,2022,1.031753e+09,0.217376,3.726920e+08,1.365050e+08,0.132304,8.435950e+08,0.817633,5.868890e+08,...,1.031753e+09,True,0.217376,False,1.881580e+08,True,2.567060e+08,False,4.709030e+08,False
824,ADP,2024,1.920260e+10,0.066089,4.977200e+09,3.752000e+09,0.195390,8.725900e+09,0.454412,4.204400e+09,...,1.920260e+10,True,0.066089,False,1.047670e+10,True,4.521500e+09,False,3.748700e+09,False
813,ADSK,2020,3.274300e+09,0.000000,3.430000e+08,2.145000e+08,0.065510,2.949400e+09,0.900773,2.605900e+09,...,3.274300e+09,True,0.000000,False,3.249000e+08,True,3.435000e+08,False,2.606400e+09,False
809,ADSK,2024,5.440000e+09,0.086913,1.128000e+09,9.060000e+08,0.166544,4.929000e+09,0.906066,3.816000e+09,...,5.440000e+09,True,0.086913,False,5.110000e+08,True,1.113000e+09,False,3.801000e+09,False
883,AJG,2019,7.195000e+09,0.000000,8.365000e+08,6.688000e+08,0.092953,2.364100e+09,0.328575,1.542900e+09,...,7.195000e+09,True,0.000000,False,4.830900e+09,True,8.212000e+08,False,1.527600e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,WTW,2022,8.866000e+09,-0.014670,1.178000e+09,1.009000e+09,0.113806,3.620000e+09,0.408301,2.343000e+09,...,8.866000e+09,True,-0.014670,False,5.246000e+09,True,1.277000e+09,False,2.442000e+09,False
483,ZBRA,2019,4.485000e+09,0.000000,6.920000e+08,5.440000e+08,0.121293,2.100000e+09,0.468227,1.376000e+09,...,4.485000e+09,True,0.000000,False,2.385000e+09,True,7.240000e+08,False,1.408000e+09,False
482,ZBRA,2020,4.448000e+09,-0.008250,6.510000e+08,5.040000e+08,0.113309,2.003000e+09,0.450315,1.318000e+09,...,4.448000e+09,True,-0.008250,False,2.445000e+09,True,6.850000e+08,False,1.352000e+09,False
481,ZBRA,2021,5.627000e+09,0.265063,9.790000e+08,8.370000e+08,0.148747,2.628000e+09,0.467034,1.617000e+09,...,5.627000e+09,True,0.265063,False,2.999000e+09,True,1.011000e+09,False,1.649000e+09,False


In [163]:
# All rows in Partial Dataset match
partial_df[partial_df['operatingexpenses_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenue,revenue_match,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match


#### Checking Depreciation & Amortization (EBITDA - Operating Income)

In [164]:
def depreciationamortization_calculation(df):
    # Calculate expected D&A
    df['expected_depreciationamortization'] = df['EBITDA'] - df['Operating Income']
    
    # Compare expected D&A with the actual D&A, considering small floating-point differences
    df['depreciationamortization_match'] = np.isclose(df['expected_depreciationamortization'], df['Depreciation & Amortization'])

In [165]:
depreciationamortization_calculation(expanded_df)

In [166]:
depreciationamortization_calculation(partial_df)

In [167]:
# 768 rows out of 100905 in Expanded Dataset still do not match the expected D&A
expanded_df[expanded_df['depreciationamortization_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
553,ABMD,2018,5.937490e+08,0.000000,1.571370e+08,1.121700e+08,0.188918,4.951680e+08,0.833969,3.380310e+08,...,0.000000,False,9.858100e+07,True,1.571370e+08,True,3.380310e+08,True,7.317000e+06,False
552,ABMD,2019,7.694320e+08,0.295888,2.248120e+08,2.590160e+08,0.336633,6.398650e+08,0.831607,4.150530e+08,...,0.295888,False,1.295670e+08,True,2.248120e+08,True,4.150530e+08,True,5.955000e+06,False
608,ACN,2020,4.432704e+10,0.000000,6.513644e+09,5.107839e+09,0.115231,1.397616e+10,0.315296,7.462514e+09,...,0.000000,False,3.035088e+10,True,6.513644e+09,True,7.462514e+09,True,1.472444e+09,False
607,ACN,2021,5.053339e+10,0.140013,7.621529e+09,5.906809e+09,0.116889,1.636413e+10,0.323828,8.742599e+09,...,0.140013,False,3.416926e+10,True,7.621529e+09,True,8.742599e+09,True,0.000000e+00,False
606,ACN,2022,6.159430e+10,0.218883,9.367181e+09,6.877169e+09,0.111653,1.970154e+10,0.319860,1.033436e+10,...,0.218883,False,4.189277e+10,True,9.367181e+09,True,1.033436e+10,True,1.187044e+09,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,ZBRA,2019,4.485000e+09,0.000000,6.920000e+08,5.440000e+08,0.121293,2.100000e+09,0.468227,1.376000e+09,...,0.000000,False,2.385000e+09,True,7.240000e+08,False,1.408000e+09,False,1.510000e+08,False
482,ZBRA,2020,4.448000e+09,-0.008250,6.510000e+08,5.040000e+08,0.113309,2.003000e+09,0.450315,1.318000e+09,...,-0.008250,False,2.445000e+09,True,6.850000e+08,False,1.352000e+09,False,8.500000e+07,False
481,ZBRA,2021,5.627000e+09,0.265063,9.790000e+08,8.370000e+08,0.148747,2.628000e+09,0.467034,1.617000e+09,...,0.265063,False,2.999000e+09,True,1.011000e+09,False,1.649000e+09,False,1.940000e+08,False
480,ZBRA,2022,5.781000e+09,0.027368,1.346000e+09,4.630000e+08,0.080090,2.624000e+09,0.453901,1.688000e+09,...,0.027368,False,3.157000e+09,True,9.360000e+08,False,1.278000e+09,False,-2.060000e+08,False


In [168]:
# All rows in Partial Dataset match
partial_df[partial_df['depreciationamortization_match'] == False]

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match


#### Dropping rows with 'False' in any of Match columns

In [169]:
# Expanded Dataset - All match columns except for expected revenue growth match
expanded_df_match_columns = [col for col in expanded_df.columns if 'match' in col and col != 'revenuegrowth_match']

# Filter rows where any of the match columns have False values - 905 rows
expanded_df_rows_with_false = expanded_df[~expanded_df[expanded_df_match_columns].all(axis=1)]

expanded_df_rows_with_false

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
553,ABMD,2018,5.937490e+08,0.000000,1.571370e+08,112170000.0,0.188918,4.951680e+08,0.833969,3.380310e+08,...,0.000000,False,9.858100e+07,True,1.571370e+08,True,3.380310e+08,True,7317000.0,False
552,ABMD,2019,7.694320e+08,0.295888,2.248120e+08,259016000.0,0.336633,6.398650e+08,0.831607,4.150530e+08,...,0.295888,False,1.295670e+08,True,2.248120e+08,True,4.150530e+08,True,5955000.0,False
551,ABMD,2020,8.408830e+08,0.092862,2.492190e+08,203009000.0,0.241424,6.895780e+08,0.820064,4.403590e+08,...,0.092862,False,1.513050e+08,True,2.492190e+08,True,4.403590e+08,True,8263000.0,True
550,ABMD,2021,8.475220e+08,0.007895,2.295570e+08,225525000.0,0.266099,6.856150e+08,0.808964,4.560580e+08,...,0.007895,True,1.619070e+08,True,2.295570e+08,True,4.560580e+08,True,17380000.0,True
549,ABMD,2022,1.031753e+09,0.217376,3.726920e+08,136505000.0,0.132304,8.435950e+08,0.817633,5.868890e+08,...,0.217376,False,1.881580e+08,True,2.567060e+08,False,4.709030e+08,False,28089000.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,ZBRA,2019,4.485000e+09,0.000000,6.920000e+08,544000000.0,0.121293,2.100000e+09,0.468227,1.376000e+09,...,0.000000,False,2.385000e+09,True,7.240000e+08,False,1.408000e+09,False,151000000.0,False
482,ZBRA,2020,4.448000e+09,-0.008250,6.510000e+08,504000000.0,0.113309,2.003000e+09,0.450315,1.318000e+09,...,-0.008250,False,2.445000e+09,True,6.850000e+08,False,1.352000e+09,False,85000000.0,False
481,ZBRA,2021,5.627000e+09,0.265063,9.790000e+08,837000000.0,0.148747,2.628000e+09,0.467034,1.617000e+09,...,0.265063,False,2.999000e+09,True,1.011000e+09,False,1.649000e+09,False,194000000.0,False
480,ZBRA,2022,5.781000e+09,0.027368,1.346000e+09,463000000.0,0.080090,2.624000e+09,0.453901,1.688000e+09,...,0.027368,False,3.157000e+09,True,9.360000e+08,False,1.278000e+09,False,-206000000.0,False


Upon closer inspection, most of the stocks with mismatched values are from the real stocks called using the Financial Modelling Prep API. This could be a reflection of the actual values using different real-world calculations than the assumptions we are using (e.g. Operating Expenses = Gross Profit - Operating Income).

We do not drop these rows.

In [176]:
expanded_df_rows_with_false['Stock'].unique()

array(['ABMD', 'ACN', 'ADBE', 'ADP', 'ADSK', 'AJG', 'ALNY', 'AMGN', 'AMP',
       'ANSS', 'ANTM', 'AON', 'APD', 'APP', 'ARGX', 'ASML', 'ASR', 'ATRI',
       'AVGOP', 'AXON', 'AXP', 'AYI', 'AZO', 'BAC-PL', 'BH-A', 'BIO',
       'BIO-B', 'BKNG', 'BLD', 'BLK', 'BRK-A', 'BRK-B', 'BURL', 'CABO',
       'CACC', 'CACI', 'CASY', 'CAT', 'CB', 'CDNS', 'CEG', 'CHE', 'CHTR',
       'CI', 'CMI', 'COIN', 'COKE', 'COST', 'CPAY', 'CRM', 'CRWD', 'CSL',
       'CSWI', 'CVCO', 'CW', 'CYBR', 'DDS', 'DE', 'DHR-PA', 'DHR-PB',
       'DJCO', 'DPZ', 'DUOL', 'EFX', 'ELV', 'EME', 'EQIX', 'ERIE', 'ESLT',
       'ESS', 'ETN', 'EVR', 'FCNCA', 'FDS', 'FDX', 'FFIV', 'FICO', 'FIX',
       'FLT', 'FLUT', 'FTV-PA', 'GD', 'GEV', 'GHC', 'GPI', 'GS', 'GWW',
       'HCA', 'HD', 'HUBB', 'HUBS', 'HUM', 'IDXX', 'IESC', 'INTU', 'ISRG',
       'IT', 'JLL', 'JPM', 'KAI', 'KLAC', 'KNSL', 'KRTX', 'KSU', 'LAD',
       'LII', 'LIN', 'LLY', 'LMT', 'LPLA', 'LULU', 'MA', 'MANH', 'MAR',
       'MCD', 'MCK', 'MCO', 'MDGL', 'MEDP', 'MELI'

In [178]:
# expanded_df.to_csv('expanded_df_v6.csv', index=False)
# partial_df.to_csv('partial_df_v6.csv', index=False)
# historical_df.to_csv('historical_df_v5.csv', index=False)
# sparse_df.to_csv('sparse_df_v5.csv', index=False)

In [179]:
expanded_df.shape

(100905, 36)

In [180]:
partial_df.shape

(7997, 36)

In [181]:
historical_df.shape

(407, 7)

In [182]:
sparse_df.shape

(478, 3)

# 4. Merge all datasets into one

Now that the datasets have been cleaned, we merge all 4 datasets into one.

We use concat to join since the datasets have the same structure, and we only want to combine rows.

In [183]:
merged_df = pd.concat([expanded_df, partial_df], axis=0, ignore_index=True).drop_duplicates()

In [184]:
merged_df = pd.concat([merged_df, historical_df, sparse_df], axis=0, ignore_index=True).drop_duplicates()

In [185]:
merged_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAAR,2000,1.226394e+09,0.000000,3.822607e+08,3.821538e+08,0.311608,7.326731e+08,0.597421,3.504124e+08,...,0.000000,False,4.937207e+08,True,3.822607e+08,True,3.504124e+08,True,1056139.0,True
1,AAAR,2001,1.363043e+09,0.111424,2.826937e+08,2.823476e+08,0.207145,5.723752e+08,0.419924,2.896814e+08,...,0.111424,False,7.906683e+08,True,2.826937e+08,True,2.896814e+08,True,7333540.0,True
2,AAAR,2002,9.680363e+08,-0.289798,1.493846e+08,1.489654e+08,0.153884,4.224707e+08,0.436420,2.730861e+08,...,-0.289798,False,5.455656e+08,True,1.493846e+08,True,2.730861e+08,True,3297207.0,True
3,AAAR,2003,7.725786e+08,-0.201912,8.830040e+07,8.753229e+07,0.113299,3.959663e+08,0.512526,3.076659e+08,...,-0.201912,False,3.766123e+08,True,8.830040e+07,True,3.076659e+08,True,4757430.0,True
4,AAAR,2004,9.481924e+07,-0.877269,1.385745e+07,1.312690e+07,0.138441,4.645290e+07,0.489910,3.259545e+07,...,-0.877269,False,4.836634e+07,True,1.385745e+07,True,3.259545e+07,True,569588.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109750,RXDK,2003,7.185620e+08,,,,,,,,...,,,,,,,,,,
109751,RXDK,2004,1.337162e+09,,,,,,,,...,,,,,,,,,,
109752,RXDK,2005,9.438851e+07,,,,,,,,...,,,,,,,,,,
109753,RXDK,2006,7.436636e+08,,,,,,,,...,,,,,,,,,,


#### Checking Merged Dataset

In [186]:
# Checking shape of merged dataset
merged_df.shape

(109755, 36)

In [187]:
# Ensure no more duplicate rows
duplicate_rows = merged_df[merged_df.duplicated()]
duplicate_rows

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match


There will be NaN values from merging other datasets. 

We need to ensure that there are no completely null rows.

In [188]:
merged_df = merged_df.dropna(how='all')

if merged_df.isnull().all(axis=1).any():
    print("There are rows where all values are null.")
else:
    print("No completely null rows.")

No completely null rows.


In [189]:
# Check data types
merged_df.dtypes

Stock                                 object
Year                                   int64
Revenue                              float64
Revenue Growth                       float64
Operating Income                     float64
Net Income                           float64
Net Income Ratio                     float64
Gross Profit                         float64
Gross Profit Ratio                   float64
Operating Expenses                   float64
Cost of Revenue                      float64
EBITDA                               float64
Interest Expense                     float64
Depreciation & Amortization          float64
expected_EBITDA                      float64
EBITDA_match                          object
expected_netincome                   float64
netincome_match                       object
expected_grossprofit                 float64
grossprofit_match                     object
expected_netincome_ratio             float64
netincomeratio_match                  object
expected_g

In [190]:
# Summary of merged dataset
merged_df.describe()

Unnamed: 0,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,...,expected_netincome,expected_grossprofit,expected_netincome_ratio,expected_grossprofit_ratio,expected_revenue,expected_revenuegrowth,expected_costofrevenue,expected_operatingincome,expected_operatingexpenses,expected_depreciationamortization
count,109755.0,109755.0,108870.0,108870.0,108870.0,108870.0,109234.0,108870.0,108870.0,109234.0,...,108870.0,108870.0,108870.0,108870.0,109234.0,108870.0,108870.0,108870.0,108870.0,108870.0
mean,2012.814879,1288052000.0,inf,200961600.0,189254500.0,0.141341,575910000.0,0.450374,395468500.0,705264000.0,...,194703400.0,584256000.0,0.14134,0.450343,1281174000.0,inf,713633100.0,180853500.0,375360300.0,12888110.0
std,7.472332,5174509000.0,,1310842000.0,920827600.0,2.015173,2381924000.0,0.091703,2372917000.0,3403115000.0,...,1238412000.0,2657713000.0,2.015173,0.091664,5054281000.0,,3603966000.0,1661071000.0,1375360000.0,323865100.0
min,2000.0,0.0,-0.9995006,-26985000000.0,-22819000000.0,-663.119266,-259078000.0,-5.77678,0.0,0.0,...,-48704000000.0,-259078000.0,-663.119266,-5.77678,0.0,-0.9995006,0.0,-186651000000.0,-29254000000.0,-52572000000.0
25%,2006.0,549152900.0,-0.4575421,37379030.0,36746020.0,0.071269,238580300.0,0.375018,158416900.0,293337600.0,...,36752830.0,238812300.0,0.071269,0.374998,549103900.0,-0.4575421,293593500.0,37238370.0,158353800.0,2608643.0
50%,2013.0,1096443000.0,0.0,124318100.0,123614900.0,0.148225,474869700.0,0.449813,316750600.0,587798100.0,...,123669800.0,475102400.0,0.14823,0.449782,1096249000.0,0.0,587978200.0,124099000.0,316655100.0,5090860.0
75%,2020.0,1647164000.0,0.8511327,259534600.0,258618500.0,0.225067,721625200.0,0.525462,481213200.0,884871300.0,...,258852100.0,721775800.0,0.225067,0.52544,1647294000.0,0.8511327,885173100.0,259221500.0,481055600.0,7583524.0
max,2024.0,367533000000.0,inf,137236000000.0,96223000000.0,2.285934,193405000000.0,1.063719,264766000000.0,296367000000.0,...,136051000000.0,195187000000.0,2.285934,1.0,367533000000.0,inf,296367000000.0,137236000000.0,140447000000.0,24693000000.0


**Year**: 

Min year is 2000, Max year is 2024. Median year is 2013. The mean year 2012 suggests a midpoint near the median, indicating that the years are evenly distributed and not heavily skewed.

**Revenue Growth**: 

Need to explore Mean and Max Revenue Growth (inf) as it might represent outliers.

**Operating Income, Net Income, Net Income Ratio, Gross Profit, Gross Profit Ratio, Operating Expenses**:

The negative min values may need to be investigated for accuracy or treated by replacing or refining.

**Cost of Revenue**: 

The large standard deviation (3.403115e+09) suggests outliers or extreme cases.

In [191]:
# 9564 rows still have rows with impossible negative values
negative_rows = merged_df[
    (merged_df[['Operating Income', 'Net Income', 'Net Income Ratio', 
                'Gross Profit', 'Gross Profit Ratio', 'Operating Expenses']] < 0).any(axis=1)
]

print(negative_rows)

         Stock  Year       Revenue  Revenue Growth  Operating Income  \
5         AAAR  2005  6.050416e+08        5.381000     -2.464341e+07   
38      AAGYLB  2013  8.312850e+08       -0.598627     -1.380779e+07   
39      AAGYLB  2014  4.357647e+08       -0.475794     -1.429305e+07   
54      AAHKPV  2004  7.513738e+08       -0.480948     -5.106996e+06   
58      AAHKPV  2008  2.082850e+09        1.786350     -1.037638e+08   
...        ...   ...           ...             ...               ...   
108783  ZTASWV  2023  2.625965e+08       -0.787795     -1.001228e+07   
108805   ZVHCG  2023  1.266791e+09        0.193511     -1.083619e+08   
108819  ZWDIJC  2023  1.357441e+09       -0.154495     -6.649307e+06   
108830  ZXBJSE  2022  3.820760e+08        0.000000     -2.639081e+07   
108837  ZXKILH  2023  1.437732e+09       -0.192163     -8.332478e+07   

          Net Income  Net Income Ratio  Gross Profit  Gross Profit Ratio  \
5      -2.535253e+07         -0.041902  1.957464e+08       

In [192]:
# Remove rows with impossible negative values
merged_df = merged_df[~(
    (merged_df[['Operating Income', 'Net Income', 'Net Income Ratio', 
                'Gross Profit', 'Gross Profit Ratio', 'Operating Expenses']] < 0).any(axis=1)
)]

In [193]:
merged_df

Unnamed: 0,Stock,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,...,expected_revenuegrowth,revenuegrowth_match,expected_costofrevenue,costofrevenue_match,expected_operatingincome,operatingincome_match,expected_operatingexpenses,operatingexpenses_match,expected_depreciationamortization,depreciationamortization_match
0,AAAR,2000,1.226394e+09,0.000000,3.822607e+08,3.821538e+08,0.311608,7.326731e+08,0.597421,3.504124e+08,...,0.000000,False,4.937207e+08,True,3.822607e+08,True,3.504124e+08,True,1056139.0,True
1,AAAR,2001,1.363043e+09,0.111424,2.826937e+08,2.823476e+08,0.207145,5.723752e+08,0.419924,2.896814e+08,...,0.111424,False,7.906683e+08,True,2.826937e+08,True,2.896814e+08,True,7333540.0,True
2,AAAR,2002,9.680363e+08,-0.289798,1.493846e+08,1.489654e+08,0.153884,4.224707e+08,0.436420,2.730861e+08,...,-0.289798,False,5.455656e+08,True,1.493846e+08,True,2.730861e+08,True,3297207.0,True
3,AAAR,2003,7.725786e+08,-0.201912,8.830040e+07,8.753229e+07,0.113299,3.959663e+08,0.512526,3.076659e+08,...,-0.201912,False,3.766123e+08,True,8.830040e+07,True,3.076659e+08,True,4757430.0,True
4,AAAR,2004,9.481924e+07,-0.877269,1.385745e+07,1.312690e+07,0.138441,4.645290e+07,0.489910,3.259545e+07,...,-0.877269,False,4.836634e+07,True,1.385745e+07,True,3.259545e+07,True,569588.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109750,RXDK,2003,7.185620e+08,,,,,,,,...,,,,,,,,,,
109751,RXDK,2004,1.337162e+09,,,,,,,,...,,,,,,,,,,
109752,RXDK,2005,9.438851e+07,,,,,,,,...,,,,,,,,,,
109753,RXDK,2006,7.436636e+08,,,,,,,,...,,,,,,,,,,


In [194]:
merged_df.describe()

Unnamed: 0,Year,Revenue,Revenue Growth,Operating Income,Net Income,Net Income Ratio,Gross Profit,Gross Profit Ratio,Operating Expenses,Cost of Revenue,...,expected_netincome,expected_grossprofit,expected_netincome_ratio,expected_grossprofit_ratio,expected_revenue,expected_revenuegrowth,expected_costofrevenue,expected_operatingincome,expected_operatingexpenses,expected_depreciationamortization
count,100191.0,100191.0,99306.0,99306.0,99306.0,99306.0,99670.0,99306.0,99306.0,99670.0,...,99306.0,99306.0,99306.0,99306.0,99670.0,99306.0,99306.0,99306.0,99306.0,99306.0
mean,2012.816061,1298722000.0,2.834273,223658300.0,211683000.0,0.165789,593120700.0,0.461391,390771500.0,698118800.0,...,217011600.0,602333600.0,0.165789,0.461357,1291240000.0,2.834273,707267700.0,202864000.0,369977200.0,13974910.0
std,7.475134,5217603000.0,28.277972,1362351000.0,955152700.0,0.092602,2401746000.0,0.084817,2292548000.0,3437781000.0,...,1286195000.0,2700396000.0,0.092602,0.084775,5087051000.0,28.277972,3655443000.0,1712221000.0,1295191000.0,288065500.0
min,2000.0,69931.0,-0.999084,132618.9,441.5663,2e-06,0.0,0.0,0.0,0.0,...,-48704000000.0,0.0,2e-06,0.0,0.0,-0.999084,0.0,-186651000000.0,-29254000000.0,-34832000000.0
25%,2006.0,556498800.0,-0.451841,56902160.0,56308770.0,0.092684,248722800.0,0.394067,157117200.0,291848400.0,...,56299110.0,248881900.0,0.092684,0.394047,556319800.0,-0.451841,292215900.0,56765110.0,157054200.0,2609362.0
50%,2013.0,1100775000.0,0.0,143533900.0,142935400.0,0.161458,489483700.0,0.463116,311399600.0,580261500.0,...,142963500.0,489779800.0,0.161458,0.463088,1100655000.0,0.0,580447400.0,143396800.0,311274400.0,5096295.0
75%,2020.0,1648227000.0,0.853648,276574200.0,275670300.0,0.232938,742940600.0,0.531942,471637700.0,868981300.0,...,275878000.0,743188100.0,0.232938,0.531917,1648433000.0,0.853648,869102100.0,276249300.0,471530600.0,7587542.0
max,2024.0,367533000000.0,1889.754699,137236000000.0,96223000000.0,2.285934,193405000000.0,1.063719,242950000000.0,296367000000.0,...,136051000000.0,195187000000.0,2.285934,1.0,367533000000.0,1889.754699,296367000000.0,137236000000.0,89173000000.0,24693000000.0


In [195]:
# merged_df.to_csv('merged_df_v4.csv', index=False)