In [2]:
# File dimensions: 50000 rows × 40 columns

# pandas offers great tools for analyzing data sets, provides in-memory 2d table object called Dataframe
# plenty of documentation for the package can be found online, seems to be a great community behind its usage
import pandas as pd

# numpy package mostly used for determining data types of data values
import numpy as np

# time and datetime packages used for determining the runtime of code
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("RC base.csv", low_memory=False)
test = pd.read_csv("RC test - full.csv", low_memory=False)

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
# 0:20
# 150:200
#del index_[0:20]
#base = base.drop(columns=index_)
#test = test.drop(columns=index_)

# reinitialize cols to updated data input size (testing purposes)
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

# num used to parse data input
num = 50000
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)

# iterates through each row of the base and test file
for (indexf1, rowf1), (indexf2, rowf2) in zip(base.iterrows(), test.iterrows()):
    
    # iterates through each attribute of a record
    for i in range(0, len(cols)):
        
        # data is a numerical value
        if cols_dtypes[i] == np.int64 or cols_dtypes[i] == np.float64:
            
            # at least one record has data for this specific attribute
            if not(rowf1[i] == -1 and rowf2[i] == -1):
                total_counts[i] += 1

            # f1 < f2 => increase
            if rowf1[i] < rowf2[i]:
                up_counts[i] += 1
                diff_counts[i] += 1
                
            # f1 > f2 => decrease
            elif rowf1[i] > rowf2[i]:
                down_counts[i] += 1
                diff_counts[i] += 1
        
        # data is a string value
        else:
            
            # at least one record has data for this specific attribute
            if not(rowf1[i] == '-1' and rowf2[i] == '-1'):
                total_counts[i] += 1
            
            # special check for NaN values (not all NaN values equal each other for some reason?)
            if not(pd.isna(rowf1[i]) or pd.isna(rowf2[i])):
                if (rowf1[i] != rowf2[i]):
                    diff_counts[i] += 1

# -1 entries will have adverse effects on numerical statistical measures such as mean and standard deviation;
# replacing all -1's with NaN bypasses these unintended effects for a more accurate description of the data
base = base.replace(-1, np.NaN)
test = test.replace(-1, np.NaN)

# base_t and test_t hold information on mean and std, applied transpose for easy access of these statistical measures
base_t = base.describe().transpose()
test_t = test.describe().transpose()

index = -1
data = []
df_columns = ['index', 'dtype', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct', 
              'mean_f1', 'mean_f2', 'mean_diff (f2-f1)', 'std_f1', 'std_f2', 'std_diff (f2-f1)', 'min_f1', 'min_f2', 
              'max_f1', 'max_f2']

# option to exclude any attributes that experienced no changes across all records between the two files
excludeNoChange = False
for (x0, x1, x2, x3, x4, x5) in zip(cols_dtypes, cols, total_counts, diff_counts, up_counts, down_counts):
    
    index += 1
    if excludeNoChange == True:
        if x3 == 0:
            continue
    
    # handles special case where there is a count of 0 to avoid division by 0 error
    if (x2 == 0):
        row = [index, cols_dtypes[index], x1, x2, x3, -1, x4, -1, x5, -1]
    else:
        row = [index, cols_dtypes[index], x1, x2, x3, x3/x2*100, x4, x4/x2*100, x5, x5/x2*100]

    # numerical statistical measures for numerical data values ONLY
    if (x0 == np.int64 or x0 == np.float64):
        mean_f1 = base_t['mean'][x1]
        mean_f2 = test_t['mean'][x1]
        row.extend((mean_f1, mean_f2, mean_f2 - mean_f1))
        
        std_f1 = base_t['std'][x1]
        std_f2 = test_t['std'][x1]
        row.extend((std_f1, std_f2, std_f2 - std_f1))
        
        min_f1 = base_t['min'][x1]
        min_f2 = test_t['min'][x1]
        row.extend((min_f1, min_f2))
        
        max_f1 = base_t['max'][x1]
        max_f2 = test_t['max'][x1]
        row.extend((max_f1, max_f2))
    
    else:
        row.extend((np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 'N/A', 'N/A', 'N/A', 'N/A'))
    
    data.append(row)

    
print(datetime.datetime.now() - begin_time)
end = time.time()
print(f"TIME: {end - start}")
    
df = pd.DataFrame(data, columns=df_columns)
title = f'RC_Full Comparison, Runtime - {(int)(end-start)}s.csv'
df.to_csv(title, index=False)

0:02:42.810186
TIME: 162.81118297576904
