In [35]:
# After much research, I found that Pandas offers great tools for analyzing data sets. Plenty of documentation for the package
# can be found online, and there seems to be a great community behind its usage.
import pandas as pd
import numpy as np
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

# Across the two files, time_ms appears to be different for all records, so I made the decision to drop this attribute;
# otherwise, all records would be considered "different" because of this attribute
base = base.drop(columns=['time_ms'])
test = test.drop(columns=['time_ms'])

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
#del index_[0:20]
#base = base.drop(columns=index_)
#test = test.drop(columns=index_)

# reinitialize cols to updated data input size
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

base = base.replace(-1, np.NaN)
test = test.replace(-1, np.NaN)

# base_t and test_t hold information on mean and std, applied transpose for easy access of these statistical measures
base_t = base.describe().transpose()
test_t = test.describe().transpose()

# num used to parse data input
num = 500
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)

excludeNoChange = False

for index, row in base.iterrows():
    accountnumber = row['accountnumber']
    
    base_row = base.loc[base['accountnumber'] == accountnumber]
    test_row = test.loc[test['accountnumber'] == accountnumber]
    
    less = base_row.lt(test_row).values.tolist()
    more = base_row.gt(test_row).values.tolist()
    equal = base_row.eq(test_row).values.tolist()
    
    for (x, y, z) in zip(less, more, equal):
        for i in range(0, len(cols)):
            if x[i] == True:
                down_counts[i] += 1
                diff_counts[i] += 1
            if y[i] == True:
                up_counts[i] += 1
                diff_counts[i] += 1
            total_counts[i] += 1
            i += 1
    
index = -1
data = []
df_columns = ['index', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct', 
              'mean_f1', 'mean_f2', 'mean_diff (f2-f1)', 'std_f1', 'std_f2', 'std_diff (f2-f1)']
for (x0, x1, x2, x3, x4, x5) in zip(cols_dtypes, cols, total_counts, diff_counts, up_counts, down_counts):
    
    index += 1
    if excludeNoChange == True:
        if x3 == 0:
            continue
            
    if (x2 == 0):
        row = [index, x1, x2, x3, -1, x4, -1, x5, -1]
    else:
        row = [index, x1, x2, x3, x3/x2*100, x4, x4/x2*100, x5, x5/x2*100]
    
    if (x0 == np.int64 or x0 == np.float64):
        mean_f1 = base_t['mean'][x1]
        mean_f2 = test_t['mean'][x1]
        row.extend((mean_f1, mean_f2, mean_f2 - mean_f1))
        
        std_f1 = base_t['std'][x1]
        std_f2 = test_t['std'][x1]
        row.extend((std_f1, std_f2, std_f2 - std_f1))
    else:
        row.extend((np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN))
    
    data.append(row)

    
print(datetime.datetime.now() - begin_time)
end = time.time()
print(f"TIME: {end - start}")

df = pd.DataFrame(data, columns=df_columns)
df

# base.plot(kind="bar", x="accountnumber", y="ageoldestrecord")

0:14:19.815308
TIME: 859.8162910938263


Unnamed: 0,index,field,total_cnt,diff_cnt,diff_pct,up_cnt,up_pct,down_cnt,down_pct,mean_f1,mean_f2,mean_diff (f2-f1),std_f1,std_f2,std_diff (f2-f1)
0,0,seq,500,0,0.0,0,0.0,0,0.0,1.252690e+03,1.252690e+03,0.000000e+00,7.227266e+02,7.227266e+02,0.000000e+00
1,1,accountnumber,500,0,0.0,0,0.0,0,0.0,1.252690e+03,1.252690e+03,0.000000e+00,7.227266e+02,7.227266e+02,0.000000e+00
2,2,ageoldestrecord,500,479,95.8,3,0.6,476,95.2,2.823904e+02,2.988413e+02,1.645094e+01,1.234749e+02,1.234745e+02,-4.070953e-04
3,3,agenewestrecord,500,18,3.6,2,0.4,16,3.2,4.611691e+00,5.112735e+00,5.010438e-01,2.568288e+01,2.797533e+01,2.292448e+00
4,4,recentupdate,500,3,0.6,2,0.4,1,0.2,9.298597e-01,9.278557e-01,-2.004008e-03,2.556396e-01,2.589863e-01,3.346742e-03
5,5,srcsconfirmidaddrcount,500,208,41.6,75,15.0,133,26.6,3.969940e+00,4.058116e+00,8.817635e-02,2.302674e+00,2.382717e+00,8.004267e-02
6,6,creditbureaurecord,500,0,0.0,0,0.0,0,0.0,9.579158e-01,9.579158e-01,0.000000e+00,2.009827e-01,2.009827e-01,0.000000e+00
7,7,verificationfailure,500,0,0.0,0,0.0,0,0.0,2.004008e-02,2.004008e-02,0.000000e+00,1.402780e-01,1.402780e-01,0.000000e+00
8,8,ssnnotfound,500,0,0.0,0,0.0,0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
9,9,ssnfoundother,500,1,0.2,1,0.2,0,0.0,4.273504e-03,2.132196e-03,-2.141308e-03,6.530202e-02,4.617571e-02,-1.912631e-02


In [26]:
# base.head()
condition = base['ageoldestrecord']!=-1
base[condition].mean()['ageoldestrecord']


282.39039665970773

In [30]:
# test.head()
condition = test['ageoldestrecord']!=-1

print(test[condition].describe()['ageoldestrecord'])
print(test[condition].mean()['ageoldestrecord'])

count    479.000000
mean     298.841336
std      123.474524
min       46.000000
25%      201.000000
50%      299.000000
75%      397.000000
max      634.000000
Name: ageoldestrecord, dtype: float64
298.8413361169102


In [46]:
diff_count = 0

count = 0;

for index, row in base.iterrows():
    accountnumber = row['accountnumber']
    
    base_row = base.loc[base['accountnumber'] == accountnumber]
    test_row = test.loc[test['accountnumber'] == accountnumber]
    
    print(type(base_row))
    print(type(test_row))
    
    print(base_row.equals(test_row))
    
    break
    
          
    


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
False


In [None]:
"""

    print(index+1, row['accountnumber'])

for index, row in test.iterrows():
    print(index+1, row['accountnumber'])
    
    """

In [7]:
base_t

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seq,500.0,1.252690e+03,7.227266e+02,5.0,6.287500e+02,1.252500e+03,1.876250e+03,2.505000e+03
accountnumber,500.0,1.252690e+03,7.227266e+02,5.0,6.287500e+02,1.252500e+03,1.876250e+03,2.505000e+03
ageoldestrecord,479.0,2.823904e+02,1.234749e+02,29.0,1.850000e+02,2.820000e+02,3.810000e+02,6.180000e+02
agenewestrecord,479.0,4.611691e+00,2.568288e+01,1.0,1.000000e+00,1.000000e+00,1.000000e+00,3.030000e+02
recentupdate,499.0,9.298597e-01,2.556396e-01,0.0,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00
srcsconfirmidaddrcount,499.0,3.969940e+00,2.302674e+00,0.0,2.000000e+00,4.000000e+00,5.000000e+00,1.200000e+01
creditbureaurecord,499.0,9.579158e-01,2.009827e-01,0.0,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00
verificationfailure,499.0,2.004008e-02,1.402780e-01,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00
ssnnotfound,468.0,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
ssnfoundother,468.0,4.273504e-03,6.530202e-02,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00


In [26]:
base

Unnamed: 0,seq,accountnumber,ageoldestrecord,agenewestrecord,recentupdate,srcsconfirmidaddrcount,creditbureaurecord,verificationfailure,ssnnotfound,ssnfoundother,verifiedname
0,5,5,180.0,172.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
1,10,10,423.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,3.0
2,15,15,300.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,3.0
3,20,20,,,0.0,0.0,0.0,0.0,,,3.0
4,25,25,394.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,3.0
5,30,30,228.0,1.0,1.0,5.0,1.0,0.0,0.0,0.0,3.0
6,35,35,292.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0
7,40,40,129.0,1.0,1.0,5.0,1.0,0.0,0.0,0.0,3.0
8,45,45,187.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0
9,50,50,216.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,3.0


In [34]:
# After much research, I found that Pandas offers great tools for analyzing data sets. Plenty of documentation for the package
# can be found online, and there seems to be a great community behind its usage.
import pandas as pd
import numpy as np
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

# Across the two files, time_ms appears to be different for all records, so I made the decision to drop this attribute;
# otherwise, all records would be considered "different" because of this attribute
base = base.drop(columns=['time_ms'])
test = test.drop(columns=['time_ms'])

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
#del index_[0:20]
#base = base.drop(columns=index_)
#test = test.drop(columns=index_)

# reinitialize cols to updated data input size
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

base = base.replace(-1, np.NaN)
test = test.replace(-1, np.NaN)

# base_t and test_t hold information on mean and std, applied transpose for easy access of these statistical measures
base_t = base.describe().transpose()
test_t = test.describe().transpose()

# num used to parse data input
num = 500
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)

excludeNoChange = False

for (indexf1, rowf1), (indexf2, rowf2) in zip(base.iterrows(), test.iterrows()):
    for i in range(0, len(cols)):
        if cols_dtypes[i] == np.int64 or cols_dtypes[i] == np.float64:
            if rowf1[i] < rowf2[i]:
                up_counts[i] += 1;
                diff_counts[i] += 1
            elif rowf1[i] > rowf2[i]:
                down_counts[i] += 1
                diff_counts[i] += 1
            total_counts[i] += 1
        
        #if (x0 == np.int64 or x0 == np.float64):
 
    #print(indexf1, rowf1)
    #print(indexf2, rowf2)

index = -1
data = []
df_columns = ['index', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct', 
              'mean_f1', 'mean_f2', 'mean_diff (f2-f1)', 'std_f1', 'std_f2', 'std_diff (f2-f1)']
for (x0, x1, x2, x3, x4, x5) in zip(cols_dtypes, cols, total_counts, diff_counts, up_counts, down_counts):
    
    index += 1
    if excludeNoChange == True:
        if x3 == 0:
            continue
    
    if (x2 == 0):
        row = [index, x1, x2, x3, -1, x4, -1, x5, -1]
    else:
        row = [index, x1, x2, x3, x3/x2*100, x4, x4/x2*100, x5, x5/x2*100]
    
    if (x0 == np.int64 or x0 == np.float64):
        mean_f1 = base_t['mean'][x1]
        mean_f2 = test_t['mean'][x1]
        row.extend((mean_f1, mean_f2, mean_f2 - mean_f1))
        
        std_f1 = base_t['std'][x1]
        std_f2 = test_t['std'][x1]
        row.extend((std_f1, std_f2, std_f2 - std_f1))
    else:
        row.extend((np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN))
    
    data.append(row)

    
print(datetime.datetime.now() - begin_time)
end = time.time()
print(f"TIME: {end - start}")
    
df = pd.DataFrame(data, columns=df_columns)
df

0:00:36.229986
TIME: 36.23674178123474


Unnamed: 0,index,field,total_cnt,diff_cnt,diff_pct,up_cnt,up_pct,down_cnt,down_pct,mean_f1,mean_f2,mean_diff (f2-f1),std_f1,std_f2,std_diff (f2-f1)
0,0,seq,500,0,0.0,0,0.0,0,0.0,1.252690e+03,1.252690e+03,0.000000e+00,7.227266e+02,7.227266e+02,0.000000e+00
1,1,accountnumber,500,0,0.0,0,0.0,0,0.0,1.252690e+03,1.252690e+03,0.000000e+00,7.227266e+02,7.227266e+02,0.000000e+00
2,2,ageoldestrecord,500,479,95.8,476,95.2,3,0.6,2.823904e+02,2.988413e+02,1.645094e+01,1.234749e+02,1.234745e+02,-4.070953e-04
3,3,agenewestrecord,500,18,3.6,16,3.2,2,0.4,4.611691e+00,5.112735e+00,5.010438e-01,2.568288e+01,2.797533e+01,2.292448e+00
4,4,recentupdate,500,3,0.6,1,0.2,2,0.4,9.298597e-01,9.278557e-01,-2.004008e-03,2.556396e-01,2.589863e-01,3.346742e-03
5,5,srcsconfirmidaddrcount,500,208,41.6,133,26.6,75,15.0,3.969940e+00,4.058116e+00,8.817635e-02,2.302674e+00,2.382717e+00,8.004267e-02
6,6,creditbureaurecord,500,0,0.0,0,0.0,0,0.0,9.579158e-01,9.579158e-01,0.000000e+00,2.009827e-01,2.009827e-01,0.000000e+00
7,7,verificationfailure,500,0,0.0,0,0.0,0,0.0,2.004008e-02,2.004008e-02,0.000000e+00,1.402780e-01,1.402780e-01,0.000000e+00
8,8,ssnnotfound,500,0,0.0,0,0.0,0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
9,9,ssnfoundother,500,1,0.2,0,0.0,1,0.2,4.273504e-03,2.132196e-03,-2.141308e-03,6.530202e-02,4.617571e-02,-1.912631e-02


In [27]:
# After much research, I found that Pandas offers great tools for analyzing data sets. Plenty of documentation for the package
# can be found online, and there seems to be a great community behind its usage.
import pandas as pd
import numpy as np
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

# Across the two files, time_ms appears to be different for all records, so I made the decision to drop this attribute;
# otherwise, all records would be considered "different" because of this attribute
base = base.drop(columns=['time_ms'])
test = test.drop(columns=['time_ms'])

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
# 0:20
# 150:200
del index_[0:20]
base = base.drop(columns=index_)
test = test.drop(columns=index_)

# reinitialize cols to updated data input size (testing purposes)
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

# num used to parse data input
num = 500
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)

excludeNoChange = False

# iterates through each row of the base and test file (assuming all rows are the same record)
for (indexf1, rowf1), (indexf2, rowf2) in zip(base.iterrows(), test.iterrows()):
    for i in range(0, len(cols)):
        if cols_dtypes[i] == np.int64 or cols_dtypes[i] == np.float64:
            if not(rowf1[i] == -1 and rowf2[i] == -1):
                total_counts[i] += 1
            
            if rowf1[i] < rowf2[i]:
                up_counts[i] += 1
                diff_counts[i] += 1
            elif rowf1[i] > rowf2[i]:
                down_counts[i] += 1
                diff_counts[i] += 1
        else:
            if not(rowf1[i] == '-1' and rowf2[i] == '-1'):
                total_counts[i] += 1
                
            if (pd.isna(rowf1[i]) or pd.isna(rowf2[i])):
                print(rowf1[i], rowf2[i])
            else:
                if (rowf1[i] != rowf2[i]):
                    diff_counts[i] += 1
                
            #if (rowf1[i] == '-1' and rowf2[i] != '-1'):
             #   up_counts[i] += 1
              #  diff_counts[i] += 1
            
            #print(cols[i], type(rowf1[i]), rowf1[i], rowf2[i], rowf1[i] == rowf2[i])
        
        #if (x0 == np.int64 or x0 == np.float64):
 
    #print(indexf1, rowf1)
    #print(indexf2, rowf2)


    
base = base.replace(-1, np.NaN)
test = test.replace(-1, np.NaN)
# base_t and test_t hold information on mean and std, applied transpose for easy access of these statistical measures
base_t = base.describe().transpose()
test_t = test.describe().transpose()

index = -1
data = []
df_columns = ['index', 'dtype', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct', 
              'mean_f1', 'mean_f2', 'mean_diff (f2-f1)', 'std_f1', 'std_f2', 'std_diff (f2-f1)', 'min_f1', 'min_f2', 
              'max_f1', 'max_f2']
for (x0, x1, x2, x3, x4, x5) in zip(cols_dtypes, cols, total_counts, diff_counts, up_counts, down_counts):
    
    index += 1
    if excludeNoChange == True:
        if x3 == 0:
            continue
    
    if (x2 == 0):
        row = [index, cols_dtypes[index], x1, x2, x3, -1, x4, -1, x5, -1]
    else:
        row = [index, cols_dtypes[index], x1, x2, x3, x3/x2*100, x4, x4/x2*100, x5, x5/x2*100]

    if (x0 == np.int64 or x0 == np.float64):
        mean_f1 = base_t['mean'][x1]
        mean_f2 = test_t['mean'][x1]
        row.extend((mean_f1, mean_f2, mean_f2 - mean_f1))
        
        std_f1 = base_t['std'][x1]
        std_f2 = test_t['std'][x1]
        row.extend((std_f1, std_f2, std_f2 - std_f1))
        
        min_f1 = base_t['min'][x1]
        min_f2 = test_t['min'][x1]
        row.extend((min_f1, min_f2))
        
        max_f1 = base_t['max'][x1]
        max_f2 = test_t['max'][x1]
        row.extend((max_f1, max_f2))
        
    else:
        row.extend((np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 'N/A', 'N/A', 'N/A', 'N/A'))
    
    data.append(row)

    
print(datetime.datetime.now() - begin_time)
end = time.time()
print(f"TIME: {end - start}")
    
df = pd.DataFrame(data, columns=df_columns)
df

0:00:01.063187
TIME: 1.0631868839263916


Unnamed: 0,index,dtype,field,total_cnt,diff_cnt,diff_pct,up_cnt,up_pct,down_cnt,down_pct,mean_f1,mean_f2,mean_diff (f2-f1),std_f1,std_f2,std_diff (f2-f1),min_f1,min_f2,max_f1,max_f2
0,0,int64,seq,500,0,0.0,0,0.0,0,0.0,1252.69,1252.69,0.0,722.72663,722.72663,0.0,5.0,5.0,2505.0,2505.0
1,1,int64,accountnumber,500,0,0.0,0,0.0,0,0.0,1252.69,1252.69,0.0,722.72663,722.72663,0.0,5.0,5.0,2505.0,2505.0
2,2,float64,ageoldestrecord,480,479,99.791667,476,99.166667,3,0.625,282.390397,298.841336,16.450939,123.474931,123.474524,-0.000407,29.0,46.0,618.0,634.0
3,3,float64,agenewestrecord,480,18,3.75,16,3.333333,2,0.416667,4.611691,5.112735,0.501044,25.682882,27.97533,2.292448,1.0,1.0,303.0,320.0
4,4,float64,recentupdate,500,3,0.6,1,0.2,2,0.4,0.92986,0.927856,-0.002004,0.25564,0.258986,0.003347,0.0,0.0,1.0,1.0
5,5,float64,srcsconfirmidaddrcount,500,208,41.6,133,26.6,75,15.0,3.96994,4.058116,0.088176,2.302674,2.382717,0.080043,0.0,0.0,12.0,12.0
6,6,float64,creditbureaurecord,500,0,0.0,0,0.0,0,0.0,0.957916,0.957916,0.0,0.200983,0.200983,0.0,0.0,0.0,1.0,1.0
7,7,float64,verificationfailure,500,0,0.0,0,0.0,0,0.0,0.02004,0.02004,0.0,0.140278,0.140278,0.0,0.0,0.0,1.0,1.0
8,8,float64,ssnnotfound,471,3,0.636943,2,0.424628,1,0.212314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,float64,ssnfoundother,471,4,0.849257,2,0.424628,2,0.424628,0.004274,0.002132,-0.002141,0.065302,0.046176,-0.019126,0.0,0.0,1.0,1.0


## Most Recent Working Version

In [51]:
# File dimensions: 500 rows × 398 columns

# pandas offers great tools for analyzing data sets, provides in-memory 2d table object called Dataframe
# plenty of documentation for the package can be found online, seems to be a great community behind its usage
import pandas as pd

# numpy package mostly used for determining data types of data values
import numpy as np

# time and datetime packages used for determining the runtime of code
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

# Across the two files, time_ms appears to be different for all records, so I made the decision to drop this attribute;
# otherwise, all records would be considered "different" because of this attribute
#base = base.drop(columns=['time_ms'])
#test = test.drop(columns=['time_ms'])

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
# 0:20
# 150:200
#del index_[0:20]
#base = base.drop(columns=index_)
#test = test.drop(columns=index_)

# reinitialize cols to updated data input size (testing purposes)
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

# num used to parse data input
num = 500
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)

# iterates through each row of the base and test file
for (indexf1, rowf1), (indexf2, rowf2) in zip(base.iterrows(), test.iterrows()):
    
    # iterates through each attribute of a record
    for i in range(0, len(cols)):
        
        # data is a numerical value
        if cols_dtypes[i] == np.int64 or cols_dtypes[i] == np.float64:
            
            # at least one record has data for this specific attribute
            if not(rowf1[i] == -1 and rowf2[i] == -1):
                total_counts[i] += 1

            # f1 < f2 => increase
            if rowf1[i] < rowf2[i]:
                up_counts[i] += 1
                diff_counts[i] += 1
                
            # f1 > f2 => decrease
            elif rowf1[i] > rowf2[i]:
                down_counts[i] += 1
                diff_counts[i] += 1
        
        # data is a string value
        else:
            
            # at least one record has data for this specific attribute
            if not(rowf1[i] == '-1' and rowf2[i] == '-1'):
                total_counts[i] += 1
            
            # special check for NaN values (not all NaN values equal each other for some reason?)
            if not(pd.isna(rowf1[i]) or pd.isna(rowf2[i])):
                if (rowf1[i] != rowf2[i]):
                    diff_counts[i] += 1

# -1 entries will have adverse effects on numerical statistical measures such as mean and standard deviation;
# replacing all -1's with NaN bypasses these unintended effects for a more accurate description of the data
base = base.replace(-1, np.NaN)
test = test.replace(-1, np.NaN)

# base_t and test_t hold information on mean and std, applied transpose for easy access of these statistical measures
base_t = base.describe().transpose()
test_t = test.describe().transpose()

index = -1
data = []
df_columns = ['index', 'dtype', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct', 
              'mean_f1', 'mean_f2', 'mean_diff (f2-f1)', 'std_f1', 'std_f2', 'std_diff (f2-f1)', 'min_f1', 'min_f2', 
              'max_f1', 'max_f2']

# option to exclude any attributes that experienced no changes across all records between the two files
excludeNoChange = False
for (x0, x1, x2, x3, x4, x5) in zip(cols_dtypes, cols, total_counts, diff_counts, up_counts, down_counts):
    
    index += 1
    if excludeNoChange == True:
        if x3 == 0:
            continue
    
    # handles special case where there is a count of 0 to avoid division by 0 error
    if (x2 == 0):
        row = [index, cols_dtypes[index], x1, x2, x3, -1, x4, -1, x5, -1]
    else:
        row = [index, cols_dtypes[index], x1, x2, x3, x3/x2*100, x4, x4/x2*100, x5, x5/x2*100]

    # numerical statistical measures for numerical data values ONLY
    if (x0 == np.int64 or x0 == np.float64):
        mean_f1 = base_t['mean'][x1]
        mean_f2 = test_t['mean'][x1]
        row.extend((mean_f1, mean_f2, mean_f2 - mean_f1))
        
        std_f1 = base_t['std'][x1]
        std_f2 = test_t['std'][x1]
        row.extend((std_f1, std_f2, std_f2 - std_f1))
        
        min_f1 = base_t['min'][x1]
        min_f2 = test_t['min'][x1]
        row.extend((min_f1, min_f2))
        
        max_f1 = base_t['max'][x1]
        max_f2 = test_t['max'][x1]
        row.extend((max_f1, max_f2))
    
    else:
        row.extend((np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 'N/A', 'N/A', 'N/A', 'N/A'))
    
    data.append(row)

    
print(datetime.datetime.now() - begin_time)
end = time.time()
print(f"TIME: {end - start}")
    
df = pd.DataFrame(data, columns=df_columns)
title = f'LI Comparison, Runtime - {(int)(end-start)}s.csv'
df.to_csv(title, index=False)

0:00:23.216041
TIME: 23.217039585113525


## Work In Progress

In [45]:
# pandas offers great tools for analyzing data sets, provides in-memory 2d table object called Dataframe
# plenty of documentation for the package can be found online, seems to be a great community behind its usage
import pandas as pd

# numpy package mostly used for determining data types of data values
import numpy as np

# time and datetime packages used for determining the runtime of code
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

# Across the two files, time_ms appears to be different for all records, so I made the decision to drop this attribute;
# otherwise, all records would be considered "different" because of this attribute
#base = base.drop(columns=['time_ms'])
#test = test.drop(columns=['time_ms'])

# cols represents the names of the attributes in the file
cols = base.columns
# index_ used to parse columns for testing purposes
index_ = cols.tolist()

# parse data for testing purposes to reduce size of data input
# 0:20
# 150:200
del index_[0:20]
base = base.drop(columns=index_)
test = test.drop(columns=index_)

# reinitialize cols to updated data input size (testing purposes)
cols = base.columns
# cols_dtypes used to determine the type of data that each attribute represents
cols_dtypes = base.dtypes

# num used to parse data input
num = 500
base = base.head(num)
test = test.head(num)

# initialize counts that will be used to determine
total_counts = [0] * len(cols)
diff_counts = [0] * len(cols)
up_counts = [0] * len(cols)
down_counts = [0] * len(cols)


def apply_comparison(colIndex, f1, f2):
    
    # total_cnt, diff_cnt, up_cnt, down_cnt
    stats = [0, 0, 0, 0]
    
    if type(f1[0]) == np.int64 or type(f1[0]) == np.float64:
        for i in range(0, len(f1)):
             # at least one record has data for this specific attribute
            if not(f1[i] == -1 and f2[i] == -1):
                stats[0] += 1

            # f1 < f2 => increase
            if f1[i] < f2[i]:
                stats[1] += 1
                stats[2] += 1
                
            # f1 > f2 => decrease
            elif f1[i] > f2[i]:
                stats[1] += 1
                stats[3] += 1
            
        ret = [colIndex, cols_dtypes[colIndex], cols[colIndex], stats[0], stats[1], stats[1]/stats[0]*100, 
               stats[2], stats[2]/stats[0] * 100, stats[3], stats[3]/stats[0]*100]
        return ret
        
print(type(cols))


df_columns = ['index', 'dtype', 'field', 'total_cnt', 'diff_cnt', 'diff_pct', 'up_cnt', 'up_pct', 'down_cnt', 'down_pct']


stats = pd.DataFrame([], columns=df_columns)
for i in range(0, len(cols)):
    stats.iloc[i] = stats.apply(apply_comparison)

print(stats)

#for i in range(0, len(cols))
#    print(base[cols[i]])


<class 'pandas.core.indexes.base.Index'>


IndexError: single positional indexer is out-of-bounds

In [48]:
# pandas offers great tools for analyzing data sets, provides in-memory 2d table object called Dataframe
# plenty of documentation for the package can be found online, seems to be a great community behind its usage
import pandas as pd

# numpy package mostly used for determining data types of data values
import numpy as np

# time and datetime packages used for determining the runtime of code
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("LI Base.csv")
test = pd.read_csv("LI Test.csv")

base

Unnamed: 0,time_ms,seq,accountnumber,ageoldestrecord,agenewestrecord,recentupdate,srcsconfirmidaddrcount,creditbureaurecord,verificationfailure,ssnnotfound,...,historydate,did,fnamepop,lnamepop,addrpop,ssnlength,dobpop,emailpop,ipaddrpop,hphnpop
0,2281,5,5,180.0,172.0,0.0,1.0,1.0,0.0,0.0,...,999999,4.634109e+10,1,1,1,0,1,0,0,0
1,1501,10,10,423.0,1.0,1.0,4.0,1.0,0.0,0.0,...,999999,2.219612e+09,1,1,1,0,0,0,0,1
2,1036,15,15,300.0,1.0,1.0,2.0,1.0,0.0,0.0,...,999999,2.071592e+09,1,1,1,0,0,0,0,0
3,1171,20,20,-1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,...,999999,0.000000e+00,1,1,1,0,1,0,0,0
4,1451,25,25,394.0,1.0,1.0,3.0,1.0,0.0,0.0,...,999999,1.836511e+09,1,1,1,0,0,0,0,0
5,1562,30,30,228.0,1.0,1.0,5.0,1.0,0.0,0.0,...,999999,1.697828e+09,1,1,1,0,1,0,0,0
6,2206,35,35,292.0,1.0,1.0,1.0,1.0,0.0,0.0,...,999999,5.696391e+08,1,1,1,0,1,0,0,0
7,1988,40,40,129.0,1.0,1.0,5.0,1.0,0.0,0.0,...,999999,7.894149e+10,1,1,1,0,1,0,0,0
8,1571,45,45,187.0,1.0,1.0,1.0,1.0,0.0,0.0,...,999999,3.775582e+10,1,1,1,0,1,0,0,0
9,2052,50,50,216.0,1.0,1.0,4.0,1.0,0.0,0.0,...,999999,8.384260e+08,1,1,1,0,0,0,0,0


In [49]:
# pandas offers great tools for analyzing data sets, provides in-memory 2d table object called Dataframe
# plenty of documentation for the package can be found online, seems to be a great community behind its usage
import pandas as pd

# numpy package mostly used for determining data types of data values
import numpy as np

# time and datetime packages used for determining the runtime of code
import time
import datetime

start = time.time()
begin_time = datetime.datetime.now()

# Reading in the two csv files that the script will apply the comparison on
base = pd.read_csv("s.csv")
test = pd.read_csv("LI Test full.csv")

base

Unnamed: 0,time_ms,seq,accountnumber,ageoldestrecord,agenewestrecord,recentupdate,srcsconfirmidaddrcount,creditbureaurecord,verificationfailure,ssnnotfound,...,historydate,did,fnamepop,lnamepop,addrpop,ssnlength,dobpop,emailpop,ipaddrpop,hphnpop
0,578,10,10,439.0,1.0,1.0,4.0,1.0,0.0,0.0,...,999999,2.219612e+09,1,1,1,0,0,0,0,1
1,1695,100,100,350.0,1.0,1.0,6.0,1.0,0.0,0.0,...,999999,2.628196e+09,1,1,1,0,0,0,0,1
2,994,1000,1000,326.0,1.0,1.0,5.0,1.0,0.0,0.0,...,999999,6.404122e+08,1,1,1,0,0,0,0,1
3,815,10000,10000,262.0,1.0,1.0,5.0,1.0,0.0,0.0,...,999999,6.049805e+08,1,1,1,0,1,0,0,0
4,538,100000,100000,485.0,1.0,1.0,8.0,1.0,0.0,0.0,...,999999,1.030990e+09,1,1,1,9,1,0,0,1
5,803,100014,100014,73.0,1.0,1.0,1.0,1.0,0.0,0.0,...,999999,1.922370e+11,1,1,1,9,1,0,0,1
6,2692,100024,100024,329.0,1.0,1.0,2.0,1.0,0.0,0.0,...,999999,1.748262e+09,1,1,1,9,1,0,0,1
7,781,10005,10005,422.0,1.0,1.0,7.0,1.0,0.0,0.0,...,999999,1.439555e+09,1,1,1,0,0,0,0,0
8,1237,100052,100052,153.0,1.0,1.0,4.0,1.0,0.0,0.0,...,999999,1.222310e+11,1,1,1,9,1,0,0,1
9,603,100059,100059,410.0,1.0,1.0,6.0,1.0,0.0,0.0,...,999999,4.100572e+08,1,1,1,9,1,0,0,1
