# 00 - Load libraries

In [1]:
# Standard library
import warnings
from collections import Counter

# Third-party
import altair as alt
import matplotlib.pyplot as plt
import polars as pl
import polars.selectors as cs
import seaborn as sns

# Display / warning configuration
pl.Config.set_tbl_rows(100)
pl.Config.set_tbl_cols(100)
warnings.filterwarnings("ignore")

# Local project module (kept last, as in the original cell order).
from path import *

In [2]:
# Locations of the four raw CSV extracts, all resolved under RAW_DATA_DIR.
TRAIN_TRANSACTION_PATH = RAW_DATA_DIR / "train_transaction.csv"
TRAIN_IDENTITY_PATH = RAW_DATA_DIR / "train_identity.csv"
TEST_TRANSACTION_PATH = RAW_DATA_DIR / "test_transaction.csv"
TEST_IDENTITY_PATH = RAW_DATA_DIR / "test_identity.csv"

# Backward-compatible alias for the original, misspelled constant name so that
# cells which still reference it keep working.
TEST_IDENITY_PATH = TEST_IDENTITY_PATH

# 01 - Load data

In [3]:
# Read the raw transaction and identity extracts for both splits.
tran_df = pl.read_csv(TRAIN_TRANSACTION_PATH)
iden_df = pl.read_csv(TRAIN_IDENTITY_PATH)
test_tran_df = pl.read_csv(TEST_TRANSACTION_PATH)
test_iden_df = pl.read_csv(TEST_IDENITY_PATH)

# Attach identity columns to every transaction; a left join keeps
# transactions that have no identity record.
df = tran_df.join(iden_df, how="left", on="TransactionID")
test_df = test_tran_df.join(test_iden_df, how="left", on="TransactionID")

# The join must not change the number of transaction rows.
assert test_df.height == test_tran_df.height
assert df.height == tran_df.height

# Report shape and estimated in-memory size of each joined frame.
for split_name, split_df in (("Train", df), ("Test", test_df)):
    size_mb = split_df.estimated_size() / (1024**2)
    print(f"{split_name} data shape: {split_df.shape} with {size_mb:.2f} MB")
del split_name, split_df, size_mb

# Drop the pre-join frames; only the joined frames are used from here on.
del tran_df, iden_df, test_tran_df, test_iden_df

Train data shape: (590540, 434) with 1863.47 MB
Test data shape: (506691, 433) with 1596.20 MB


In [4]:
# The earliest test timestamp has to be strictly later than the latest train
# timestamp, i.e. the two splits must not overlap in time.
train_last_dt = df["TransactionDT"].max()
test_first_dt = test_df["TransactionDT"].min()
assert test_first_dt > train_last_dt, "Test data should be later than train data"
print("We might trust the test data does not leak any information from the train data")

We might trust the test data does not leak any information from the train data


## 01.01 - Sanity check

In [5]:
# Quick look at the joined train frame: one line per column with its dtype
# and the first few values.
df.glimpse()

Rows: 590540
Columns: 434
$ TransactionID  <i64> 2987000, 2987001, 2987002, 2987003, 2987004, 2987005, 2987006, 2987007, 2987008, 2987009
$ isFraud        <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ TransactionDT  <i64> 86400, 86401, 86469, 86499, 86506, 86510, 86522, 86529, 86535, 86536
$ TransactionAmt <f64> 68.5, 29.0, 59.0, 50.0, 50.0, 49.0, 159.0, 422.5, 15.0, 117.0
$ ProductCD      <str> 'W', 'W', 'W', 'W', 'H', 'W', 'W', 'W', 'H', 'W'
$ card1          <i64> 13926, 2755, 4663, 18132, 4497, 5937, 12308, 12695, 2803, 17399
$ card2          <f64> null, 404.0, 490.0, 567.0, 514.0, 555.0, 360.0, 490.0, 100.0, 111.0
$ card3          <f64> 150.0, 150.0, 150.0, 150.0, 150.0, 150.0, 150.0, 150.0, 150.0, 150.0
$ card4          <str> 'discover', 'mastercard', 'visa', 'mastercard', 'mastercard', 'visa', 'visa', 'visa', 'visa', 'mastercard'
$ card5          <f64> 142.0, 102.0, 166.0, 117.0, 102.0, 226.0, 166.0, 226.0, 226.0, 224.0
$ card6          <str> 'credit', 'credit', 'debit', 'debit', 'credit'

In [6]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for
# every numeric column of the train frame.
df.select(cs.numeric()).describe()

statistic,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,V8,…,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_13,id_14,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,238269.0,37627.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,589271.0,309743.0,327662.0,421618.0,280699.0,73187.0,38917.0,74926.0,74926.0,514518.0,311253.0,64717.0,61952.0,62187.0,501427.0,311253.0,311253.0,311253.0,311253.0,311253.0,311253.0,311253.0,311253.0,…,589271.0,589271.0,589271.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,144233.0,140872.0,66324.0,66324.0,136865.0,136865.0,5155.0,5155.0,74926.0,74926.0,140978.0,127320.0,80044.0,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,8933.0,1565.0,4259.0,65706.0,65706.0,352271.0,552913.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1269.0,280797.0,262878.0,168922.0,309841.0,517353.0,551623.0,515614.0,515614.0,76022.0,279287.0,525823.0,528588.0,528353.0,89113.0,279287.0,279287.0,279287.0,279287.0,279287.0,279287.0,279287.0,279287.0,…,1269.0,1269.0,1269.0,12.0,12.0,12.0,12.0,12.0,12.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,446307.0,449668.0,524216.0,524216.0,453675.0,453675.0,585385.0,585385.0,515614.0,515614.0,449562.0,463220.0,510496.0,451171.0,545427.0,451222.0,451279.0,585381.0,585371.0,585793.0,585408.0,585377.0,512954.0
"""mean""",3282269.5,0.03499,7372300.0,135.027176,9898.734658,362.555488,153.194925,199.278897,290.733794,86.80063,118.50218,231.855423,14.092458,15.269734,0.005644,4.092185,5.571526,9.071082,2.848478,5.144574,4.48024,5.240343,10.241521,4.076227,32.539918,8.295215,94.347568,169.563231,28.343348,140.002441,42.335965,69.805717,41.63895,146.058108,0.561057,123.982137,146.621465,54.037533,17.901295,57.724444,163.744579,0.999945,1.045204,1.078075,0.846456,0.876991,1.045686,1.07287,1.027704,…,21.351473,43.319174,26.806977,109.818544,247.606741,162.153398,18.372476,42.073133,28.326584,6.220289,13.103775,9.184612,0.058494,0.85104,0.296633,0.33679,1.312844,0.775874,721.741883,1375.783644,1014.622782,9.807015,59.16455,28.530903,55.352422,151.160542,100.700882,-10.170502,174716.584708,0.060189,-0.058938,1.615585,-6.69871,13.285354,-38.600388,0.091023,-0.301124,99.745325,48.053071,-344.507146,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
"""std""",170474.358321,0.183755,4617200.0,239.162522,4901.170153,157.793246,11.336444,41.244453,101.741072,2.690623,371.872026,529.053494,133.569018,154.668899,0.150536,68.848459,25.786976,71.508467,61.727304,95.378574,16.674897,95.581443,94.336292,86.666218,129.364844,49.544262,157.660387,177.315865,62.384721,191.096774,89.000144,143.669253,99.743264,231.66384,0.31688,182.615225,186.042622,124.274558,67.614425,136.31245,202.72666,0.00739,0.240133,0.32089,0.440053,0.475902,0.239385,0.304779,0.186069,…,95.90297,173.619028,116.853222,2270.033202,3980.042828,2793.343636,332.304848,473.499307,382.053171,56.022561,106.739813,73.627893,0.304415,3.950295,1.364356,1.580144,8.769083,4.727971,6217.223583,11169.275702,7955.735482,243.861391,387.62948,274.57692,668.486833,1095.034387,814.946722,14.347949,159651.816856,0.598231,0.701015,5.249856,16.491104,11.384207,26.084899,0.983842,2.789446,1.127602,11.774858,93.695502,30.37536,1.561302,141.095343,152.160327,198.847038,6.897665,2.372447,97.461089,32.101995,3.737502
"""min""",2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-122.0,0.0,-83.0,0.0,0.0,0.0,0.0,-53.0,-83.0,0.0,-193.0,-83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,1.0,-13.0,-28.0,-72.0,-100.0,-46.0,-100.0,-36.0,-100.0,90.0,10.0,-660.0,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
"""25%""",3134635.0,0.0,3027065.0,43.321,6019.0,214.0,150.0,166.0,204.0,87.0,3.0,7.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,26.0,1.0,0.0,1.0,0.0,0.0,0.958333,0.208333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.0,67992.0,0.0,0.0,0.0,-6.0,5.0,-48.0,0.0,0.0,100.0,49.0,-360.0,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
"""50%""",3282270.0,0.0,7306535.0,68.78,9678.0,361.0,150.0,226.0,299.0,87.0,8.0,37.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,3.0,97.0,8.0,26.0,10.0,0.0,0.0,37.875,0.666666,15.0,43.0,0.0,0.0,0.0,52.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125803.0,0.0,0.0,0.0,0.0,14.0,-34.0,0.0,0.0,100.0,52.0,-300.0,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
"""75%""",3429904.0,0.0,11246605.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,24.0,206.0,3.0,3.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,12.0,2.0,122.0,276.0,27.0,253.0,32.0,40.0,17.0,187.958328,0.833333,197.0,274.0,13.0,0.0,2.0,314.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,228748.0,0.0,0.0,1.0,0.0,22.0,-23.0,0.0,0.0,100.0,52.0,-300.0,225.0,15.0,427.0,533.0,487.0,14.0,15.0,371.0,169.0,32.0
"""max""",3577539.0,1.0,15811131.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,10286.0,11623.0,4685.0,5691.0,26.0,2253.0,349.0,2253.0,2255.0,3331.0,210.0,3257.0,3188.0,3188.0,2918.0,1429.0,640.0,640.0,819.0,869.0,819.0,873.0,843.0,1707.791626,0.958333,876.0,670.0,648.0,847.0,878.0,879.0,1.0,8.0,9.0,6.0,6.0,9.0,9.0,8.0,…,4817.470215,7519.870117,4817.470215,93736.0,134021.0,98476.0,104060.0,104060.0,104060.0,880.0,1411.0,976.0,12.0,44.0,18.0,15.0,99.0,55.0,160000.0,160000.0,160000.0,55125.0,55125.0,55125.0,104060.0,104060.0,104060.0,0.0,999595.0,10.0,0.0,52.0,0.0,61.0,0.0,25.0,0.0,100.0,64.0,720.0,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [7]:
# Same numeric summary for the test frame, to compare against train.
# NOTE(review): in the rendered output the test identity columns are named
# "id-01", "id-02", ... (hyphen) while train uses "id_01", "id_02", ...
# (underscore) — the schemas need aligning before the frames are used together.
test_df.select(cs.numeric()).describe()

statistic,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,…,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,id-11,id-13,id-14,id-17,id-18,id-19,id-20,id-21,id-22,id-24,id-25,id-26,id-32
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",506691.0,506691.0,506691.0,506691.0,498037.0,503689.0,502144.0,441082.0,441082.0,215474.0,36436.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,501943.0,506688.0,500660.0,271922.0,303549.0,429840.0,282316.0,124783.0,60133.0,74338.0,74338.0,494146.0,330173.0,69254.0,123384.0,115194.0,494622.0,330173.0,330173.0,330173.0,330173.0,330173.0,330173.0,330173.0,330173.0,330173.0,…,500660.0,500660.0,500660.0,506688.0,506688.0,506688.0,506688.0,506688.0,506688.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,76431.0,141907.0,136976.0,66481.0,66481.0,134750.0,134750.0,5059.0,5059.0,74338.0,74338.0,136778.0,130286.0,71357.0,135966.0,50875.0,135906.0,135633.0,5059.0,5062.0,4740.0,5039.0,5047.0,70671.0
"""null_count""",0.0,0.0,0.0,0.0,8654.0,3002.0,4547.0,65609.0,65609.0,291217.0,470255.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4748.0,3.0,6031.0,234769.0,203142.0,76851.0,224375.0,381908.0,446558.0,432353.0,432353.0,12545.0,176518.0,437437.0,383307.0,391497.0,12069.0,176518.0,176518.0,176518.0,176518.0,176518.0,176518.0,176518.0,176518.0,176518.0,…,6031.0,6031.0,6031.0,3.0,3.0,3.0,3.0,3.0,3.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,430260.0,364784.0,369715.0,440210.0,440210.0,371941.0,371941.0,501632.0,501632.0,432353.0,432353.0,369913.0,376405.0,435334.0,370725.0,455816.0,370785.0,371058.0,501632.0,501629.0,501951.0,501652.0,501644.0,436020.0
"""mean""",3916894.0,26930000.0,134.725568,9957.222175,363.735379,153.543409,200.162975,291.846514,86.723412,87.06527,237.175047,10.093211,10.714635,0.027403,2.385875,4.962701,6.854992,1.678173,1.893994,4.611943,1.810149,7.484594,2.649486,27.816035,6.08389,108.207504,188.666621,33.394727,175.060799,50.977752,82.443145,61.815642,160.834483,0.553981,159.810552,218.414895,77.404179,18.225961,58.163186,206.854137,0.999997,1.046636,1.087436,0.850563,0.884697,1.04594,1.079207,1.022779,1.039876,…,21.238136,43.269524,26.730428,42.077887,83.808337,56.090776,27.620156,46.835629,36.772511,0.436681,1.216182,0.693514,0.047926,0.482658,0.163271,0.220486,0.460572,0.3099,315.128613,440.464333,381.102129,24.70803,58.339468,35.485643,99.05876,155.578153,139.814706,-11.325734,192658.729909,0.053008,-0.087454,1.246033,-6.803829,12.49318,-36.577782,0.076219,-0.245877,99.750876,36.905715,-344.482672,191.070341,14.795735,350.122982,408.88623,507.727021,15.336823,13.166667,332.043064,152.752923,26.217939
"""std""",146269.23696,4756500.0,245.779822,4884.960969,158.688653,12.443013,40.562461,102.06273,2.987328,314.131694,556.450834,79.334782,88.896921,0.227753,41.942977,25.462195,46.783565,37.874827,26.766115,21.28264,23.692393,61.322505,53.97306,101.114033,28.315616,176.964526,197.099611,82.55821,250.41747,116.829786,192.867671,150.299612,257.003738,0.317327,240.566557,253.091693,176.218548,78.07989,164.015525,269.419196,0.00174,0.250847,0.359033,0.427938,0.483297,0.244112,0.330086,0.174533,0.226182,…,94.662615,195.591901,115.676311,2249.338097,2919.418027,2263.811948,999.900445,1514.927745,1486.814768,2.129226,5.925144,3.343788,0.282497,2.555532,0.850853,1.789616,4.228813,2.632181,9494.045099,10648.4296,10395.559802,842.913287,990.361078,904.093921,2436.074522,4544.303974,4493.728267,14.50852,182613.277215,0.684551,0.840351,5.071394,15.921457,11.678206,25.544185,1.009687,2.120525,1.119819,13.206713,93.633156,30.749535,2.318496,139.140824,158.971756,227.371061,5.618032,3.22244,86.356683,31.916995,3.601046
"""min""",3663549.0,18403224.0,0.018,1001.0,100.0,100.0,100.0,100.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,2.0,-12.0,-19.0,-81.0,-100.0,-41.0,-100.0,-32.0,-100.0,90.0,11.0,-720.0,100.0,11.0,100.0,100.0,100.0,11.0,10.0,100.0,100.0,8.0
"""25%""",3790222.0,22771551.0,40.0,6019.0,207.0,150.0,166.0,204.0,87.0,3.0,7.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,1.083333,0.208333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.0,63340.0,0.0,0.0,0.0,-6.0,3.0,-46.0,0.0,0.0,100.0,27.0,-360.0,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,137.0,24.0
"""50%""",3916894.0,27204658.0,67.95,9803.0,369.0,150.0,226.0,299.0,87.0,8.0,44.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,5.0,112.0,7.0,21.0,8.0,0.0,0.0,37.708332,0.666666,10.0,102.0,0.0,0.0,0.0,48.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,133190.0,0.0,0.0,0.0,0.0,12.0,-33.0,0.0,0.0,100.0,27.0,-300.0,166.0,15.0,321.0,484.0,576.0,14.0,11.0,321.0,147.0,24.0
"""75%""",4043567.0,31348566.0,125.0,14276.0,512.0,150.0,226.0,330.0,87.0,20.0,196.0,3.0,3.0,0.0,1.0,1.0,2.0,0.0,1.0,2.0,1.0,2.0,1.0,13.0,2.0,148.0,305.0,28.0,290.0,34.0,13.0,18.0,221.125,0.791666,250.0,401.0,25.0,0.0,0.0,370.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,265717.0,0.0,0.0,1.0,0.0,21.0,-23.0,0.0,0.0,100.0,52.0,-300.0,225.0,15.0,427.0,549.0,711.0,14.0,15.0,355.0,182.0,32.0
"""max""",4170239.0,34214345.0,10270.0,18397.0,600.0,232.0,237.0,540.0,102.0,8081.0,9213.0,2950.0,3275.0,31.0,1601.0,376.0,1601.0,1621.0,1005.0,572.0,881.0,2234.0,2234.0,1562.0,797.0,641.0,641.0,1076.0,1091.0,1088.0,1091.0,1088.0,2029.583374,0.958333,1091.0,883.0,879.0,1066.0,1085.0,1091.0,1.0,11.0,11.0,10.0,10.0,13.0,13.0,11.0,11.0,…,4727.959961,7539.75,4727.959961,718740.0,958320.0,718740.0,453750.0,605000.0,605000.0,86.0,128.0,108.0,8.0,44.0,31.0,85.0,125.0,106.0,1040657.5,1040657.5,1040657.5,64800.0,64800.0,64800.0,375000.0,612500.0,612500.0,0.0,999869.0,11.0,0.0,52.0,0.0,59.0,0.0,16.0,0.0,100.0,63.0,720.0,228.0,29.0,670.0,660.0,854.0,44.0,26.0,549.0,216.0,48.0


In [8]:
# Tally how many train columns there are of each dtype.
Counter(df.dtypes)

Counter({Float64: 399, String: 31, Int64: 4})

In [9]:
def _downcast(frame: pl.DataFrame) -> pl.DataFrame:
    """Return ``frame`` with float columns cast to Float32 and integer columns to Int32.

    Non-numeric columns are left untouched.
    """
    return frame.with_columns(
        cs.float().cast(pl.Float32),
        cs.integer().cast(pl.Int32),
    )


# Apply the same narrowing to both splits so their numeric dtypes stay aligned.
df = _downcast(df)
test_df = _downcast(test_df)

# Label each line with its split; previously both lines printed the same text
# and could not be told apart.
print(f"After downcasting, train data size: {df.estimated_size() / (1024**2):.2f} MB")
print(f"After downcasting, test data size: {test_df.estimated_size() / (1024**2):.2f} MB")

# Persist the narrowed frames as parquet under INTERIM_DATA_DIR.
df.write_parquet(INTERIM_DATA_DIR / "train.parquet")
test_df.write_parquet(INTERIM_DATA_DIR / "test.parquet")

After downcasting, data size: 961.60 MB
After downcasting, data size: 823.07 MB


## 01.02 - Look for basic information

In [10]:
# Count the rows flagged by is_duplicated() and report the total.
duplicate_count = df.is_duplicated().sum()
print(f"Number of duplicated records: {duplicate_count}")

Number of duplicated records: 0


In [11]:
# Missing-value percentage (0-100) per column, reshaped to long format:
# one row per source column, with the column name in "column" and its
# percentage in "missing_pct".
#
# The numeric values must be named via `value_name`; `variable_name` only
# names the column that holds the source column names. Scaling by 100 makes
# the values match the "(%)" axis title.
df_stats = (
    df.select(cs.all().null_count() / pl.len() * 100)
    .unpivot(variable_name="column", value_name="missing_pct")
)

# Histogram: how many columns fall into each missing-percentage bin.
alt.Chart(df_stats).mark_bar().encode(
    x=alt.X("missing_pct:Q", bin=alt.Bin(maxbins=30), title="Missing Percentage (%)"),
    y=alt.Y("count()", title="Number of columns"),
    tooltip=["count()"],
).properties(
    width="container",
    title="Distribution of Missing Value",
).show()

In [12]:
# Partition the column names into string ("object") and numeric groups.
obj_cols = df.select(cs.string()).columns
num_cols = df.select(cs.numeric()).columns

# Every column must land in exactly one of the two groups.
n_obj, n_num = len(obj_cols), len(num_cols)
assert n_obj + n_num == df.width

print(f"Object columns (total={n_obj}): {obj_cols}")
print(f"Numerical columns (total={n_num}): {num_cols}")

Object columns (total=31): ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
Numerical columns (total=403): ['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50

## 01.03 - Look for datetime feature

In [13]:
# Summary statistics of the raw TransactionDT offsets in the train frame.
df.get_column("TransactionDT").describe()

statistic,value
str,f64
"""count""",590540.0
"""null_count""",0.0
"""mean""",7372300.0
"""std""",4617200.0
"""min""",86400.0
"""25%""",3027065.0
"""50%""",7306535.0
"""75%""",11246605.0
"""max""",15811131.0


In [14]:
print("Datetime Interval Asumption")
maxDT = df["TransactionDT"].max()
minDT = df["TransactionDT"].min()

# Express the observed TransactionDT range in years under each candidate
# meaning of "one increment", using a 365-day year.
dt_span = maxDT - minDT
increments_per_year = {
    "day": 365,
    "hour": 365 * 24,
    "min": 365 * 24 * 60,
    "sec": 365 * 24 * 60 * 60,
}
for unit, per_year in increments_per_year.items():
    print(f"1 inc = 1 {unit}: {dt_span / per_year:,.3f} years")

Datetime Interval Asumption
1 inc = 1 day: 43,081.455 years
1 inc = 1 hour: 1,795.061 years
1 inc = 1 min: 29.918 years
1 inc = 1 sec: 0.499 years
