# Config & Library setup

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyarrow import fs
import pyarrow.parquet as pq

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

tqdm.pandas()

In [2]:
from string import capwords
import os
import json
import subprocess

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
# Environment consumed by the Hadoop CLI and by pyarrow's HadoopFileSystem;
# it has to be in place before the filesystem object below is created.
os.environ['HADOOP_CONF_DIR'] = "/etc/hadoop/conf/"
os.environ['JAVA_HOME'] = "/usr/jdk64/jdk1.8.0_112"
os.environ['HADOOP_HOME'] = "/usr/hdp/3.1.0.0-78/hadoop"
os.environ['ARROW_LIBHDFS_DIR'] = "/usr/hdp/3.1.0.0-78/usr/lib/"
# Ask the Hadoop CLI for its expanded classpath. The command output ends with a
# newline; strip it so the last classpath entry is not stored with a trailing
# "\n" (a shell `$(...)` substitution would drop it as well).
os.environ['CLASSPATH'] = subprocess.check_output(
    "$HADOOP_HOME/bin/hadoop classpath --glob", shell=True
).decode('utf-8').strip()

hdfs = fs.HadoopFileSystem(host="hdfs://hdfs-cluster.datalake.bigdata.local", port=8020)

2022-11-25 14:00:45,678 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import multiprocessing as mp
NPROCESSES=20

In [5]:
from pprint import pprint

# Public data to classify valid card_id

In [6]:
card_id = pd.read_parquet('./data/raw/card_id/Quang_id_card.parquet')
card_id.shape

(5978928, 1)

In [7]:
card_id

Unnamed: 0,id_card
0,035092001221
1,142267087
2,151536372
3,291009202
4,273587795
...,...
6812435,079199004057
6812436,091199008474
6812437,371894707
6812439,094196010727


In [8]:
new_region_code = pd.read_parquet('./data/public/card_id/new_region_code.parquet')
new_region_code

Unnamed: 0,city,code
0,Hà Nội,001
1,Hà Giang,002
2,Cao Bằng,004
3,Bắc Kạn,006
4,Tuyên Quang,008
...,...,...
58,Cần Thơ,092
59,Hậu Giang,093
60,Sóc Trăng,094
61,Bạc Liêu,095


In [9]:
unclean_old_region_code = pd.read_parquet('./data/public/card_id/unclean_old_codes.parquet')
unclean_old_region_code

Unnamed: 0,city,code
0,TP Hà Nội,01
1,Quảng Ninh,10
2,Quảng Nam,20
3,Long An,30
4,TP Hồ Chí Minh,02
...,...,...
60,Bình Dương,280-281
61,Quảng Trị,19
62,Bình Phước,285
63,Thừa Thiên-Huế,19


* Hà Tây

In [16]:
# Rows whose `code` holds the literal text "(cũ)" instead of a number are
# rewritten as the Hà Tây entry.
non_number_mask = unclean_old_region_code['code'] == "(cũ)"
# Every other code in this column is a string ('01', '090', ...) and card
# prefixes are compared against them as strings, so store '11' rather than the
# integer 11.
unclean_old_region_code.loc[
    non_number_mask,
    'code'
] = '11'
unclean_old_region_code.loc[
    non_number_mask,
    'city'
] = 'Hà Tây'

Unnamed: 0,city,code
5,Hà Tây 11,11


* Thái Nguyên, Bình Dương, Gia Lai

In [41]:
# Select every row whose city name matches one of the three cities handled in
# this section, then show those rows.
cities_mask = (
    unclean_old_region_code['city']
    .str.contains('Thái Nguyên|Bình Dương|Gia Lai')
)
unclean_old_region_code.loc[cities_mask]

Unnamed: 0,city,code
26,Gia Lai,230-231
62,Thái Nguyên,090
63,Thái Nguyên,091
64,Thái Nguyên,092
65,Bình Dương,280
66,Bình Dương,281


In [42]:
new_cities = pd.DataFrame()

In [43]:
# One row per (city, code) pair. The frame is constructed directly instead of
# assigning columns into an existing `new_cities`, so the cell gives the same
# result no matter what `new_cities` held before (assigning 7 rows into a frame
# that already has a different number of rows raises a length mismatch).
new_cities = pd.DataFrame(
    [
        ['Thái Nguyên', '090'],
        ['Thái Nguyên', '091'],
        ['Thái Nguyên', '092'],
        ['Bình Dương', '280'],
        ['Bình Dương', '281'],
        ['Gia Lai', '230'],
        ['Gia Lai', '231'],
    ],
    columns=['city', 'code'],
)

In [44]:
# Remove, in place, the rows currently selected by `cities_mask`.
unclean_old_region_code.drop(
    index=unclean_old_region_code.loc[cities_mask].index,
    inplace=True,
)

In [45]:
# Append the replacement rows and renumber the index from 0.
unclean_old_region_code = pd.concat(
    objs=[unclean_old_region_code, new_cities],
    ignore_index=True,
)
unclean_old_region_code

Unnamed: 0,city,code
0,TP Hà Nội,01
1,Quảng Ninh,10
2,Quảng Nam,20
3,Long An,30
4,TP Hồ Chí Minh,02
...,...,...
65,Thái Nguyên,092
66,Bình Dương,280
67,Bình Dương,281
68,Gia Lai,230


* Hải Dương, Hưng Yên

In [52]:
# Rows carrying the code '14', then a look at them.
cities_mask = unclean_old_region_code['code'].eq('14')
unclean_old_region_code.loc[cities_mask]

Unnamed: 0,city,code
68,Hải Dương,14
69,Hưng Yên,14
70,Hải Dương,14
71,Hưng Yên,14
72,Hải Dương,14
73,Hưng Yên,14


In [53]:
# One replacement row per city for the code-'14' entries.
new_cities = pd.DataFrame(
    {
        'city': ['Hải Dương', 'Hưng Yên'],
        'code': ['14', '14'],
    }
)
new_cities

Unnamed: 0,city,code
0,Hải Dương,14
1,Hưng Yên,14


In [54]:
# Remove, in place, the rows currently selected by `cities_mask`.
unclean_old_region_code.drop(
    index=unclean_old_region_code.loc[cities_mask].index,
    inplace=True,
)

In [55]:
# Append the replacement rows and renumber the index from 0.
unclean_old_region_code = pd.concat(
    objs=[unclean_old_region_code, new_cities],
    ignore_index=True,
)
unclean_old_region_code

Unnamed: 0,city,code
0,TP Hà Nội,01
1,Quảng Ninh,10
2,Quảng Nam,20
3,Long An,30
4,TP Hồ Chí Minh,02
...,...,...
65,Bình Dương,281
66,Gia Lai,230
67,Gia Lai,231
68,Hải Dương,14


## Unify possible code

In [59]:
unclean_old_region_code['code']

0      01
1      10
2      20
3      30
4      02
     ... 
65    281
66    230
67    231
68     14
69     14
Name: code, Length: 70, dtype: object

In [57]:
new_region_code['code']

array(['001', '002', '004', '006', '008', '010', '011', '012', '014',
       '015', '017', '019', '020', '022', '024', '025', '026', '027',
       '030', '031', '033', '034', '035', '036', '037', '038', '040',
       '042', '044', '045', '046', '048', '049', '051', '052', '054',
       '056', '058', '060', '062', '064', '066', '067', '068', '070',
       '072', '074', '075', '077', '079', '080', '082', '083', '084',
       '086', '087', '089', '091', '092', '093', '094', '095', '096'],
      dtype=object)

## Take only the possible codes

In [62]:
# Distinct codes across the new and the old region tables.
possible_codes = (
    pd.concat([new_region_code, unclean_old_region_code], ignore_index=True)
    .loc[:, 'code']
    .unique()
)

In [64]:
# Single-column frame of the distinct new-format region codes.
new_codes = new_region_code.loc[:, ['code']].drop_duplicates()
new_codes.shape

(63, 1)

In [69]:
new_codes.to_parquet('./data/public/card_id/new_codes.parquet')

In [65]:
# Single-column frame of the distinct old-format region codes.
# Cast to str *before* de-duplicating: if the column holds a mix of types
# (e.g. the integer 11 next to the string '11'), de-duplicating first keeps
# both and they only become visible duplicates once converted to text.
old_codes = unclean_old_region_code[['code']].astype(str).drop_duplicates()
old_codes.shape

(47, 1)

In [72]:
old_codes = old_codes.astype(str)

In [73]:
old_codes.to_parquet('./data/public/card_id/old_codes.parquet')

# Function to verify card_id

## Flag cards containing non-digit characters

In [100]:
from string import ascii_lowercase

In [99]:
card_id.shape

(5978928, 2)

In [101]:
card_id['id_card'] = card_id['id_card'].str.lower()

In [104]:
# Alternation pattern matching any single lower-case ASCII letter.
regex_alpha = '|'.join(ascii_lowercase)

In [105]:
regex_alpha

'a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z'

In [111]:
# True for every id that is NOT made up solely of ASCII digits.
# Searching only for the letters a-z lets ids containing spaces, punctuation or
# non-ASCII letters through as if they were numeric. `na=False` makes a missing
# id count as "not all digits" instead of producing NaN in the mask, which
# boolean indexing with `.loc` rejects.
non_full_digit_mask = ~card_id['id_card'].str.fullmatch(r'[0-9]+', na=False)

In [112]:
# Mark every row selected by `non_full_digit_mask` as invalid.
card_id.loc[non_full_digit_mask, 'is_valid'] = False

In [113]:
non_full_digit_mask.sum()

7381

## Separate cards by length

In [76]:
card_id['card_length'] = card_id['id_card'].str.len()

In [124]:
# Cards whose length is already 12 or 9 and that have not been flagged yet
# (`is_valid` still missing).
correct_card_mask = (card_id['card_length'].isin([12, 9]) & card_id['is_valid'].isna())
# Take an independent copy, as is done for `possible_card_id`: a new column is
# assigned to this frame later, and writing into a slice of `card_id` is
# ambiguous (pandas' SettingWithCopy case).
correct_card_id = card_id[correct_card_mask].copy()
print(correct_card_id.shape)
correct_card_id.head(3)

(5891246, 3)


Unnamed: 0,id_card,card_length,is_valid
0,35092001221,12,
1,142267087,9,
2,151536372,9,


In [125]:
# Not-yet-flagged cards that are one character short of the accepted lengths
# (11 or 8 characters).
possible_card_mask = card_id['is_valid'].isna() & card_id['card_length'].isin([11, 8])
possible_card_id = card_id.loc[possible_card_mask].copy()
print(possible_card_id.shape)
possible_card_id.head(3)

(889, 3)


Unnamed: 0,id_card,card_length,is_valid
79939,33197000188,11,
84042,22195002375,11,
94701,72199004702,11,


In [129]:
# Prefix each id with a single '0' (presumably restoring a dropped leading
# zero - confirm with the data owner).
# The result is derived from `id_card` and written to `clean_id_card`, so
# running the cell again produces the same column.
possible_card_id['clean_id_card'] = '0' + possible_card_id.loc[:, 'id_card']

In [131]:
possible_card_id

Unnamed: 0,id_card,card_length,is_valid,clean_id_card
79939,33197000188,11,,033197000188
84042,22195002375,11,,022195002375
94701,72199004702,11,,072199004702
110659,72099005973,11,,072099005973
116559,30197010037,11,,030197010037
...,...,...,...,...
6808577,74301001505,11,,074301001505
6808593,74092004265,11,,074092004265
6809995,68099001275,11,,068099001275
6810197,80194012761,11,,080194012761


In [126]:
invalid_length_card = ~(correct_card_mask | possible_card_mask)

In [127]:
# Rows in neither the correct-length mask nor the possible-length mask are
# marked invalid.
card_id.loc[~correct_card_mask & ~possible_card_mask, 'is_valid'] = False

In [128]:
invalid_length_card.sum()

86793

In [122]:
# Sanity check: the three masks must account for every row of card_id.
# The counts come from the masks themselves, so the check stays meaningful when
# the input data (and therefore the counts) changes.
(
    correct_card_mask.sum()
    + possible_card_mask.sum()
    + invalid_length_card.sum()
) == card_id.shape[0]

True

## Check card id by logic

In [95]:
# Values accepted by is_valid_gender() for the single-character gender code.
POSSIBLE_GENDER_NUM = ['0', '1', '2', '3']

In [94]:
# Region codes taken from new_codes['code']; is_valid_new_card() tests the
# first three characters of an id for membership in this collection.
NEW_CODE_NUM = new_codes['code'].values

In [97]:
# Region codes taken from old_codes['code']; is_valid_old_card() tests the
# first two and the first three characters of an id against this collection.
OLD_CODE_NUM = old_codes['code'].values

In [134]:
# Number of characters in an old-format id, as checked by is_old_card().
OLD_CODE_LENGTH = 9

In [135]:
# Number of characters in a new-format id, as checked by is_new_card().
NEW_CODE_LENGTH = 12

### Define logic

In [141]:
def is_old_card(card_id: str) -> bool:
    """Tell whether ``card_id`` has the old-format length.

    Args:
        card_id: The id as a string.

    Returns:
        True when the string is exactly ``OLD_CODE_LENGTH`` characters long.
    """
    n_chars = len(card_id)
    return n_chars == OLD_CODE_LENGTH

In [155]:
def is_new_card(card_id: str) -> bool:
    """Tell whether ``card_id`` has the new-format length.

    Args:
        card_id: The id as a string.

    Returns:
        True when the string is exactly ``NEW_CODE_LENGTH`` characters long.
    """
    n_chars = len(card_id)
    return n_chars == NEW_CODE_LENGTH

In [142]:
def is_valid_gender(gender_code: str) -> bool:
    """Check a gender code against the accepted values.

    Args:
        gender_code: The candidate code as a string.

    Returns:
        True when ``gender_code`` is one of ``POSSIBLE_GENDER_NUM``.
    """
    is_known_code = gender_code in POSSIBLE_GENDER_NUM
    return is_known_code

In [143]:
def is_valid_old_card(card_id: str) -> bool:
    """Validate an old-format id by its region-code prefix and the next digit.

    The id is accepted when, for a prefix length of 2 or 3, the prefix is one
    of ``OLD_CODE_NUM`` and the character right after it passes
    ``is_valid_gender``.

    Both prefix lengths are always considered: a failure on the 2-character
    prefix no longer short-circuits the check, so an id whose 2-character
    prefix happens to be a code but whose real code is the 3-character one is
    still recognised.

    NOTE(review): this keeps the existing rule that the character following
    the region code is checked as a gender digit for old-format ids - confirm
    that this matches the old numbering scheme.

    Args:
        card_id: The id as a string.

    Returns:
        True when some prefix length satisfies both conditions, else False.
    """
    for prefix_len in (2, 3):
        if card_id[:prefix_len] not in OLD_CODE_NUM:
            continue
        # Slice rather than index so an id that ends right after the prefix
        # yields '' (rejected) instead of raising IndexError.
        gender_code = card_id[prefix_len:prefix_len + 1]
        if is_valid_gender(gender_code):
            return True

    return False

In [144]:
def is_valid_new_card(card_id: str) -> bool:
    """Validate a new-format id by its region code and the following digit.

    Args:
        card_id: The id as a string.

    Returns:
        False when the first three characters are not in ``NEW_CODE_NUM``;
        otherwise the result of ``is_valid_gender`` on the fourth character.
    """
    region_code = card_id[:3]
    if region_code not in NEW_CODE_NUM:
        return False

    return is_valid_gender(card_id[3])

In [145]:
def is_valid_card(card_id: str) -> bool:
    """Dispatch an id to the old- or new-format validator based on its length.

    Args:
        card_id: The id as a string.

    Returns:
        The old-format validator's result when ``is_old_card`` accepts the id,
        the new-format validator's result when ``is_new_card`` accepts it, and
        False when neither does.
    """
    if is_old_card(card_id):
        return is_valid_old_card(card_id)

    return is_valid_new_card(card_id) if is_new_card(card_id) else False

### Check correct_length card first

In [147]:
correct_card_id.head(3)

Unnamed: 0,id_card,card_length,is_valid
0,35092001221,12,
1,142267087,9,
2,151536372,9,


In [149]:
# Run is_valid_card over every id in parallel and store the results as a new
# column; Pool.map returns them in input order.
with mp.Pool(processes=NPROCESSES) as pool:
    correct_card_id['is_card_valid'] = pool.map(
        is_valid_card,
        correct_card_id['id_card'],
    )

In [151]:
correct_card_id['is_card_valid'].value_counts()

True     4483551
False    1407695
Name: is_card_valid, dtype: int64

### Check possible_length card

In [154]:
possible_card_id.head(3)

Unnamed: 0,id_card,card_length,is_valid,clean_id_card
79939,33197000188,11,,33197000188
84042,22195002375,11,,22195002375
94701,72199004702,11,,72199004702


In [157]:
# Run is_valid_card over the zero-prefixed ids in parallel and store the
# results as a new column; Pool.map returns them in input order.
with mp.Pool(processes=NPROCESSES) as pool:
    possible_card_id['is_card_valid'] = pool.map(
        is_valid_card,
        possible_card_id['clean_id_card'],
    )

In [158]:
possible_card_id['is_card_valid'].value_counts()

False    472
True     417
Name: is_card_valid, dtype: int64

In [159]:
possible_card_id.query('is_card_valid')

Unnamed: 0,id_card,card_length,is_valid,clean_id_card,is_card_valid
79939,33197000188,11,,033197000188,True
84042,22195002375,11,,022195002375,True
94701,72199004702,11,,072199004702,True
110659,72099005973,11,,072099005973,True
116559,30197010037,11,,030197010037,True
...,...,...,...,...,...
6808569,74193001656,11,,074193001656,True
6808577,74301001505,11,,074301001505,True
6808593,74092004265,11,,074092004265,True
6809995,68099001275,11,,068099001275,True
