In [1]:
#Standart Python notebook imports

import itertools
import io
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize as sopt
import scipy.stats as sstats
import seaborn as sns
import sklearn.ensemble
import sklearn.tree
from sklearn import datasets
from tqdm.auto import tqdm
from matplotlib import animation, pyplot, rc
from sklearn.model_selection import train_test_split

In [2]:
#Custom imports
from cats import cats

In [3]:
### Input and output types are onedim nparrays


import string
characters = string.printable

def stats(x):
    return np.array([x.min(), x.max(), x.mean(), x.std()])

def to_str(seq):
    return np.array(list(map(str, seq)))

def length(seq):
    res = list(map(len, seq))
    return stats(np.array(res))

# This function takes two arguments:
#
#   seq - sequence of elements being analized
#   
#   filter_func - function that takes a charater
#     and returns either 1 or 0 
#   
#   for each element in seq filter_func is applied to 
#   each symbol and then a sum is divided by element's length
#

def filtered_character(seq, filter_func):
    res = list(map(lambda s: sum([filter_func(c) for c in s]) / len(s), seq))
    return stats(np.array(res))

def numeric(seq):
    res = filtered_character(seq, lambda c: c.isnumeric())
    return stats(np.array(res))

def alphabetic(seq):
    res = filtered_character(seq, lambda c: c.isalpha())
    return stats(np.array(res))

def uppercase(seq):
    res = filtered_character(seq, lambda c: ord(c) >= ord('A') and ord(c) <= ord('Z'))
    return stats(np.array(res))

def each_character(seq):
    res = np.array([])
    for ch in characters:
        res = np.append(filtered_character(seq, lambda c: c == ch), res)
    return res
    
def uniques(seq: np.ndarray) -> np.ndarray:
    return np.array([len(np.unique(seq)) / len(seq)])
    
def each_character_on_prefix(seq, pref_len=10):
    res = np.array([])
    for i in range(pref_len):
        for ch in characters:
            occs = 0
            for s in seq:
                if len(s) > i and s[i] == ch:
                    occs += 1
            res = np.append(res, np.array([occs / seq.shape[0]]))
    return res
        
    
def features(seq):
    detectors = [
        length,
        numeric,
        alphabetic,
        uppercase,
        each_character,
        uniques,
        each_character_on_prefix,
    ]
    
    features = np.array([])
    for detector in detectors:
        features = np.append(detector(to_str(seq)), features)
    return features
    

In [4]:

def parse_labels(batch_size=100):
    path_labels = './data/labels/'
    path_data = './data/small/'

    files = [i for i in os.listdir(path_data) if i.endswith(".csv")]

    x = []
    ys = []


    # returns list!!!
    def tags_one_hot(tags):
        res = []
        for cat in cats:
            if cat in tags: res.append(1) 
            else: res.append(0)
        return res

    for file in tqdm(files):
        with open(path_labels+file, "r") as f: text = f.read()
        labels = text.strip().split('\n')
        data = pd.read_csv(path_data+file)
        columns = data.columns
        for i, col in enumerate(columns):
            tags = labels[i].split(', ')
            batch = data[col].to_numpy()[:batch_size]
            
            fs = features(batch).tolist()
            for i in fs:
                if np.isnan(i):
                    print(fs)
                    print(batch)
                    break
            x.append(fs)
            ys.append(tags_one_hot(tags))
    
    print(f"Extraced {len(x[0])} features for each dataset column!")
    return x, ys

x, ys = parse_labels()

  0%|          | 0/84 [00:00<?, ?it/s]

Extraced 1417 features for each dataset column!


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

x_np = np.array(x)
y_np = np.array(ys)
C = len(cats)

model = []

for i in tqdm(range(C)):
    clf = RandomForestClassifier()
    clf.fit(x_np, y_np[:,i])
    model.append(clf)

  0%|          | 0/56 [00:00<?, ?it/s]

In [6]:
tests = [
    ['12/23/2004', '12/13/2005', '12/23/2002', '12/03/2004'],
    ['Bob', 'Alice', 'John', 'Donald', 'Petro'],
    ['12341234', '23452223', '34534533', '7890889'],
    ['Canada', 'USA', 'Albania', 'Ukraine', 'Japan', 'China'],
    ['1', '0', '1', '0', '1', '0'],
    ['male', 'male', 'female', 'female', 'female', 'male'],
    ['PA-7722-333', 'PH-8833-444'],
    ['Canada dfdfkdm mdf dkfkmdk dkm kdfkmd ;aowoe pwoero ewr', 'USA smfmdm df kakmd smfk', 'Albania pidor ebala sobvaakk kssfnjdnj jkdnjkfjkdnfjkdfjk ndjk jkddsf jkdsnfjk sdjkfn jksdfnjksdnfjk '],
]

for test in tests:
    print('\n'+str(test[:1])+'...')
    preds = np.zeros(len(cats))
    fs = features(np.array(test)).reshape(1,-1)
    for i in range(len(cats)):
        p = model[i].predict_proba(fs)[0]
        if p.shape[0] == 1:
            preds[i] = 0
        else: preds[i] = p[1]
    ids = preds.argsort()[-5:][::-1]
    
    for i in ids:
        print(cats[i], preds[i])
    


['12/23/2004']...
date 0.38
float 0.14
integer 0.14
id 0.11666666666666665
product-code 0.03

['Bob']...
country 0.11
diagnosis 0.04
gender 0.03
city 0.03
continent 0.03

['12341234']...
integer 0.62
id 0.27
latitude 0.2
postal-code 0.12
float 0.11

['Canada']...
country 0.37
name 0.05
diagnosis 0.03
day 0.03
continent 0.03

['1']...
boolean 0.67
integer 0.22
description 0.0
height 0.0
surname 0.0

['male']...
gender 0.33
float 0.2
boolean 0.12
country 0.06
latitude 0.05

['PA-7722-333']...
id 0.23
date 0.2
float 0.11
age 0.07
integer 0.06

['Canada dfdfkdm mdf dkfkmdk dkm kdfkmd ;aowoe pwoero ewr']...
description 0.22
country 0.16
gender 0.07
url 0.07
continent 0.05
