In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Lowercase ASCII vowels; the membership helpers lowercase their input
# before testing against these lists.
vowel_letters = list('aeiou')
# The remaining 21 lowercase ASCII letters ('y' is treated as a consonant).
consonant_letters = list('bcdfghjklmnpqrstvwxyz')
    
# Helper functions:
def num_letters(domain):
    """Count the characters of ``domain`` that are letters.

    The string is lowercased first, and a character counts as a letter when
    it appears in ``vowel_letters`` or ``consonant_letters``.
    """
    return sum(
        1
        for ch in domain.lower()
        if ch in vowel_letters or ch in consonant_letters
    )


def num_vowels(domain):
    """Count the characters of ``domain`` for which ``is_vowel`` is true."""
    return sum(1 for ch in domain.lower() if is_vowel(ch))


def num_consonant(domain):
    """Count the characters of ``domain`` for which ``is_consonant`` is true."""
    consonants = [ch for ch in domain.lower() if is_consonant(ch)]
    return len(consonants)


def is_letter(char):
    """Return True when ``char`` (case-insensitive) is a listed vowel or consonant."""
    lowered = char.lower()
    # ``in`` already yields a bool, so no explicit True/False mapping is needed.
    return lowered in vowel_letters or lowered in consonant_letters


def is_num(char):
    """Return True when ``char`` sorts between '0' and '9' inclusive.

    For a single character this means "is an ASCII digit". The comparison is
    a plain string comparison, so longer strings are compared
    lexicographically (e.g. ``'12'`` also yields True).
    """
    return '0' <= char <= '9'


def is_vowel(char):
    """Return True when the lowercased ``char`` is in ``vowel_letters``."""
    return char.lower() in vowel_letters


def is_consonant(char):
    """Return True when the lowercased ``char`` is in ``consonant_letters``."""
    return char.lower() in consonant_letters


def get_constant_consonant_list(domain):
    """Return every run of two or more consecutive consonants in ``domain``.

    The domain is lowercased first. Runs are returned in order of appearance
    and never overlap; isolated single consonants are not reported.

    Args:
        domain: The string to scan.

    Returns:
        A list of lowercase substrings, each made only of consonants
        (``bcdfghjklmnpqrstvwxyz``) and at least two characters long.
    """
    # ``{2,}`` asks the regex engine directly for runs of length >= 2, so no
    # zero-length or single-character matches have to be produced and then
    # filtered out, and no capture group is needed.
    return re.findall(r'[bcdfghjklmnpqrstvwxyz]{2,}', domain.lower())


def get_n_gram_dict(domain, n=2):
    """Count every window of ``n`` consecutive items of ``domain``.

    Each window ``domain[i:i+n]`` is joined into a single string and used as
    the key; the value is how many times that n-gram occurs. Windows overlap
    (step of one). When ``domain`` is shorter than ``n`` the result is empty.
    """
    counts = {}
    for start in range(len(domain) - n + 1):
        token = "".join(domain[start:start + n])
        counts[token] = counts.get(token, 0) + 1
    return counts


def get_freq_dict(domain):
    """Return a plain ``dict`` mapping each element of ``domain`` to its count."""
    return dict(Counter(domain))
        

In [7]:
# Feature-extraction functions:

# 1. Length of the domain
def length_of(domain):
    """Return the number of characters in ``str(domain)``.

    The argument is converted with ``str`` first, so non-string values are
    measured by their string representation.
    """
    as_text = str(domain)
    return len(as_text)


# 2. Share of vowels among all characters (vowel feature)
def vowel_letter_ratio(domain):
    """Return the number of vowels in ``domain`` divided by its length.

    Args:
        domain: The domain string to measure.

    Returns:
        A float in ``[0.0, 1.0]``. An empty domain yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    total = length_of(domain)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    # Convert before dividing so the result is a true ratio.
    return float(num_vowels(domain)) / total


# 3. Number of consecutive-consonant runs relative to the total length
#    (consonant feature)
def constant_consonant_ratio(domain):
    """Return the count of consonant runs in ``domain`` divided by its length.

    A "run" is whatever ``get_constant_consonant_list`` reports, i.e. a
    stretch of more than one consecutive consonant. The numerator is the
    number of such runs, not the number of characters they contain.

    Args:
        domain: The domain string to measure.

    Returns:
        A float. An empty domain yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = length_of(domain)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    runs = get_constant_consonant_list(domain)
    return float(len(runs)) / total


# 4. Share of digits among all characters (digit feature)
def number_ratio(domain):
    """Return the number of digit characters in ``domain`` divided by its length.

    A character is a digit when ``is_num`` accepts it.

    Args:
        domain: The domain string to measure.

    Returns:
        A float in ``[0.0, 1.0]``. An empty domain yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    total = length_of(domain)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    digit_count = sum(1 for ch in domain if is_num(ch))
    return float(digit_count) / total


# 5. Shannon entropy of the domain
def calc_entropy(domain):
    """Return the base-2 Shannon entropy of the characters of ``domain``.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    count divided by ``length_of(domain)``. With no characters the loop does
    not run and the initial ``0`` is returned.
    """
    total = length_of(domain)
    entropy = 0
    for occurrences in get_freq_dict(domain).values():
        probability = occurrences / total
        entropy -= probability * np.log2(probability)
    return entropy

# 6. Share of hexadecimal characters (0-9, a-f) among all characters
def count_hex_digit_words_ratio(domain):
    """Return the fraction of characters in ``domain`` that are hex digits.

    The domain is lowercased first, so ``A``-``F`` count as well. Only the
    ASCII characters ``0``-``9`` and ``a``-``f`` are treated as hex digits.

    Args:
        domain: The domain string to measure.

    Returns:
        A float in ``[0.0, 1.0]``. An empty domain yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    lowered = domain.lower()
    total = len(domain)
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    hex_chars = set('0123456789abcdef')
    hex_count = sum(1 for ch in lowered if ch in hex_chars)
    return float(hex_count) / total

# 7. Share of characters that occur exactly once among all distinct characters
def count_unique_letter_ratio(domain):
    """Return (#characters seen exactly once) / (#distinct characters).

    The domain is lowercased before counting, so ``'B'`` and ``'b'`` are the
    same character. Every character takes part in the count, not only
    letters.

    Args:
        domain: The domain string to measure.

    Returns:
        A float in ``[0.0, 1.0]``. An empty domain yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    freq_dict = get_freq_dict(domain.lower())
    if not freq_dict:
        # No characters at all; avoid dividing by zero.
        return 0.0
    singles = sum(1 for occurrences in freq_dict.values() if occurrences == 1)
    return float(singles) / len(freq_dict)
        
# 8. Top-level-domain check
def tld_is_com_or_cn(url):
    """Return 1 when the text after the last '.' is 'com' or 'cn', else 0.

    The comparison is case-insensitive. A value without any '.' is compared
    as a whole.
    """
    last_label = url.lower().rsplit('.', 1)[-1]
    return 1 if last_label in ('com', 'cn') else 0

    
# Plain n-gram (2-gram by default) features; pass in the X_train_std list
def psb_n_gram(domain_list, n=2):
    """Vectorize ``domain_list`` into n-gram count features.

    The input is converted to a NumPy array and fed to a ``CountVectorizer``
    restricted to n-grams of exactly ``n`` tokens. ``token_pattern=r'\\w'``
    makes every single word character its own token, so the n-grams are
    built from consecutive word characters.

    Returns the result of ``fit_transform(...).toarray()``: one row per
    input entry, one column per n-gram in the fitted vocabulary.
    """
    corpus = np.array(domain_list)
    vectorizer = CountVectorizer(
        ngram_range=(n, n),
        stop_words=None,
        decode_error='ignore',
        token_pattern=r'\w',
        min_df=1,
    )
    counts = vectorizer.fit_transform(corpus)
    return counts.toarray()

In [8]:
# Smoke test of the feature extractors
domain = 'abcbBcz'
url = domain + '.com.cn'
# One value per line: hex-digit ratio, unique-character ratio, TLD flag.
print(
    count_hex_digit_words_ratio(domain),
    count_unique_letter_ratio(domain),
    tld_is_com_or_cn(url),
    sep='\n',
)
domain_list = [
    'google',
    '80ff81a92301bf7ed276',
    'baidu',
    'gmail',
    'hao123',
    'p52c101bf709baf2dd74',
]
print(psb_n_gram(domain_list))

0.8571428571428571
0.5
1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 1 0 0 0 1 0 0 1 1 0]
 [1 0 1 0 0 1 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1
  1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 1 0 0 0]
 [1 1 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 1 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 1]]
