In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import math
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import ahocorasick

In [2]:
max_seq_length = 61

vocab_set = pd.read_csv("input/ngram_freq.csv")
token_table = pd.read_csv('input/token_lookup.csv')
token_table = token_table[token_table['Token'].str.len() > 1]
vocab_df = vocab_set.copy()
vocab_df = vocab_df[vocab_df['word'].str.len() > 1 ] 

In [3]:
def classify_words_by_quantiles(df):
    actual_words = df.copy()
    q_25, q_50, q_75, q_90 = actual_words['count'].quantile([0.25, 0.5, 0.75, 0.90])
    bins = [-float('inf'), q_25, q_50, q_75, q_90, float('inf')]
    labels = ['very_low', 'low', 'medium', 'high', 'very_high']
    actual_words['Occurance'] = pd.cut(actual_words['count'], bins=bins, labels=labels)
    new_vocab = actual_words[['word', 'count', 'Occurance']].copy()

    return new_vocab


In [4]:
def calculate_entropy(password):
    if not password:
        return 0
    freq = {}
    for char in password:
        freq[char] = freq.get(char, 0) + 1
    entropy = 0.0
    for count in freq.values():
        p = count / len(password)
        entropy -= p * math.log2(p)
    return entropy


In [6]:
new_vocab_df = classify_words_by_quantiles(vocab_df)
new_vocab_df.drop(columns = 'count' , inplace =  True)
# new_vocab_df.to_csv('input/vocab_tier.csv')

vocab_tiers = dict(zip(new_vocab_df['word'], new_vocab_df['Occurance']))
tier_priority = {'very_low': 1, 'low': 2, 'medium': 3 , 'high':4 , 'very_high':5}  # Higher number = higher priority
automaton = ahocorasick.Automaton()

for word, tier in vocab_tiers.items():
    automaton.add_word(word.lower(), (word.lower(), tier))
automaton.make_automaton()

In [7]:
def check_password_debug(password):
    text = str(password).lower()
    matched_words = set()   
    highest_tier = "none"   
    highest_priority = 0

    for end_index, (word, tier) in automaton.iter(text):
        matched_words.add(word)
        current_priority = tier_priority.get(tier, 0)
        if current_priority > highest_priority:
            highest_priority = current_priority
            highest_tier = tier

    return  highest_tier

In [8]:
model_path = 'input/model.h5'  # Adjust if your .h5 file is elsewhere
model = load_model(model_path)

from tokenizers import ByteLevelBPETokenizer

vocab_path = "input/vocab.json"
merges_path = "input/merges.txt"    

my_tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)




In [9]:
def feature_extract(password: str):
    features = {}
    password = str(password)

    # If empty
    if len(password) == 0:
        features['num_upper'] = features['num_lower'] = features['num_digits'] = features['num_special'] = 0
        features['upper_ratio'] = features['lower_ratio'] = features['digit_ratio'] = features['special_ratio'] = 0
    else:
        features['length']= len(password)
        features['uppercase']= sum(1 for char in password if char.isupper())
        features['lowercase']= sum(1 for char in password if char.islower())
        features['digits']= sum(1 for char in password if char.isdigit())
        features['special_chars']= sum(1 for char in password if not char.isalnum())
        features['vocab_tier'] = check_password_debug(password)
        features['num_upper'] = sum(1 for c in password if c.isupper())
        features['num_lower'] = sum(1 for c in password if c.islower())
        features['num_digits'] = sum(1 for c in password if c.isdigit())
        features['num_special'] = len(password) - (
        features['num_upper'] + features['num_lower'] + features['num_digits']
        )

        features['upper_ratio'] = features['num_upper'] / len(password)
        features['lower_ratio'] = features['num_lower'] / len(password)
        features['digit_ratio'] = features['num_digits'] / len(password)
        features['special_ratio'] = features['num_special'] / len(password)
        features['entropy'] = calculate_entropy(password)

    return [
        features['length'],
        features['uppercase'],
        features['lowercase'],
        features['digits'],
        features['special_chars'],
        features['vocab_tier'],
        features['num_upper'],
        features['num_lower'],
        features['num_digits'],
        features['num_special'],
        features['upper_ratio'],
        features['lower_ratio'],
        features['digit_ratio'],
        features['special_ratio'],
        features['entropy']
    ]

In [10]:
def process_sequences(sequences, max_seq_length, padding_value=0):
    # Use pad_sequences to pad or truncate the sequences
    processed_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_seq_length,
        padding='post',  # You can change to 'pre' if needed
        truncating='post',  # 'post' truncates from the end
        value=padding_value
    )
    
    return processed_sequences

In [11]:
def predict_password_strength(password_seq , features_seq, model , max_seq_length=61):
    seq_padded = password_seq
    eng_features =features_seq
    preds = model.predict({'input_seq': seq_padded, 'input_eng': eng_features}, verbose=0)
    predicted_class = np.argmax(preds, axis=1)[0]

    # d) Map numeric class -> label
    strength_labels = {0: 'Weak', 1: 'Medium', 2: 'Strong'}
    return strength_labels.get(predicted_class, 'Unknown')


In [12]:
# token_table.drop(columns = 'Unnamed: 0' , inplace = True)

# token_to_index = token_table.to_dict()

# nested_dict = token_to_index['token']
# flat_token_to_index = {token: index for index, token in nested_dict.items()}
token_table = pd.read_csv("input/token_lookup.csv")
token_table = token_table[token_table['Token'].str.len() > 1]
token_to_index = token_table.set_index('Token')['Index'].to_dict()  # or ['Value'] depending on your CSV
flat_token_to_index = token_to_index

def tokens_to_indices(token_list, token_to_index):
    # For each token, get its index (if not found, default to 0)
    return [token_to_index.get(token, 0) for token in token_list]

sample_text = '95vjo5jvi3ivnh!T'
testing = feature_extract(sample_text)

mapping = {'very_high':4 , 'high':3 , 'medium':2 , 'none':1}
testing = [mapping[item] if item in mapping else item for item in testing]

password = str(sample_text)
seq_tokens = (my_tokenizer.encode(password)).tokens
indices = tokens_to_indices(seq_tokens, flat_token_to_index)
padded_seq = process_sequences([indices], max_seq_length)
padded_seq 

padded_seq = np.array(padded_seq)
testing = np.array(testing).reshape(1,-1)

predict_password_strength(padded_seq , testing , model)

ValueError: Input 1 of layer "functional" is incompatible with the layer: expected shape=(None, 10), found shape=(1, 15)

In [13]:
def password_to_score(password_class:str):
    tier = password_class
    maps = {'Weak': 0 , 'Medium':5 , 'Strong':10}
    return maps[password_class]

In [31]:
password_to_score(predict_password_strength(padded_seq , testing , model))

InvalidArgumentError: Graph execution error:

Detected at node functional_1_1/embedding_1_1/GatherV2 defined at (most recent call last):
  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 601, in run_forever

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1905, in _run_once

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes

  File "C:\Users\Prathamesh Kale\AppData\Roaming\Python\Python39\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code

  File "C:\Users\Prathamesh Kale\AppData\Local\Temp\ipykernel_19128\3151768651.py", line 1, in <module>

  File "C:\Users\Prathamesh Kale\AppData\Local\Temp\ipykernel_19128\2671955950.py", line 4, in predict_password_strength

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 562, in predict

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 259, in one_step_on_data_distributed

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 249, in one_step_on_data

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 104, in predict_step

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\layers\layer.py", line 908, in __call__

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\models\functional.py", line 182, in call

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\models\functional.py", line 637, in call

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\layers\layer.py", line 908, in __call__

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\ops\operation.py", line 46, in __call__

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\ops\numpy.py", line 5346, in take

  File "c:\Users\Prathamesh Kale\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 2093, in take

indices[0,0] = 30756 is not in [0, 30000)
	 [[{{node functional_1_1/embedding_1_1/GatherV2}}]] [Op:__inference_one_step_on_data_distributed_922]

In [14]:
def rules1(features):
    flag = True
    features_list = features.tolist()
    features_list = [item for sublist in features_list for item in sublist]
    if features_list[0] < 8 : 
        return [not flag , 'Password should have minimum length of 8 ']
    for i in range(1,5):
        if features_list[i] < 1 : 
            return [not flag , 'Password should contain 1 characters form all 4 letters group']
    else :
        return flag

In [15]:
def rules2(password: str) -> bool:
    """
    Rule 6: Prohibit Sequential Characters
    Returns True if the password does NOT contain sequences like 'abc', '123'.
    (Basic check: look for ascending sequences of length 3.)
    """
    # Simple approach: check each triplet
    for i in range(len(password) - 2):
        c1, c2, c3 = password[i], password[i+1], password[i+2]
        # Check ascending sequence in ASCII
        if ord(c2) == ord(c1) + 1 and ord(c3) == ord(c2) + 1:
            return True
    return False

In [16]:
def rules3(password: str, max_repeats: int = 2) -> bool:
    """
    Rule 7: Prohibit Repetitive Characters
    Returns True if no character repeats more than 'max_repeats' times in a row.
    """
    count = 1
    for i in range(1, len(password)):
        if password[i] == password[i-1]:
            count += 1
            if count > max_repeats:
                return True
        else:
            count = 1
    return False

In [31]:
rules3('aaa')

True

In [17]:
common = vocab_set[:10000]
common = common[common['word'].str.len() > 1]
common_eng_vocab = common.to_dict()
nested_vocab = common_eng_vocab['word']
flat_vocab_dict = {token: index for index, token in nested_vocab.items()}
import json
my_dict = flat_vocab_dict
json_output = json.dumps(my_dict, indent=4)  # indent for pretty printing
# print(json_output)

In [18]:
with open('top10000common.json', 'w') as json_file:
    json.dump(my_dict, json_file, indent=4)
# print(json_output)

In [19]:
new_vocab_df.to_csv('vocab_tier.csv', index=False)

In [20]:
with open("input/top10000common.json" , "r",encoding = "utf-8")as file:
    common_words = json.load(file)

In [21]:
def rules4(password: str, dictionary_list=None) -> bool:
    if dictionary_list is None:
        dictionary_list = common_words
    lower_pass = password.lower()
    for word in dictionary_list:
        if word in lower_pass:
            return [True , word]
    return False

In [22]:
def rules5(password: str, dictionary_list=None) -> bool:
    if dictionary_list is None:
        dictionary_list = common_words
    # Very simple substitution map
    subs = str.maketrans({
    "@": "a",
    "0": "o",
    "1": "l",
    "3": "e",
    "$": "s",
    "4": "a",
    "5": "s",
    "7": "t",
    "8": "b",
    "9": "g",
    "|": "l",
    "!": "i",
    "(": "c",
    ")": "d",
    "{": "c",
    "}": "c",
    "[": "c",
    "]": "c",
    "+": "t",
    "²": "2",
    "6": "b",
    "&": "and",
    "¥": "y",
    "€": "e",
    "#": "h",
    "%": "x",
    "^": "v",
    "<": "c",
    ">": "r",
    "÷": "/",
    "×": "x"
})
    normalized = password.lower().translate(subs)
    for word in dictionary_list:
        if word in normalized:
            return True
    return False

In [23]:
common_password = pd.read_csv('input/rockyou.txt' ,encoding='latin-1',  on_bad_lines = 'skip')
common_password[:10000].to_json('input/top10000password.json')
# common_password[:10000]

In [24]:
common_password_df = pd.read_json('input/top10000password.json')
common_password_list = common_password_df.values.tolist()
common_password_list = common_password_df[123456].values.tolist()

In [25]:
def rules6(password: str, common_passwords=None) -> bool:
    if common_passwords is None:
        common_passwords = common_password_list
    return password.lower() in common_passwords

In [26]:
def rules7(password: str, personal_data=None) -> bool:
    if personal_data is None:
        personal_data = {"john", "doe", "2023", "1990"}  # Example placeholders
    lower_pass = password.lower()
    for info in personal_data:
        if info in lower_pass:
            return True
    return False

In [27]:
def rules8(password: str, username: str = "") -> bool:
    return username.lower() in password.lower()

In [28]:
KEYBOARD_LAYOUT = [
    # Row 0
    ['`','1','2','3','4','5','6','7','8','9','0','-','='],
    # Row 1
    ['Q','W','E','R','T','Y','U','I','O','P','[',']','\\'],
    # Row 2
    ['A','S','D','F','G','H','J','K','L',';','\''],
    # Row 3
    ['Z','X','C','V','B','N','M',',','.','/']
]

In [29]:
def build_key_coords(layout=KEYBOARD_LAYOUT):
    key_coords = {}
    for r, row_keys in enumerate(layout):
        for c, key in enumerate(row_keys):
            key_coords[key.upper()] = (r, c)   
            key_coords[key.lower()] = (r, c)    
    return key_coords


In [30]:
KEY_COORDS = build_key_coords()


In [31]:
def build_adjacency_map(key_coords):
    adjacency_map = {}
    all_keys = list(key_coords.keys())
    
    for k in all_keys:
        (r1, c1) = key_coords[k]
        neighbors = set()
        for k2 in all_keys:
            if k2 == k:
                continue
            (r2, c2) = key_coords[k2]
            dist = math.dist((r1, c1), (r2, c2))
            if dist < math.sqrt(2):
                neighbors.add(k2)
        adjacency_map[k] = neighbors
    
    return adjacency_map

In [32]:
ADJACENCY_MAP = build_adjacency_map(KEY_COORDS)

In [33]:
def slope_between(k1, k2, key_coords):
    (r1, c1) = key_coords[k1]
    (r2, c2) = key_coords[k2]
    dr = r2 - r1
    dc = c2 - c1
    if dr == 0 and dc == 0:
        return (0, 0)
    # reduce to gcd
    g = math.gcd(dr, dc)
    dr //= g
    dc //= g
    return (dr, dc)

In [34]:
def is_single_line_parallel(password, key_coords):
    if len(password) < 2:
        return True 
    first, second = password[0], password[1]
    if first not in key_coords or second not in key_coords:
        return False    
    base_slope = slope_between(first, second, key_coords)
    for i in range(len(password) - 1):
        c1, c2 = password[i], password[i+1]
        if c1 not in key_coords or c2 not in key_coords:
            return False
        if slope_between(c1, c2, key_coords) != base_slope:
            return False
    return True

In [35]:
def is_two_line_parallel(password, key_coords):
    L = len(password)
    if L % 2 != 0:
        return False  
    
    mid = L // 2
    line1 = password[:mid]
    line2 = password[mid:]
    

    if not is_single_line_parallel(line1, key_coords):
        return False
    if not is_single_line_parallel(line2, key_coords):
        return False
    

    if len(line1) < 2:

        return True
    
    slope1 = slope_between(line1[0], line1[1], key_coords)
    slope2 = slope_between(line2[0], line2[1], key_coords)
    return (slope1 == slope2)

In [36]:
def rules9(password: str) -> bool:
    password = password.strip()
    if len(password) < 2:
        return False  # trivial short password won't be considered an AP pattern
    
    # 1) Check adjacency
    #    if every consecutive pair is in adjacency map
    all_adj = True
    for i in range(len(password) - 1):
        c1, c2 = password[i], password[i+1]
        if (c1 not in ADJACENCY_MAP) or (c2 not in ADJACENCY_MAP[c1]):
            all_adj = False
            break
    if all_adj:
        return True
    
    # 2) Check single-line parallel
    if is_single_line_parallel(password, KEY_COORDS):
        return True
    if is_two_line_parallel(password, KEY_COORDS):
        return True
    
    return False

In [37]:
with open("input/data.json", "r") as file:
    keyboard_pattern_list = json.load(file)

In [38]:
def has_common_substring_of_length_n_or_more(s1: str, s2: str, n: int = 4) -> bool:
    """
    Returns True if s1 and s2 share any substring of length >= n.
    Otherwise False.
    
    Example:
      - s1 = "tgyh"
      - s2 = "tgyhuj"
      => They share "tgyh" (length 4) => returns True
      - s1 = "tgy"
      - s2 = "tgyhuj"
      => Longest common substring is "tgy" (length 3) => returns False
    """
    s1, s2 = s1.lower(), s2.lower()
    len1, len2 = len(s1), len(s2)
    
    # If either string is shorter than n, they can't have a substring of length >= n in common
    if len1 < n or len2 < n:
        return False

    # Naive approach: check all substring lengths from n up to min(len1, len2).
    # Return True as soon as we find a match.
    max_possible = min(len1, len2)
    for length in range(n, max_possible + 1):
        # Check every substring of s1 with this length
        for start in range(len1 - length + 1):
            sub = s1[start:start + length]
            if sub in s2:
                return True
    return False

In [39]:
def rules10(password: str, keyboard_patterns=None, min_length=4) -> bool:
    """
    Rule 10 (Enhanced):
    Returns True if 'password' shares a common substring of length >= min_length
    with any entry in 'keyboard_patterns'.
    
    Example:
      - keyboard_patterns = ["tgyhuj", "qwerty", ...]
      - password = "tgyh" => shares "tgyh" with "tgyhuj" => True
      - password = "tgy"  => shares "tgy" (length 3) => not >= 4 => False
    """
    if keyboard_patterns is None:
        keyboard_patterns = ["qwerty", "asdf", "zxcv", "tgyhuj"]

    for pattern in keyboard_patterns:
        if has_common_substring_of_length_n_or_more(password, pattern, n=min_length):
            return True
    return False

In [40]:
def rules11(password: str, dictionary_list=None) -> bool:
    """
    Rule 14: Reversed Dictionary Words
    Returns True if password does NOT contain reversed dictionary words.
    """
    if dictionary_list is None:
        dictionary_list = common_words
    lower_pass = password.lower()
    for word in dictionary_list:
        if word[::-1] in lower_pass:
            return True
    return False

In [41]:
def rules12(password: str) -> bool:
    """
    Rule 15: Year/Date Patterns
    Returns True if password does NOT contain typical 4-digit year patterns (e.g., 1990-2025).
    (Simple placeholder check.)
    """
    for year in range(1900, 2030):
        if str(year) in password:
            return True
    return False

In [42]:
def rules13(password: str) -> bool:

    half_len = len(password) // 2
    # Check if the first half is repeated in the second half
    if len(password) % 2 == 0:  # even length
        if password[:half_len] == password[half_len:]:
            return True
    return False

In [43]:
def rules14(password: str, entropy_threshold: float = 3.0) -> bool:
    """
    Rule 23: Entropy Estimation
    Returns True if estimated Shannon entropy >= entropy_threshold.
    (Simple placeholder calculation.)
    """
    if not password:
        return False
    freq = {}
    for char in password:
        freq[char] = freq.get(char, 0) + 1
    entropy = 0.0
    length = len(password)
    for count in freq.values():
        p = count / length
        entropy -= p * math.log2(p)
    return entropy <= entropy_threshold

In [44]:
# Load token_lookup.csv
import pandas as pd

token_table = pd.read_csv("input/token_lookup.csv")

# Clean and flatten the structure (assuming columns are ['Index', 'Key'])
if 'Unnamed: 0' in token_table.columns:
    token_table.drop(columns=['Unnamed: 0'], inplace=True)

# Create flat_token_to_index dictionary
flat_token_to_index = dict(zip(token_table['Token'], token_table['Index']))


In [96]:
# Final Script to Evaluate Password Strength Using ML Model and All Rules
import numpy as np
import json
import math
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# === Utility Mappings ===
mapping = {'very_high': 4, 'high': 3, 'medium': 2, 'low': 1, 'very_low': 0, 'none': 1}
strength_labels = {0: 'Weak', 1: 'Medium', 2: 'Strong'}

# === Password Evaluation Runner ===
def run_all_rules(password, features, common_words, common_passwords, keyboard_patterns, personal_data=None, username=""):
    violated_rules = []
    feat = [item for sublist in features.tolist() for item in sublist] if hasattr(features, 'tolist') else features

    if feat[0] < 8:
        violated_rules.append("Length must be >= 8")
    if any(feat[i] < 1 for i in range(1, 5)):
        violated_rules.append("Must include all character types")
    if rules2(password): violated_rules.append("Sequential characters")
    if rules3(password): violated_rules.append("Repetitive characters")
    if rules4(password, common_words): violated_rules.append("Common dictionary word")
    if rules5(password, common_words): violated_rules.append("Obfuscated common word")
    if rules6(password, common_passwords): violated_rules.append("In common password list")
    if rules7(password, personal_data): violated_rules.append("Personal info used")
    if rules8(password, username): violated_rules.append("Username used")
    if rules9(password): violated_rules.append("Keyboard pattern")
    if rules10(password, keyboard_patterns): violated_rules.append("Keyboard sequence")
    if rules11(password, common_words): violated_rules.append("Reversed dictionary word")
    if rules12(password): violated_rules.append("Contains year")
    if rules13(password): violated_rules.append("Repeated halves")
    if rules14(password): violated_rules.append("Low entropy")

    return violated_rules

# === Evaluation Function ===
def evaluate_password(password, model, tokenizer, flat_token_to_index, extract_func, process_func, keyboard_patterns, common_words, common_passwords, personal_data={"john", "doe"}, username=""):
    print(f"\n🔐 Evaluating Password: {password}\n")

    feat = extract_func(password)
    feat = [mapping[item] if isinstance(item, str) and item in mapping else item for item in feat]
    features = np.array([feat[:10]], dtype=np.float32)  # ✅ FIXED: model expects 10 features
  # No slicing, use full 15 features


    seq_tokens = tokenizer.encode(password).tokens
    VOCAB_SIZE = 64  # 👈 This should match what your model was trained with
    indices = [min(flat_token_to_index.get(tok, 0), VOCAB_SIZE - 1) for tok in seq_tokens]

    padded_seq = process_func([indices], 61)
    padded_seq = np.array(padded_seq)

    preds = model.predict({'input_seq': padded_seq, 'input_eng': features}, verbose=0)
    pred_class = np.argmax(preds, axis=1)[0]
    label = strength_labels.get(pred_class, 'Unknown')

    violated = run_all_rules(password, features, common_words, common_passwords, keyboard_patterns, personal_data, username)
    rule_score = max(0, 100 - len(violated) * 5)

    result = {
        "Password": password,
        "ML Prediction": label,
        "Rule Violations": violated,
        "Rule-Based Score": rule_score,
        "Final Verdict": label if rule_score >= 70 else 'Weak'
    }
    return result

# === Example Usage ===
# Ensure these variables are loaded:
# - model (loaded ML model)
# - my_tokenizer (ByteLevelBPETokenizer)
# - flat_token_to_index (dictionary mapping token to index)
# - feature_extract (your extract function)
# - process_sequences (your padding function)
# - keyboard_pattern_list (list from data.json)
# - common_words (from top common vocab)
# - common_password_list (from rockyou)

result = evaluate_password(
    "5uMM3rDTR12#2024*Q", model, my_tokenizer, flat_token_to_index,
    feature_extract, process_sequences, keyboard_pattern_list,
    common_words, common_password_list, username="john"
)

import json
print(json.dumps(result, indent=2))



🔐 Evaluating Password: 5uMM3rDTR12#2024*Q

{
  "Password": "5uMM3rDTR12#2024*Q",
  "ML Prediction": "Weak",
  "Rule Violations": [
    "Common dictionary word",
    "Obfuscated common word",
    "Reversed dictionary word",
    "Contains year"
  ],
  "Rule-Based Score": 80,
  "Final Verdict": "Weak"
}


In [87]:
import numpy as np
import json
import math
from tensorflow.keras.models import load_model

# === Utility Mappings ===
mapping = {'very_high': 4, 'high': 3, 'medium': 2, 'low': 1, 'very_low': 0, 'none': 1}
strength_labels = {0: 'Weak', 1: 'Medium', 2: 'Strong'}

# === Placeholder Rule Functions ===
def rules2(p): return False
def rules3(p): return False
def rules4(p, common): return any(word in p.lower() for word in common)
def rules5(p, common): return False
def rules6(p, common): return p.lower() in common
def rules7(p, personal): return any(info.lower() in p.lower() for info in personal)
def rules8(p, username): return username.lower() in p.lower()
def rules9(p): return False
def rules10(p, patterns): return any(pattern in p for pattern in patterns)
def rules11(p, common): return any(word[::-1] in p.lower() for word in common)
def rules12(p): return any(str(y) in p for y in range(1900, 2101))
def rules13(p): return len(p) % 2 == 0 and p[:len(p)//2] == p[len(p)//2:]
def rules14(p): return len(set(p)) < 4  # very low entropy if <4 unique chars

# === Rule Evaluation ===
def run_all_rules(password, features, common_words, common_passwords, keyboard_patterns, personal_data=None, username=""):
    violated_rules = []
    feat = features.tolist()[0] if hasattr(features, 'tolist') else features

    if feat[0] < 8:
        violated_rules.append("Length must be >= 8")
    if any(feat[i] < 1 for i in range(1, 5)):
        violated_rules.append("Must include all character types")
    if rules2(password): violated_rules.append("Sequential characters")
    if rules3(password): violated_rules.append("Repetitive characters")
    if rules4(password, common_words): violated_rules.append("Common dictionary word")
    if rules5(password, common_words): violated_rules.append("Obfuscated common word")
    if rules6(password, common_passwords): violated_rules.append("In common password list")
    if rules7(password, personal_data): violated_rules.append("Personal info used")
    if rules8(password, username): violated_rules.append("Username used")
    if rules9(password): violated_rules.append("Keyboard pattern")
    if rules10(password, keyboard_patterns): violated_rules.append("Keyboard sequence")
    if rules11(password, common_words): violated_rules.append("Reversed dictionary word")
    if rules12(password): violated_rules.append("Contains year")
    if rules13(password): violated_rules.append("Repeated halves")
    if rules14(password): violated_rules.append("Low entropy")

    return violated_rules

# === Evaluation Function ===
def evaluate_password(password, model, tokenizer, flat_token_to_index, extract_func, process_func,
                      keyboard_patterns, common_words, common_passwords, personal_data={"john", "doe"}, username=""):

    print(f"\n🔐 Evaluating Password: {password}\n")

    # --- Feature Extraction ---
    feat = extract_func(password)
    feat = [mapping[item] if isinstance(item, str) and item in mapping else item for item in feat]
    features = np.array([feat[:10]], dtype=np.float32)

    # --- Tokenization ---
    seq_tokens = tokenizer.encode(password).tokens
    VOCAB_SIZE = getattr(tokenizer, 'vocab_size', 64)
    indices = [min(flat_token_to_index.get(tok, 0), VOCAB_SIZE - 1) for tok in seq_tokens]

    padded_seq = process_func([indices], 61)  # returns shape (1, 61)
    padded_seq = np.array(padded_seq, dtype=np.int32)

    # --- Prediction ---
    preds = model.predict({'input_seq': padded_seq, 'input_eng': features}, verbose=0)
    pred_class = np.argmax(preds, axis=1)[0]
    label = strength_labels.get(pred_class, 'Unknown')

    # --- Rule Violations ---
    violated = run_all_rules(password, features, common_words, common_passwords, keyboard_patterns, personal_data, username)
    rule_score = max(0, 100 - len(violated) * 5)

    if rule_score >= 85:
        final_verdict = "Strong"
    elif rule_score >= 76:
        final_verdict = "Medium"
    else:
        final_verdict = "Weak"

    result = {
        "Password": password,
        "ML_Prediction": label,
        "Rule_Violations": violated,
        "Rule_Based_Score": rule_score,
        "Final_Verdict": final_verdict

    }
    return result

# === Example Usage ===
# Ensure the following objects are already loaded in your environment:
# - model
# - my_tokenizer
# - flat_token_to_index
# - feature_extract
# - process_sequences
# - keyboard_pattern_list
# - common_words
# - common_password_list

result = evaluate_password(
    "pass", model, my_tokenizer, flat_token_to_index,
    feature_extract, process_sequences, keyboard_pattern_list,
    common_words, common_password_list, username="john"
)

print(json.dumps(result, indent=2))



🔐 Evaluating Password: pass

{
  "Password": "pass",
  "ML_Prediction": "Weak",
  "Rule_Violations": [
    "Length must be >= 8",
    "Must include all character types",
    "Common dictionary word",
    "Reversed dictionary word",
    "Low entropy"
  ],
  "Rule_Based_Score": 75,
  "Final_Verdict": "Weak"
}
