In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "ticks" style globally so every plot below shares it.
sns.set_theme(style="ticks")

## Features

In [None]:
# Load the raw (score, password) pairs.
# The password column is read strictly as text:
#   * without an explicit dtype, numeric-looking passwords can be parsed as
#     numbers, which silently drops leading zeros ("0123" -> 123);
#   * with pandas' default NA strings, passwords such as "null", "NA" or "nan"
#     are parsed as missing values and later stringified to "nan".
# NA detection is therefore limited to blank fields in the score column.
df = pd.read_csv(
    '../input/passwordevaluation/dataset.csv',
    names=['score', 'password'],
    dtype={'password': str},
    keep_default_na=False,
    na_values={'score': ['']},
)
df.head()

In [None]:
# Summarise column dtypes, non-null counts and memory use of the freshly loaded frame.
df.info()

In [None]:
import string, re

# Helper functions
def count(password, chartype):
    """Count how many items of ``password`` are members of ``chartype``.

    Args:
        password: Iterable to scan (a password string in this notebook).
        chartype: Container tested with ``in`` for each item, e.g. a string
            holding the characters of one character class.

    Returns:
        The number of matching items as an ``int`` (0 for an empty input).
    """
    return sum(1 for symbol in password if symbol in chartype)

def check_ascii(text):
    """Tell whether ``text`` is built only from ASCII letters, digits and punctuation.

    Any other character fails the check. That includes spaces and control
    characters, which are ASCII but belong to none of the accepted classes.
    An empty ``text`` passes.

    Returns:
        ``True`` if every character is accepted, otherwise ``False``.
    """
    accepted = string.ascii_lowercase + string.ascii_uppercase + string.digits + string.punctuation
    return all(symbol in accepted for symbol in text)
    

In [None]:
def alpha_count(password):
    """Return the number of ASCII letters (a-z, A-Z) in ``password``."""
    return sum(1 for _ in re.finditer("[a-zA-Z]", password))


def digit_count(password):
    """Return the number of ASCII digits (0-9) in ``password``."""
    return sum(1 for _ in re.finditer("[0-9]", password))


def special_count(password):
    """Return the number of ``string.punctuation`` characters in ``password``."""
    return count(password, string.punctuation)

def letters_only(password):
    """Return 1 when every character of ``password`` is an ASCII letter, else 0.

    An empty password also yields 1, because its length and its letter count
    are both zero.
    """
    ascii_letters = string.ascii_lowercase + string.ascii_uppercase
    return int(count(password, ascii_letters) == len(password))

def numbers_only(password):
    """Return 1 when every character of ``password`` is an ASCII digit, else 0.

    An empty password also yields 1, because its length and its digit count
    are both zero.
    """
    return int(count(password, string.digits) == len(password))

In [None]:
import math

def entropy(password):
    """Estimate the brute-force entropy of ``password`` in bits.

    The estimate is ``len(password) * log2(R)``, where ``R`` is the combined
    size of every character class that occurs at least once in the password:
    ASCII uppercase (26), ASCII lowercase (26), digits (10) and
    ``string.punctuation`` (32).

    Args:
        password: The password string to score.

    Returns:
        The entropy rounded to two decimals. ``0.0`` is returned for an empty
        password and for a password with no character from any of the four
        classes (e.g. only spaces), for which the pool size would be zero.
    """
    character_classes = (
        string.ascii_uppercase,
        string.ascii_lowercase,
        string.digits,
        string.punctuation,
    )

    # Each class adds its full alphabet to the pool once, however many of its
    # characters the password actually uses.
    R = sum(
        len(alphabet)
        for alphabet in character_classes
        if any(char in alphabet for char in password)
    )

    # log2(0) is undefined; with no recognised characters there is no pool.
    if R == 0:
        return 0.0

    # len * log2(R) equals log2(R ** len) without building a huge integer.
    return round(len(password) * math.log2(R), 2)

In [None]:
# Unit tests
# Each feature function is run on a known sample. The value is printed, as
# before, and also asserted, so a regression stops the notebook instead of
# passing silently.
password = "P@ssw0rd" # This is not my password ;-)

checks = [
    ("alpha_count", alpha_count, 6),      # P, s, s, w, r, d
    ("digit_count", digit_count, 1),      # 0
    ("special_count", special_count, 1),  # @
    ("letters_only", letters_only, 0),
    ("numbers_only", numbers_only, 0),
    ("entropy", entropy, 52.44),          # 8 * log2(26 + 26 + 10 + 32)
]

for name, feature, expected in checks:
    actual = feature(password)
    print(actual)
    assert actual == expected, f"{name}({password!r}) returned {actual!r}, expected {expected!r}"

In [None]:
# Flag passwords made up solely of ASCII letters, digits and punctuation.
# check_ascii also rejects spaces and control characters, not only non-ASCII text.
# Every password is first coerced to str so the check always iterates over characters.
df['password'] = df['password'].map(str)
df['check'] = df['password'].map(check_ascii)

In [None]:
# Keep only the rows whose password passed the character check.
passed_check = df['check'].eq(True)
df = df.loc[passed_check]

In [None]:
# NOTE(review): the trailing number is presumably the row count left after the
# character filter in the author's run; confirm against the output below.
df.info() #14259510

In [None]:
# The helper flag has served its purpose; remove it before building features.
df.drop(columns=['check'], inplace=True)

In [None]:
# Derive one feature column per extractor, added in the order listed here.
feature_extractors = {
    'alpha_count': alpha_count,
    'digit_count': digit_count,
    'special_count': special_count,
    'letters_only': letters_only,
    'numbers_only': numbers_only,
    'entropy': entropy,
}

for column_name, extractor in feature_extractors.items():
    df[column_name] = df['password'].apply(extractor)

In [None]:
# Preview the first rows, which now include the derived feature columns.
df.head()

In [None]:
# Persist the feature table without the index column; the EDA section reloads this file.
df.to_csv('passwd-features.csv', index=False)

## Exploratory Data Analysis (EDA)

In [None]:
# Reload the saved feature table.
# The password column is read strictly as text so the CSV round trip does not
# turn all-digit passwords into numbers (dropping leading zeros) or turn
# passwords such as "null" or "NA" into missing values.
# Only blank score fields are treated as NA.
passwd = pd.read_csv(
    './passwd-features.csv',
    dtype={'password': str},
    keep_default_na=False,
    na_values={'score': ['']},
)
passwd.head()

In [None]:
# Check dtypes and non-null counts of the reloaded feature table.
passwd.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
passwd.describe()

In [None]:
# Number of passwords per score value.
sns.countplot(data=passwd, x='score')

In [None]:
# Spread of letter counts within each score value.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=passwd, x='score', y='alpha_count', ax=ax)

In [None]:
# Spread of digit counts within each score value.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=passwd, x='score', y='digit_count', ax=ax)

In [None]:
# Spread of punctuation-character counts within each score value.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=passwd, x='score', y='special_count', ax=ax)

In [None]:
# Letters-only flag (0/1) counts, split by score value.
sns.countplot(data=passwd, x='letters_only', hue='score')

In [None]:
# Numbers-only flag (0/1) counts, split by score value.
sns.countplot(data=passwd, x='numbers_only', hue='score')