# Preprocessing and EDA

In [88]:
# Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. Creating Features

In [89]:
data = pd.read_csv('PWdata.csv', on_bad_lines='skip')

# Inital cleaning
data = data.dropna().reset_index()  # removed 1 row
data.password = data.password.astype(str)

print('nrows: {}'.format(data.size))
data.head(4)

nrows: 2008917


Unnamed: 0,index,password,strength
0,0,kzde5577,1
1,1,kino3434,1
2,2,visi7k1yr,1
3,3,megzy123,1


In [90]:
# Create basic features from password string
data['length'] = data.password.str.len()  # length of password
data['ratio_lower'] = data.password.apply(lambda pw: len(re.findall(r'[a-z]', pw))) / data.length  # ratio of lowercase letters
data['ratio_upper'] = data.password.apply(lambda pw: len(re.findall(r'[A-Z]', pw))) / data.length  # ratio of uppercase letters
data['ratio_numbers'] = data.password.apply(lambda pw: len(re.findall(r'[0-9]', pw)))  / data.length  # ratio of numeric characters
data['ratio_special'] = data.password.str.len() - data.password.apply(lambda pw: len(re.findall(r'[a-zA-Z0-9]', pw)))  # ratio of special characters
data['ratio_unique'] = data.password.apply(lambda pw: len(set(pw))) / data.length  # ratio of unique characters
data.head(3)

Unnamed: 0,index,password,strength,length,ratio_lower,ratio_upper,ratio_numbers,ratio_special,ratio_unique
0,0,kzde5577,1,8,0.5,0.0,0.5,0,0.75
1,1,kino3434,1,8,0.5,0.0,0.5,0,0.75
2,2,visi7k1yr,1,9,0.777778,0.0,0.222222,0,0.888889


In [91]:
# Calculate tfidf score of letters
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 1), lowercase=False) 
vectorizer.fit(data.password)
tfidf_matrix = vectorizer.transform(data.password).toarray()

tfidf_df = pd.DataFrame(data=tfidf_matrix, columns=vectorizer.get_feature_names_out())
data_tfidf = pd.concat([data, tfidf_df], axis=1)

# Check sizes
print('tfidf_matrix.shape: {}'.format(tfidf_matrix.shape))
print('data.shape: {}'.format(data.shape))
print('tfidf_df.shape: {}'.format(tfidf_df.shape))
print('data_tfidf.shape: {}'.format(data_tfidf.shape))

print(vectorizer.get_feature_names_out())
print(len(vectorizer.get_feature_names_out()))

tfidf_matrix.shape: (669639, 200)
data.shape: (669639, 9)
tfidf_df.shape: (669639, 200)
data_tfidf.shape: (669639, 209)
['\x01' '\x02' '\x04' '\x05' '\x06' '\x08' '\x0e' '\x0f' '\x10' '\x11'
 '\x12' '\x13' '\x16' '\x17' '\x18' '\x19' '\x1b' '\x1c' '\x1d' '\x1e' ' '
 '!' '"' '#' '$' '%' '&' '(' ')' '*' '+' '-' '.' '/' '0' '1' '2' '3' '4'
 '5' '6' '7' '8' '9' ';' '<' '=' '>' '?' '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G'
 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'
 'Z' '[' '\\' ']' '^' '_' '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k'
 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}'
 '~' '\x7f' '\x81' '\x8d' '\xa0' '¡' '¢' '¤' '¦' '§' '¨' '«' '¯' '°' '±'
 '²' '³' '´' 'µ' '¶' '·' '¹' 'º' '»' '¼' '½' '¾' '¿' 'À' 'Á' 'Â' 'Ã' 'Ä'
 'Å' 'Æ' 'Ç' 'É' 'Ê' 'Í' 'Ï' 'Ð' 'Ñ' 'Ò' 'Ó' 'Ô' 'Õ' 'Ö' '×' 'Ù' 'Ú' 'Û'
 'Ü' 'Ý' 'Þ' 'ß' 'à' 'á' 'â' 'ä' 'å' 'æ' 'è' 'é' 'ê' 'í' 'î' 'ï' 'ð' 'ñ'
 'ò' 'ó' 'õ' 'ö' '÷' 'ù' 'ú' 'û' 'ý' 'þ' 'œ' 'Ÿ' 'ƒ' '—' '‚' '‡' '…' '‹'
 '›

In [92]:
data_tfidf.head()

Unnamed: 0,index,password,strength,length,ratio_lower,ratio_upper,ratio_numbers,ratio_special,ratio_unique,,...,œ,Ÿ,ƒ,—,‚,‡,…,‹,›,™
0,0,kzde5577,1,8,0.5,0.0,0.5,0,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,kino3434,1,8,0.5,0.0,0.5,0,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,visi7k1yr,1,9,0.777778,0.0,0.222222,0,0.888889,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,megzy123,1,8,0.625,0.0,0.375,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,lamborghin1,1,11,0.909091,0.0,0.090909,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Summary statistics and plots