In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



In [2]:
data = pd.read_csv("data.csv", error_bad_lines = False)
data.head()

b'Skipping line 2810: expected 2 fields, saw 5\nSkipping line 4641: expected 2 fields, saw 5\nSkipping line 7171: expected 2 fields, saw 5\nSkipping line 11220: expected 2 fields, saw 5\nSkipping line 13809: expected 2 fields, saw 5\nSkipping line 14132: expected 2 fields, saw 5\nSkipping line 14293: expected 2 fields, saw 5\nSkipping line 14865: expected 2 fields, saw 5\nSkipping line 17419: expected 2 fields, saw 5\nSkipping line 22801: expected 2 fields, saw 5\nSkipping line 25001: expected 2 fields, saw 5\nSkipping line 26603: expected 2 fields, saw 5\nSkipping line 26742: expected 2 fields, saw 5\nSkipping line 29702: expected 2 fields, saw 5\nSkipping line 32767: expected 2 fields, saw 5\nSkipping line 32878: expected 2 fields, saw 5\nSkipping line 35643: expected 2 fields, saw 5\nSkipping line 36550: expected 2 fields, saw 5\nSkipping line 38732: expected 2 fields, saw 5\nSkipping line 40567: expected 2 fields, saw 5\nSkipping line 40576: expected 2 fields, saw 5\nSkipping line 

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [3]:
data.shape

(669640, 2)

In [4]:
data.isna().sum()

password    1
strength    0
dtype: int64

In [5]:
data['strength'].value_counts()

1    496801
0     89702
2     83137
Name: strength, dtype: int64

In [6]:
data[data['password'].isnull()]

Unnamed: 0,password,strength
367579,,0


In [7]:
data.dropna(inplace = True)

In [8]:
data.isna().sum()

password    0
strength    0
dtype: int64

In [9]:
data.shape

(669639, 2)

- The dataset has two columns; password and strength. In the strength column

- 0 means: the password’s strength is weak;
- 1 means: the password’s strength is medium;
- 2 means: the password’s strength is strong;
- Before moving forward, I will convert 0, 1, and 2 values in the strength column to weak, medium, and strong

In [10]:
data["strength"] = data["strength"].map({0: "Weak", 
                                         1: "Medium",
                                         2: "Strong"})
data.sample(5)

Unnamed: 0,password,strength
50598,tiger77,Weak
395167,san9XxjIyMQTPHzV,Strong
487042,5i1d8xqmkf,Medium
599783,dusgmani2910,Medium
61692,M08888194,Medium


In [11]:
data[data['strength'] == "Strong"]

Unnamed: 0,password,strength
5,AVYq1lDE4MgAZfNt,Strong
13,WUt9IZzE0OQ7PkNE,Strong
20,elyass15@ajilent-ci,Strong
22,klara-tershina3H,Strong
41,pHyqueDIyNQ8vmhb,Strong
...,...,...
669618,juanpaganini588@gmail.com,Strong
669619,tYAam8zg3Mg2AZ7a,Strong
669622,weslley.06888524,Strong
669627,sakaryal&#305;,Strong


# Password Strength Prediction Model
- Now I'll move to train a machine learning model to predict the strength of the password. Before I start preparing the model, we need to tokenize the passwords as I need the model to learn from the combinations of digits, letters, and symbols to predict the password’s strength. So here’s how I can tokenize and split the data into training and test sets:

In [12]:
def word(password):
    character=[]
    for i in password:
        character.append(i)
    return character

In [13]:
x = np.array(data["password"])
y = np.array(data["strength"])

In [14]:
tdif = TfidfVectorizer(tokenizer=word)
x = tdif.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.05, random_state=42)



- Training a classification model

In [15]:
model = RandomForestClassifier()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

0.9565438145869423


## Now here’s how we can check the strength of a password using the trained model:

In [31]:
import getpass
user = getpass.getpass("Enter Password of 8-25 Characters:")
data = tdif.transform([user]).toarray()
output = model.predict(data)
print(output)

Enter Password of 8-25 Characters:········
['Strong']
