In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import warnings
# Silences every warning category for the rest of the session, which also hides
# deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

## Data Visualization

In [None]:
# Load the password dataset. Rows that cannot be parsed (e.g. too many fields)
# are skipped instead of raising. `on_bad_lines='skip'` replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
data = pd.read_csv('/kaggle/input/password-strength-classifier/data.csv', on_bad_lines='skip')
data.head()

In [None]:
# Distinct values of the target column. Label encoding: 0 = weak, 1 = average, 2 = strong
data['strength'].unique()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Show the rows whose password field is missing.
data.loc[data['password'].isna()]

In [None]:
# Drop every row that contains at least one missing value. The result is
# rebound to `data` instead of mutating with `inplace=True`, which pandas
# discourages and which offers no performance benefit.
data = data.dropna()

In [None]:
# Check whether the classes are imbalanced by plotting the sample count per label.
# The column is passed as `x=` because seaborn >= 0.12 interprets the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=data['strength']);

In [None]:
# Materialise the cleaned frame as an independent NumPy array, one row per sample.
password_tuple = data.to_numpy(copy=True)
password_tuple

In [None]:
# Shuffle the rows in place. `random.shuffle` must not be used on a 2-D NumPy
# array: its tuple swap operates on row *views*, so some rows end up duplicated
# while others are lost. `np.random.shuffle` permutes along the first axis
# and keeps every row intact.
np.random.shuffle(password_tuple)

In [None]:
# Separate each row into its first field (model input) and second field (label).
x = list(password_tuple[:, 0])
y = list(password_tuple[:, 1])

In [None]:
# Number of samples collected in x
len(x)

## Data Cleaning

In [None]:
# convert into characters to pass it into tfidf
def word_divide_char(inputs):
    """Split a string into a list of its individual characters.

    Used as the tokenizer for the TF-IDF vectorizer so that every character
    of a password becomes its own token.

    Args:
        inputs: The string (or any iterable) to split.

    Returns:
        A new list holding each element of ``inputs`` in order; an empty
        list when ``inputs`` is empty.
    """
    return list(inputs)

In [None]:
# Sanity check: split the first sample into single characters
word_divide_char(x[0])

## TF-IDF

In [None]:
# Character-level TF-IDF: each character of a password is one token.
# lowercase=False matters here: by default the vectorizer lowercases its input
# before tokenizing, which would make e.g. 'Password' and 'password' identical
# and throw away the upper/lower-case mix of a password.
vectorizer = TfidfVectorizer(tokenizer=word_divide_char, lowercase=False)

In [None]:
# Learn the character vocabulary and IDF weights from all samples and build the TF-IDF matrix.
# NOTE(review): the vectorizer is fitted before the train/test split further down,
# so the IDF statistics also include the samples later used for testing.
X = vectorizer.fit_transform(x)

In [None]:
# (number of samples, number of distinct character tokens)
X.shape

In [None]:
# Vocabulary learned by the vectorizer; entry i is the token behind column i of X.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
vectorizer.get_feature_names_out()
# TF-IDF row of the first sample
first_doc_vec = X[0]
first_doc_vec

In [None]:
# Dense, transposed view of the first sample's TF-IDF row (one entry per vocabulary token)
first_doc_vec.T.todense()

In [None]:
# Tabulate the first sample's TF-IDF weight per token, highest weight first.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
df = pd.DataFrame(first_doc_vec.T.todense(), index=vectorizer.get_feature_names_out(), columns=['TF-IDF'])
df.sort_values(by=['TF-IDF'], ascending=False)

## Model Training

In [None]:
# Hold out 20% of the samples for evaluation.
# random_state makes the split (and therefore every metric below) reproducible;
# stratify=y keeps the label proportions identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
# Shape of the training partition
X_train.shape

In [None]:
# Logistic regression trained with the 'saga' solver.
# The `multi_class` argument is deprecated since scikit-learn 1.5. For a
# multiclass target with a non-liblinear solver the default already selects the
# multinomial formulation, so omitting the argument leaves the model unchanged.
clf = LogisticRegression(random_state=0, solver='saga')

Available `solver` options for `LogisticRegression`: 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'.

In [None]:
# Fit the classifier on the training partition
clf.fit(X_train, y_train)

## Evaluation

In [None]:
# Predict a label for every sample of the held-out test partition
y_pred = clf.predict(X_test)
y_pred

In [None]:
# Confusion matrix of true labels (first argument) against predicted labels (second argument)
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Accuracy of the predictions on the test partition
acc = accuracy_score(y_test, y_pred)
acc

In [None]:
# Text report of the per-class metrics for the test partition
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# predict on new data
# The unseen password is wrapped in a one-element array because the vectorizer
# transforms a collection of documents, not a single string.
dt = np.array(['@kagglE29'])
# Reuse the already fitted vocabulary/IDF weights (transform, not fit_transform)
pred = vectorizer.transform(dt)
print(clf.predict(pred))

In [None]:
def password_strength(pw):
    """Print the predicted strength label for a single password.

    The password is vectorized with the module-level ``vectorizer`` and
    classified with the module-level ``clf``; the resulting prediction array
    is printed and nothing is returned.

    Args:
        pw: The password to classify.
    """
    sample = np.array([pw])
    features = vectorizer.transform(sample)
    print(clf.predict(features))

In [None]:
# Example: classify a short, digits-only password
password_strength('8988')