#  Password Strength Classification

This project focuses on building machine learning models to **predict the strength of passwords**—categorizing them as *weak*, *medium*, or *strong* based on their characteristics. This is particularly useful for applications involving user authentication and cybersecurity to prevent weak password usage.

---

##  What I Did

- **Data Cleaning & Preprocessing**:  
  Loaded and processed a dataset of user passwords, handling null values and removing invalid entries.

- **Feature Engineering**:  
  Extracted features like password length, presence of digits, special characters, uppercase/lowercase letters, etc.

- **Label Encoding**:  
  Converted password strength labels (*weak*, *medium*, *strong*) into numerical classes for modeling.

- **Vectorizing**:
  Converted the passwords into character-level TF-IDF vectors using `TfidfVectorizer`.

- **Model Training**:  
  Trained multiple classification models including:
  - Logistic Regression  
  - XGBClassifier
  - Multinomial Naive Bayes

- **Model Evaluation**:  
  Compared models using **accuracy** to identify the best-performing approach.

---

##  Results

Among the models tested, the **XGBClassifier** achieved the highest accuracy in classifying password strength (about 98% on the held-out test set), owing to its robustness in handling a wide variety of feature types.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the password dataset; rows that cannot be parsed are skipped instead of
# raising. The separator is left at its default of ",".
data = pd.read_csv("/content/data (1).csv", on_bad_lines="skip")

In [None]:
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [None]:
data['password'].iloc[234000]

'ad34s11e'

In [None]:
data[data['password'].isnull()]

Unnamed: 0,password,strength
367579,,0


In [None]:
# Remove every row that has a missing value in any column, modifying `data` in place.
data.dropna(axis=0, how="any", inplace=True)


In [None]:
data['password'].duplicated().sum()

0

In [None]:
# Take the DataFrame contents as a 2-D NumPy array, one row per record.
# NOTE(review): whether this array shares memory with `data` is not evident
# here — confirm before relying on in-place mutation of `tup`.
tup = data.values

In [None]:
tup

array([['kzde5577', 1],
       ['kino3434', 1],
       ['visi7k1yr', 1],
       ...,
       ['184520socram', 1],
       ['marken22a', 1],
       ['fxx4pw4g', 1]], dtype=object)

In [None]:
from random import shuffle  # kept in case other cells still reference it

# Shuffle the rows in place with NumPy instead of `random.shuffle`.
# The stdlib function swaps items with `x[i], x[j] = x[j], x[i]`; rows of a
# 2-D ndarray are views, so that swap overwrites one row with the other and
# leaves duplicated records (and silently loses others).
np.random.shuffle(tup)

In [None]:
tup

array([['kzde5577', 1],
       ['kzde5577', 1],
       ['kzde5577', 1],
       ...,
       ['plygi1', 0],
       ['d0rem1', 0],
       ['bali437705', 1]], dtype=object)

In [None]:
# First column of every row (the password) collected into a plain list.
x = list(tup[:, 0])

In [None]:
x

['kzde5577',
 'kzde5577',
 'kzde5577',
 'kino3434',
 'megzy123',
 'megzy123',
 'megzy123',
 'visi7k1yr',
 'u6c8vhow',
 'AVYq1lDE4MgAZfNt',
 'v1118714',
 'asv5o9yu',
 'as326159',
 'jytifok873',
 'lamborghin1',
 'g067057895',
 'AVYq1lDE4MgAZfNt',
 'megzy123',
 'u6c8vhow',
 'lamborghin1',
 'visi7k1yr',
 '52558000aaa',
 'kzde5577',
 'megzy123',
 'kino3434',
 'as326159',
 'g067057895',
 'czuodhj972',
 'elyass15@ajilent-ci',
 'lamborghin1',
 'gaymaids1',
 'lamborghin1',
 'asgaliu11',
 'trabajonet9',
 'megzy123',
 '0169395484a',
 'visi7k1yr',
 'kino3434',
 'lsdlsd1',
 'v1118714',
 'go7kew7a2po',
 'WUt9IZzE0OQ7PkNE',
 '6975038lp',
 'memjan123',
 'pHyqueDIyNQ8vmhb',
 '6975038lp',
 'g067057895',
 'elyass15@ajilent-ci',
 'kino3434',
 'yqugu927',
 '52558000aaa',
 'kswa2mrv',
 'lsdlsd1',
 'jytifok873',
 'alimagik1',
 'TyWM72UNEex8Q8Y',
 'g067057895',
 '6975038lp',
 'juliel009',
 'schalke04',
 'kswa2mrv',
 '612035180tok',
 '6975038lp',
 'asv5o9yu',
 'tamanagung6',
 'fk9qi21m',
 'elyass15@ajilent-ci'

In [None]:
# Second column of every row (the strength label) collected into a plain list.
y = list(tup[:, 1])

In [None]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 0,
 2,
 1,
 1,
 1,
 0,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 1,
 0,
 1,
 0,
 1,
 0,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,


In [None]:
data['strength'].value_counts()

Unnamed: 0_level_0,count
strength,Unnamed: 1_level_1
1,496801
0,89701
2,83137


In [None]:
def list_of_characters(ip):
  """Split a string into its individual characters.

  Passed as the ``tokenizer`` of the TF-IDF vectorizer so that every
  character of a password becomes its own token.

  Args:
    ip: The string (or any other iterable) to split.

  Returns:
    A new list containing each item of ``ip`` in its original order.
  """
  return list(ip)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF: every password is tokenized into single characters.
# - lowercase=False: the default (True) lowercases the text before tokenizing,
#   which erases the upper/lower-case distinction between passwords.
# - token_pattern=None: the pattern is unused once a custom tokenizer is given;
#   passing None avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(
    tokenizer=list_of_characters,
    lowercase=False,
    token_pattern=None,
)
vector_ip = vectorizer.fit_transform(x)



In [None]:
vectorizer.vocabulary_

{'k': 58,
 'z': 73,
 'd': 51,
 'e': 52,
 '5': 31,
 '7': 33,
 'i': 56,
 'n': 61,
 'o': 62,
 '3': 29,
 '4': 30,
 'm': 60,
 'g': 54,
 'y': 72,
 '1': 27,
 '2': 28,
 'v': 69,
 's': 66,
 'r': 65,
 'u': 68,
 '6': 32,
 'c': 50,
 '8': 34,
 'h': 55,
 'w': 70,
 'a': 48,
 'q': 64,
 'l': 59,
 'f': 53,
 't': 67,
 '9': 35,
 'j': 57,
 'b': 49,
 '0': 26,
 '@': 41,
 '-': 23,
 'p': 63,
 'x': 71,
 '.': 24,
 '&': 18,
 '?': 40,
 '>': 39,
 '<': 37,
 '!': 13,
 '_': 46,
 ';': 36,
 '±': 84,
 '%': 17,
 '(': 19,
 ')': 20,
 ' ': 12,
 '+': 22,
 '$': 16,
 '#': 15,
 '`': 47,
 '{': 74,
 '}': 76,
 'ô': 110,
 '\\': 43,
 '^': 45,
 '~': 77,
 '/': 25,
 '*': 21,
 '=': 38,
 '[': 42,
 '\x1c': 10,
 'ú': 114,
 '"': 14,
 ']': 44,
 '³': 86,
 '\x16': 7,
 'ó': 109,
 'ò': 108,
 '·': 89,
 '\x1e': 11,
 '\x19': 9,
 'ä': 99,
 'ß': 95,
 'ð': 106,
 'å': 100,
 'â': 98,
 '°': 83,
 'ÿ': 119,
 '|': 75,
 '¿': 93,
 '´': 87,
 'þ': 118,
 '«': 82,
 '²': 85,
 '\x08': 2,
 'ê': 103,
 'á': 97,
 '¡': 79,
 'õ': 111,
 '\x17': 8,
 '÷': 113,
 'º': 91,
 'û'

In [None]:
vector_ip.shape

(669639, 122)

In [None]:
# Materialize the sparse TF-IDF matrix as a dense NumPy array.
# NOTE(review): at shape (669639, 122) this is roughly 650 MB if the dtype is
# float64 — confirm there is enough memory headroom.
dense_vector = vector_ip.toarray()

In [None]:
dense_vector[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.56763043, 0.        , 0.59129491, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.28560555, 0.22112519, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.29134827, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33587994, 0.     

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dense_vector,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
x_train.shape

(535711, 122)

In [None]:
# Turn the training labels into a NumPy array and display its shape.
y_train = np.array(y_train, copy=True)
y_train.shape

(535711,)

In [None]:
# Turn the test labels into a NumPy array and display its shape.
y_test = np.array(y_test, copy=True)
y_test.shape

(133928,)

In [None]:
# Multiclass logistic regression fitted with the Newton-CG solver.
# `multi_class` is deliberately not passed: the argument is deprecated as of
# scikit-learn 1.5, and with this solver a multinomial (softmax) loss is
# already what gets used for problems with three or more classes.
model = LogisticRegression(random_state=0, solver="newton-cg")
model.fit(x_train, y_train)



In [None]:
model.score(x_test, y_test)

In [None]:
# Classify one example password with the logistic-regression model.
# Dedicated names are used so the label list `y` is not overwritten by the
# vectorized sample, and the raw input is not mislabeled as a prediction.
sample_password = "12345678"

sample_vector = vectorizer.transform([sample_password])

model.predict(sample_vector)

In [None]:
import xgboost as xgb

In [None]:
# Train an XGBoost classifier with its default settings, then report the
# accuracy on the held-out test split.
model_xgb = xgb.XGBClassifier()

model_xgb.fit(X=x_train, y=y_train)

model_xgb.score(X=x_test, y=y_test)

0.9803401827847799

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the XGBoost model on the data it was trained on.
yPredtr = model_xgb.predict(x_train)
print(accuracy_score(y_true=y_train, y_pred=yPredtr))


0.9846820393831749


In [None]:
# Accuracy of the XGBoost model on the held-out test split.
ypredte = model_xgb.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=ypredte))

0.9803401827847799


In [None]:
# Classify one example password with the XGBoost model.
# The vector is not named `input`, which would shadow the built-in function.
sample_xgb = vectorizer.transform(['Jason@'])

model_xgb.predict(sample_xgb)

array([2])

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier with its default settings, then
# report the accuracy on the held-out test split.
model_mnb = MultinomialNB()

model_mnb.fit(X=x_train, y=y_train)

model_mnb.score(X=x_test, y=y_test)

0.7450869123708261

In [None]:
# Classify one example password with the multinomial Naive Bayes model.
# The vector is not named `input`, which would shadow the built-in function.
sample_mnb = vectorizer.transform(['12345678'])

model_mnb.predict(sample_mnb)

array([1])