In [None]:
import pandas as pd

# Load the phishing-URL dataset from the working directory and show a quick
# preview. Failures are reported rather than raised, so if the read fails this
# cell leaves `phish_data` unassigned.
try:
    phish_data = pd.read_csv('phishing_site_urls.csv')
    print("CSV loaded successfully!")
    print(phish_data.head())  # Show first 5 rows
    # DataFrame.info() writes its summary to stdout itself and returns None;
    # wrapping it in print() would emit an extra "None" line.
    phish_data.info()  # Check dataset structure
except Exception as e:
    print(f"Error loading CSV: {e}")


In [None]:
import pandas as pd

# Load the dataset from the notebook's working directory. A relative path is
# used, as in the other load cells of this notebook, so the cell does not
# depend on one machine's drive layout.
phish_data = pd.read_csv('phishing_site_urls.csv')

# Preview the first five rows to confirm the file parsed as expected.
print(phish_data.head())



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Inputs are the raw URL strings; the target is the Label column as it
# currently stands in phish_data.
X, y = phish_data['URL'], phish_data['Label']

# Hold out 20% of the rows for testing. The fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the size of each partition.
print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

# Token-count encoding capped at 5000 vocabulary entries. The vocabulary is
# learned from the training URLs only and then reused for the test URLs.
vectorizer = CountVectorizer(max_features=5000)
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Report the dimensions of both encoded matrices.
print(
    f"Shape of X_train_count: {X_train_count.shape}",
    f"Shape of X_test_count: {X_test_count.shape}",
    sep="\n",
)

# Logistic regression with the iteration cap raised to 1000.
lr = LogisticRegression(max_iter=1000)

# Announce the fit first, since training is the slow step of this cell.
print("Training the model...")
lr.fit(X_train_count, y_train)

# Mean accuracy on the held-out rows.
accuracy = lr.score(X_test_count, y_test)
print(f"Test Accuracy: {accuracy}")


In [None]:
from sklearn.metrics import classification_report

# Label the held-out URLs with the fitted logistic regression.
y_pred = lr.predict(X_test_count)

# Per-class precision / recall / F1 summary for those predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
# Dimensions of the current feature and target objects.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")



In [None]:
# List the column labels of the loaded frame; other cells in this notebook
# index it by 'URL' and 'Label'.
print(phish_data.columns)


In [None]:
# Preview the first five rows of the frame.
print(phish_data.head(n=5))


In [None]:
# Number of missing entries in the Label column.
# NOTE(review): no cell above this one fills or assigns Label, so on a
# top-to-bottom run this counts nulls in the column as loaded from the CSV.
print(phish_data['Label'].isna().sum())


In [None]:
# Distinct values currently stored in the Label column.
# NOTE(review): the cells that assign Label come after this one, so on a
# top-to-bottom run this shows the values as loaded from the CSV.
print(phish_data['Label'].unique())


In [None]:
# Overwrite the entire Label column with the constant 1 (every row marked as phishing).
# NOTE(review): this discards whatever Label values were loaded from the CSV, and
# the following cell assigns Label again — confirm this step is still wanted.
phish_data['Label'] = 1


In [None]:
# Derive a 0/1 label from the URL text: 1 when the URL contains the substring
# "phish" (case-insensitive), 0 otherwise; missing URLs count as 0.
# NOTE(review): this replaces any existing Label values with a substring
# heuristic computed from the same column later used as model input — confirm
# that this is the intended ground truth.
phish_data['Label'] = (
    phish_data['URL']
    .str.contains('phish', case=False, na=False)
    .astype(int)
)


In [None]:
# Preview the frame, then list the distinct values now held in Label, to
# confirm the column was populated.
print(
    phish_data.head(),
    phish_data['Label'].unique(),
    sep="\n",
)


In [None]:
# First rows whose Label equals 1 (with the substring labelling above, these
# are the URLs containing "phish").
print(phish_data.loc[phish_data['Label'].eq(1)].head())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset from the working directory.
phish_data = pd.read_csv('phishing_site_urls.csv')  # Correct file path

# Binary label: 1 when the URL contains "phish" (case-insensitive), else 0.
# NOTE(review): the label is a deterministic function of the same URL text
# that is vectorized below, so the reported accuracy mostly measures how well
# the model recovers this substring rule — confirm this is the intended target
# rather than the labels shipped in the CSV.
phish_data['Label'] = phish_data['URL'].str.contains('phish', case=False, na=False).astype(int)

# Labels aligned row-for-row with the URLs.
y = phish_data['Label']

# Split the raw URLs BEFORE vectorizing so the vocabulary is learned from the
# training rows only; fitting on every row would let test URLs influence
# which 5000 tokens are kept.
train_urls, test_urls, trainY, testY = train_test_split(
    phish_data['URL'], y, test_size=0.2, random_state=42
)

# Token-count encoding capped at 5000 vocabulary entries.
vectorizer = CountVectorizer(max_features=5000)
trainX = vectorizer.fit_transform(train_urls)
testX = vectorizer.transform(test_urls)

# Full encoded matrix, kept under the name X because a later cell resamples
# X and y together.
X = vectorizer.transform(phish_data['URL'])

# Logistic regression; max_iter=1000 matches the other logistic-regression
# cells in this notebook.
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(trainX, trainY)

# Predict the labels for the test set
predictions = model.predict(testX)

# Fraction of test rows predicted correctly, shown as a percentage.
accuracy = accuracy_score(testY, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Count rows per Label value to see how evenly the classes are represented.
print(phish_data['Label'].value_counts())


In [None]:
pip install imbalanced-learn


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE. The seed is fixed (as in the later
# SMOTE cell) so the synthetic samples are identical on every run.
# NOTE(review): this resamples the full dataset before any train/test split,
# and the next modelling cell reassigns X_resampled / y_resampled.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class counts after resampling.
print(f"Resampled class distribution: {pd.Series(y_resampled).value_counts()}")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Raw URL text and labels for every row.
all_urls = phish_data['URL']
all_labels = phish_data['Label']

# Split the raw URLs first so the vectorizer's vocabulary is learned from the
# training rows only. The split uses the same seed and test fraction as
# before, so the same rows land in each partition.
url_train, url_test, y_train, y_test = train_test_split(
    all_urls, all_labels, test_size=0.2, random_state=42
)

# Re-fit the shared CountVectorizer on the training URLs and reuse it for the
# test URLs.
X_train = vectorizer.fit_transform(url_train)
X_test = vectorizer.transform(url_test)

# Kept for any code that still reads these names. Despite the names, these
# hold the plain vectorized data for all rows, not SMOTE output.
X_resampled = vectorizer.transform(all_urls)
y_resampled = all_labels

# Initialize and train a Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Show classification report
print(classification_report(y_test, y_pred))


In [None]:
# Dimensions of the training features, then of the training labels.
print(X_train.shape, y_train.shape, sep="\n")


In [None]:
# Dimensions of the held-out features and labels.
print(
    f'X_test shape: {X_test.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with balanced class weights, trained on the
# un-resampled training split. verbose=2 turns on scikit-learn's progress
# output while fitting and predicting.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    verbose=2,
)
rf.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = rf.predict(X_test)

# Overall accuracy as a percentage, followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    classification_report(y_test, y_pred),
    sep='\n',
)



In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Oversample only the training split with a seeded SMOTE, leaving the test
# split untouched.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_res, y_res = smote.fit_resample(X_train, y_train)

# Depth-limited random forest (100 trees, max_depth=10) with balanced class
# weights, using all CPU cores, trained on the resampled training data.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_res, y_res)

# Hard class predictions for the original (un-resampled) test rows.
y_pred = rf.predict(X_test)

# Overall accuracy as a percentage, followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    classification_report(y_test, y_pred),
    sep='\n',
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random forest of 200 depth-limited trees in which label 1 is weighted 50x
# relative to label 0, trained on the SMOTE-resampled training data.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight={0: 1, 1: 50},
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_res, y_res)

# Second probability column from the forest (presumably the probability of
# label 1 — verify against rf.classes_).
y_probs = rf.predict_proba(X_test)[:, 1]

# Call a row positive when that probability exceeds 0.3 rather than the usual
# 0.5, trading precision for recall on the minority class.
y_pred = (y_probs > 0.3).astype(int)

# Overall accuracy as a percentage, followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    classification_report(y_test, y_pred),
    sep='\n',
)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# Initialize RandomForest with balanced class weights and max_depth to avoid overfitting
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', max_depth=10, n_jobs=-1)

# Cross-validate on the ORIGINAL training split with SMOTE inside the pipeline.
# The sampler then runs only on each fold's training portion; scoring directly
# on the pre-resampled X_res / y_res would place synthetic samples (built from
# rows in other folds) into the validation folds and inflate the scores.
# cross_val_score clones the pipeline, so `rf` itself is still unfitted here.
cv_pipeline = make_pipeline(SMOTE(sampling_strategy='auto', random_state=42), rf)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')

# Fit the final model on the full resampled training data for the test-set evaluation.
rf.fit(X_res, y_res)

# Second probability column from the forest (presumably the probability of
# label 1 — verify against rf.classes_), thresholded at 0.2 instead of 0.5.
y_probs = rf.predict_proba(X_test)[:, 1]
y_pred = (y_probs > 0.2).astype(int)  # Adjust threshold as necessary

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Show classification report
print(classification_report(y_test, y_pred))

# Print cross-validation results
print(f'Cross-validation accuracy scores: {cv_scores}')
print(f'Mean CV accuracy: {cv_scores.mean() * 100:.2f}%')


In [None]:
# Candidate hyper-parameter values, keyed by RandomForestClassifier argument name.
param_grid = dict(
    n_estimators=[50, 100],      # small forests first
    max_depth=[10, None],        # depth capped at 10, or None for no cap
    class_weight=['balanced'],   # single setting: balanced class weights
)
