In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the dataset used by this cell into a DataFrame
DATA_PATH = "/content/Final_ML.csv"  # Change to correct file path if needed
df = pd.read_csv(DATA_PATH)

# Define classification function
def classify_data(value):
    """Classify a single cell value by the kind of characters it contains.

    Args:
        value: Any scalar. Real numbers (Python or NumPy scalars) and strings
            are classified; everything else is reported as "Unknown".

    Returns:
        str: One of
            "Numeric"      - a non-missing real number,
            "Alphabetic"   - a string made only of letters,
            "Alphanumeric" - a string made only of letters and/or digits
                             (all-digit strings such as "123" land here),
            "Mixed"        - a string with at least one digit plus other
                             characters (e.g. "10.0.0.1"),
            "Other"        - any other string, including the empty string,
            "Unknown"      - missing numbers (NaN), None and unsupported types.
    """
    import numbers  # local import keeps this definition self-contained

    # numbers.Real also matches NumPy scalars (np.int64, np.float32, ...),
    # which a plain isinstance(value, (int, float)) check does not recognise.
    if isinstance(value, numbers.Real):
        return "Unknown" if pd.isna(value) else "Numeric"

    if not isinstance(value, str):
        return "Unknown"

    if value.isalpha():
        return "Alphabetic"
    if value.isalnum():
        return "Alphanumeric"
    if any(ch.isdigit() for ch in value):
        return "Mixed"
    return "Other"

# Apply classification to Source and Destination.
# The raw cell values are passed through without a str cast so classify_data
# can reach its "Numeric" and "Unknown" branches; casting first turned every
# value into a str, so e.g. a missing value became "nan" -> "Alphabetic".
df['Source_Type'] = df['Source'].apply(classify_data)
df['Destination_Type'] = df['Destination'].apply(classify_data)

# Encode categorical features
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column: refitting a single shared encoder discarded
# the mappings for the earlier columns, so their integer codes could no longer
# be translated back with inverse_transform().
encoders = {}
for col in ['Source_Type', 'Destination_Type', 'Label']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `le` stays bound to the encoder fitted on 'Label', as it was after the
# original loop finished.
le = encoders['Label']

# Feature Engineering
# Each entry maps a column-name suffix to a function computed on the string
# form of a value; it is applied to both Source and Destination, producing
# '<column>_<suffix>' columns (e.g. 'Source_Length', 'Destination_Length').
string_features = {
    'Length': len,
    'Digit_Count': lambda text: sum(ch.isdigit() for ch in text),
    'Alpha_Count': lambda text: sum(ch.isalpha() for ch in text),
    # 1 when at least one character is neither a letter nor a digit, else 0
    'Has_Special': lambda text: int(any(not ch.isalnum() for ch in text)),
}
for suffix, compute in string_features.items():
    for column in ('Source', 'Destination'):
        df[f'{column}_{suffix}'] = df[column].astype(str).apply(compute)

# Features and target: the ten engineered columns form X, the encoded
# 'Label' column is y.
feature_columns = [
    'Source_Type', 'Destination_Type',
    'Source_Length', 'Destination_Length',
    'Source_Digit_Count', 'Destination_Digit_Count',
    'Source_Alpha_Count', 'Destination_Alpha_Count',
    'Source_Has_Special', 'Destination_Has_Special',
]
X = df[feature_columns]
y = df['Label']

# Split data first, so the held-out rows never influence preprocessing.
# Same test_size / random_state as before, so the same rows end up in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data: the scaler learns mean/std from the training rows only and
# the test rows are transformed with those statistics. Fitting on the full
# dataset before splitting leaks test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-fitted scaler, kept under its previous
# name for any later code that still refers to X_scaled.
X_scaled = scaler.transform(X)

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling start from a fixed state
# on every run of this cell.
tf.keras.utils.set_random_seed(42)

# Define the Neural Network model: a fully connected classifier whose input
# width is the number of feature columns and whose softmax output has one unit
# per distinct label value.
num_classes = len(y.unique())
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax')  # Output layer
])

# Compile model. The sparse categorical loss is used with the integer class
# ids that LabelEncoder wrote into the 'Label' column.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model. The test split is also passed as validation data, so the
# per-epoch validation metrics and the final evaluation below are computed on
# the same rows.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Evaluate model on the test split and report accuracy as a percentage
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Neural Network Accuracy: {accuracy * 100:.2f}%")


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
file_path = "/content/dataset_alpha.csv"  # Update this with your file path
df = pd.read_csv(file_path)

# Combine text columns: Source and Destination are cast to str and joined
# with a single space, giving one text string per row.
text_columns = df[['Source', 'Destination']].astype(str)
text_features = text_columns.agg(' '.join, axis=1)

# Encode labels: cast to str first, then map each distinct label to an integer id
labels = df['Label'].astype(str)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize text features (all rows at once, padded to a common length and
# truncated to at most 128 tokens)
encoded_inputs = tokenizer(list(text_features), padding=True, truncation=True, return_tensors="pt", max_length=128)

# Extract BERT embeddings in mini-batches. Pushing every row through the model
# in a single forward pass makes memory use grow with the dataset size; slicing
# the already-tokenized tensors keeps the inputs identical while bounding the
# amount of work per forward pass.
BERT_BATCH_SIZE = 32
num_rows = encoded_inputs['input_ids'].shape[0]
cls_chunks = []
with torch.no_grad():
    for start in range(0, num_rows, BERT_BATCH_SIZE):
        batch = {
            name: tensor[start:start + BERT_BATCH_SIZE]
            for name, tensor in encoded_inputs.items()
        }
        outputs = bert_model(**batch)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token representation

# One row of [CLS] features per input text, in the original row order
embeddings = torch.cat(cls_chunks).numpy()

# Split data into train and test sets (20% of the rows are held out)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

# Train a Random Forest classifier on the training embeddings
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on test data
y_pred = classifier.predict(X_test)

# Compute accuracy of the predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Print accuracy
print(f"BERT + Random Forest Classification Accuracy: {accuracy:.4f}")


In [None]:
# Read /content/extracted_data.csv into a DataFrame and preview its first rows
import pandas as pd
df = pd.read_csv(filepath_or_buffer='/content/extracted_data.csv')
df.head(n=5)

In [None]:
# Replace every null value with 0 (df is modified in place)
df.fillna(value=0, inplace=True)

In [None]:
# Count the null values in each column
df.isna().sum()

In [None]:
# Save the file to the same absolute path that the training cell reads
# (pd.read_csv("/content/Final_ML.csv")), so the hand-off between the two
# cells does not depend on the current working directory.
df.to_csv('/content/Final_ML.csv', index=False)