In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Collapse infrequent categories of text columns into a single 'Other' bucket
def reduce_cardinality(X, threshold=0.01):
    """Return a copy of ``X`` with rare text categories replaced by ``'Other'``.

    For every object- or string-typed column, the relative frequency of each
    distinct value is computed (missing values are not counted). Values whose
    frequency is not strictly greater than ``threshold`` are replaced by the
    literal ``'Other'``. Missing values never appear among the frequent
    labels, so they are replaced by ``'Other'`` as well.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is left unmodified; the work happens on a copy.
    threshold : float, default 0.01
        Minimum relative frequency (exclusive) a value needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``X``.
    """
    # Work on a copy so the caller's DataFrame is never changed as a side
    # effect of calling this function.
    reduced = X.copy()
    for col in reduced.columns:
        column = reduced[col]
        # Text may be stored as plain object dtype or as pandas' dedicated
        # string dtype; the latter does not compare equal to 'object'.
        is_text = (
            pd.api.types.is_object_dtype(column.dtype)
            or pd.api.types.is_string_dtype(column.dtype)
        )
        if not is_text:
            continue
        freq = column.value_counts(normalize=True)
        frequent_labels = freq[freq > threshold].index
        # Vectorized membership test instead of a per-row Python lambda.
        reduced[col] = column.where(column.isin(frequent_labels), 'Other')
    return reduced

# Read the raw dataset from 'Darknet.csv' (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='Darknet.csv')

# Derive one class name per row from the Label1 / Label2 pair
def combine_labels(row):
    """Map a row's two labels to a single class name.

    Rows whose ``Label1`` is ``'Non-Tor'`` or ``'NonVPN'`` become ``'Normal'``;
    every other row becomes ``'Darknet_'`` followed by its ``Label2`` value.

    ``row`` only needs to support ``row['Label1']`` / ``row['Label2']``
    lookups (e.g. a pandas Series produced by ``DataFrame.apply(axis=1)``).
    """
    benign_markers = ('Non-Tor', 'NonVPN')
    return 'Normal' if row['Label1'] in benign_markers else f'Darknet_{row["Label2"]}'

# Build the combined target column row by row, then discard the two source
# label columns it was derived from
data['label'] = data.apply(combine_labels, axis=1)
data.drop(['Label1', 'Label2'], axis=1, inplace=True)

# Separate the target (y) from the remaining feature columns (X)
y = data['label']
X = data.drop('label', axis=1)

# Parse the day-first, 12-hour-clock timestamp strings; entries that do not
# match the format are coerced to NaT instead of raising
X['Timestamp'] = pd.to_datetime(
    X['Timestamp'],
    format='%d/%m/%Y %I:%M:%S %p',
    errors='coerce',
)

# Expand the parsed timestamp into one column per date/time component
# (new column 'Year' <- .dt.year, 'Month' <- .dt.month, and so on)
for component in ('Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'):
    X[component] = getattr(X['Timestamp'].dt, component.lower())

# The original datetime column is no longer needed once it is decomposed
X.drop('Timestamp', axis=1, inplace=True)

# Collapse rare categories of the text columns into 'Other'
# NOTE(review): category frequencies are computed on the full dataset, i.e.
# before the train/test split further down — confirm this is acceptable.
X = reduce_cardinality(X)

# Partition the column names by dtype: numeric columns vs. everything else
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# Swap +/- infinity for NaN. Only infinities are replaced here; large but
# finite values are left untouched.
X.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Numeric columns: fill missing values with the column median, then
# standardize with StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
# sparse_threshold=0 makes the transformer always return a dense array. With
# the default (0.3) a SciPy sparse matrix is returned whenever the one-hot
# block makes the stacked output sparse enough, and the later
# pd.DataFrame(..., columns=feature_names) conversion needs a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the preprocessor on the training rows only, then apply the already
# fitted preprocessor to the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Sanity check: report the dimensions of both transformed matrices
print(f'Shape of X_train_transformed: {X_train_transformed.shape}')
print(f'Shape of X_test_transformed: {X_test_transformed.shape}')

# Sanity check: output column names as reported by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()
print(f'Number of feature names: {len(feature_names)}')

# Wrap both transformed matrices in DataFrames labelled with those names
X_train_transformed_df, X_test_transformed_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train_transformed, X_test_transformed)
)

# Re-attach the target as the last column. The target's index is reset so it
# lines up positionally with the freshly built feature frames.
train_df, test_df = (
    pd.concat([features, target.reset_index(drop=True)], axis=1)
    for features, target in (
        (X_train_transformed_df, y_train),
        (X_test_transformed_df, y_test),
    )
)

# Persist both partitions as CSV files without the index column
train_df.to_csv(path_or_buf='train_preprocessed.csv', index=False)
test_df.to_csv(path_or_buf='test_preprocessed.csv', index=False)

print("Preprocessed data saved to 'train_preprocessed.csv' and 'test_preprocessed.csv'.")
