In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,1,,https://www.northcm.ac.th,24.0,www.northcm.ac.th,17.0,0.0,,0.800000,,...,0.0,0.0,1.0,,3.0,,69.0,,,1
1,4,8135291.txt,http://uqr.to/1il1z,,,,,to,1.000000,0.000896,...,,0.0,0.0,,,,,,1.0,0
2,5,586561.txt,https://www.woolworthsrewards.com.au,35.0,www.woolworthsrewards.com.au,28.0,0.0,au,0.857143,,...,1.0,0.0,1.0,33.0,7.0,8.0,15.0,,2.0,1
3,6,,,31.0,,,,com,0.562500,0.522907,...,1.0,0.0,1.0,24.0,5.0,14.0,,,,1
4,11,412632.txt,,,www.nyprowrestling.com,22.0,0.0,,1.000000,,...,0.0,0.0,1.0,,,14.0,,0.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140399,235790,49490.txt,,,www.slavevoyages.org,,0.0,,1.000000,0.079963,...,,0.0,1.0,23.0,3.0,6.0,,12.0,,1
140400,235791,,https://www.greenmountainenergy.com,34.0,www.greenmountainenergy.com,,0.0,com,1.000000,0.522907,...,1.0,,1.0,26.0,,,169.0,15.0,40.0,1
140401,235792,,https://www.leadcastingcall.com,30.0,www.leadcastingcall.com,23.0,,,1.000000,0.522907,...,0.0,0.0,1.0,25.0,,,87.0,1.0,93.0,1
140402,235794,,https://www.fedarb.com,21.0,,14.0,,,1.000000,0.522907,...,0.0,0.0,1.0,,36.0,,102.0,,,1


In [3]:
# Identifying float columns (0, 1, or NaN)
binary_float_features = df.select_dtypes(include=['float']).columns
binary_like_features = binary_float_features[df[binary_float_features].isin([0, 1, float('nan')]).all()]

# Categorical columns + binary-like float columns
categorical_features = df.select_dtypes(include=['object', 'category']).columns
all_categorical_features = list(binary_like_features) + list(categorical_features)

all_categorical_features = [feature for feature in all_categorical_features if feature not in ['NoOfURLRedirect', 'NoOfSelfRedirect','FILENAME','URL', 'Domain', 'Title']]
all_categorical_features.append('label')

In [4]:
exclude_features = ['FILENAME', 'URL', 'Domain',  'Title', 'id']
numerical_features = [feature for feature in df.columns.difference(all_categorical_features) if feature not in exclude_features]

In [5]:
discrete_features = ['FILENAME', 'URL', 'Domain', 'Title', 'id']

In [6]:
categorical_features_df = pd.DataFrame(all_categorical_features, columns=['Categorical Features'])
categorical_features_df.index = range(1, len(categorical_features_df) + 1)

numerical_features_df = pd.DataFrame(numerical_features, columns=['Numerical Features'])
numerical_features_df.index = range(1, len(numerical_features_df) + 1)

text_features_df = pd.DataFrame(discrete_features, columns=['Text Features'])
text_features_df.index = range(1, len(text_features_df) + 1)

In [7]:
categorical_features_df

Unnamed: 0,Categorical Features
1,IsDomainIP
2,HasObfuscation
3,IsHTTPS
4,HasTitle
5,HasFavicon
6,Robots
7,IsResponsive
8,HasDescription
9,HasExternalFormSubmit
10,HasSocialNet


In [8]:
from sklearn.model_selection import train_test_split

# Save original training set
train_set_ori = df.copy()

# Split training set and validation set
train_set, val_set  = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

In [9]:
class HandleLength(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting process needed
        return self
    
    def transform(self, X):
        # Copy the DataFrame to avoid modifying the original
        X_transformed = X.copy()
        
        # Drop rows where URL is completely NaN
        X_transformed = X_transformed.dropna(subset=['URL'])
        
        # Impute Domain using URL if Domain is NaN
        def extract_domain(url):
            try:
                parsed_url = urlparse(url)
                return parsed_url.netloc if parsed_url.netloc else None
            except:
                return None

        X_transformed['Domain'] = X_transformed.apply(
            lambda row: extract_domain(row['URL']) if pd.isna(row['Domain']) else row['Domain'], axis=1
        )
        
        # Fill URLLength with the lengths of URL strings
        X_transformed['URLLength'] = X_transformed['URL'].apply(lambda x: len(str(x)) if pd.notna(x) else None)
        
        # Fill DomainLength with the lengths of Domain strings
        X_transformed['DomainLength'] = X_transformed['Domain'].apply(lambda x: len(str(x)) if pd.notna(x) else None)
        
        return X_transformed

In [10]:
class HandleIsHTTPS(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting process needed
        return self
    
    def transform(self, X):
        # Copy the DataFrame to avoid modifying the original
        X_transformed = X.copy()
        
        # Drop rows where URL is completely NaN
        X_transformed = X_transformed.dropna(subset=['URL'])
        
        # Check if the URL contains HTTPS
        X_transformed['IsHTTPS'] = X_transformed['URL'].apply(
            lambda x: 1 if pd.notna(x) and urlparse(x).scheme.lower() == 'https' else 0
        )
        
        return X_transformed

In [11]:
import re

class HandleCharContinuationRate(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting process needed
        return self
    
    def transform(self, X):
        # Copy the DataFrame to avoid modifying the original
        X_transformed = X.copy()
        
        # Function to calculate CharContinuationRate
        def calculate_char_cont_rate(url):
            if not isinstance(url, str):
                return None  # Return None for invalid URLs
            
            # Find all alphabet, digit, or special character sequences
            sequences = re.findall(r'[a-zA-Z]+|\d+|[^a-zA-Z\d]+', url)
            
            # Calculate total length of sequences
            total_sequence_length = sum(len(seq) for seq in sequences)
            
            # Return the CharContinuationRate
            return total_sequence_length / len(url) if len(url) > 0 else 0

        # Impute missing CharContinuationRate values
        X_transformed['CharContinuationRate'] = X_transformed['CharContinuationRate'].fillna(
            X_transformed['URL'].apply(lambda x: calculate_char_cont_rate(x))
        )
        
        return X_transformed

In [12]:
class HandleHasTitle(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting process needed
        return self
    
    def transform(self, X):
        # Copy the DataFrame to avoid modifying the original
        X_transformed = X.copy()
        
        # Create HasTitle column: 1 if Title is not NaN, 0 otherwise
        X_transformed['HasTitle'] = X_transformed['Title'].apply(lambda x: 0 if pd.isna(x) else 1)
        
        return X_transformed

In [13]:
import re

class HandleURLTitleMatchScore(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # No fitting process needed
        return self
    
    def transform(self, X):
        
        X_transformed = X.copy()
    
        def url_title_match_score(title, url):
            if not isinstance(title, str) or not isinstance(url, str):
                return None  # Return None for invalid inputs
            
            # Split the title into a set of words
            t_set = title.split()
            
            # Clean the URL (remove https, http, www, and extract root domain)
            clean_url = re.sub(r'https?://|www\.', '', url).split('/')[0]  # Keep root domain
            
            # Compute baseScore
            base_score = 100 / len(clean_url) if len(clean_url) > 0 else 0
            
            # Calculate score
            score = 0
            for word in t_set:
                if word in clean_url:
                    score += base_score * len(word)
                    clean_url = clean_url.replace(word, "")  # Remove matched word from URL
                if score > 99.9:
                    score = 100
                    break
            
            return score
        
        X_transformed['URLTitleMatchScore'] = X_transformed['URLTitleMatchScore'].fillna(
            X_transformed.apply(
                lambda row: url_title_match_score(row['Title'], row['URL']), axis=1
            )
        )
        
        return X_transformed

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.tree import DecisionTreeRegressor

class FeatureImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Separate columns by type
        self.categorical_cols = ['TLD', 'Domain', 'FILENAME', 'URL', 'Title']
        self.boolean_cols = [
            'IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon',
            'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription',
            'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields',
            'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo'
        ]
        self.numerical_cols = list(set(X.columns) - set(self.categorical_cols) - set(self.boolean_cols))
        
        # Imputer for categorical columns (most frequent)
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        self.cat_imputer.fit(X[self.categorical_cols])
        
        # Imputer for boolean columns (mode)
        self.bool_mode_imputer = {}
        for col in self.boolean_cols:
            self.bool_mode_imputer[col] = X[col].mode()[0]
        
        # KNN Imputer for numerical columns
        self.knn_imputer = IterativeImputer(estimator=DecisionTreeRegressor(random_state=42))
        self.knn_imputer.fit(X[self.numerical_cols])
        
        return self
    
    def transform(self, X):
        X_imputed = X.copy()

        # Impute categorical columns
        X_imputed[self.categorical_cols] = self.cat_imputer.transform(X_imputed[self.categorical_cols])

        # Impute boolean columns using pre-calculated mode
        for col in self.boolean_cols:
            X_imputed[col].fillna(self.bool_mode_imputer[col], inplace=True)
            
        # Impute numerical columns using KNNImputer
        X_imputed[self.numerical_cols] = self.knn_imputer.transform(X_imputed[self.numerical_cols])
        
        return X_imputed

In [15]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_drop):
        self.features_to_drop = features_to_drop

    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        # Drop the specified features
        X_dropped = X.drop(columns=self.features_to_drop, errors='ignore')
        return X_dropped

In [16]:
FEATURES_TO_DROP = [
    'Domain', 'TLD', 'FILENAME', 'URL', 'Title', 'TLD'
]

In [17]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([ ("length", HandleLength()),
                      ("https", HandleIsHTTPS()),
                      ("title", HandleHasTitle()),
                      #("char", HandleCharContinuationRate()),
                      ("urltitle", HandleURLTitleMatchScore()),
                      ("imputer", FeatureImputer()),
                      ("dropper", FeatureDropper(FEATURES_TO_DROP)),
                     ])

In [18]:
train_set = pipeline.fit_transform(train_set)
val_set = pipeline.transform(val_set)



In [19]:
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import numpy as np
import pandas as pd

def classify_categorical_features(data):
    """
    Classify categorical features, including binary-like float columns.
    """
    # Identify float columns (0, 1, or NaN)
    binary_float_features = data.select_dtypes(include=['float']).columns
    binary_like_features = [
        col for col in binary_float_features 
        if data[col].dropna().isin([0, 1]).all()
    ]

    # Identify categorical columns
    categorical_features = data.select_dtypes(include=['object', 'category']).columns

    # Combine binary-like float and categorical columns
    all_categorical_features = list(binary_like_features) + list(categorical_features)

    # Exclude specific columns
    excluded_columns = ['NoOfURLRedirect', 'NoOfSelfRedirect', 'FILENAME', 'URL', 'Domain', 'Title']
    all_categorical_features = [
        feature for feature in all_categorical_features if feature not in excluded_columns
    ]

    # Ensure the label column is included
    all_categorical_features.append('label')

    return all_categorical_features

def handle_outliers(data, method="iterative", estimator=None):
    """
    Handle outliers in numerical columns, excluding categorical columns.
    """
    if method == "iterative" and estimator is None:
        # Default estimator for Iterative Imputer
        estimator = BayesianRidge()

    # Identify categorical features
    categorical_features = classify_categorical_features(data)

    # Process only numerical columns that are not categorical
    numerical_columns = [
        col for col in data.select_dtypes(include=['int64', 'float64']).columns 
        if col not in categorical_features
    ]

    for col in numerical_columns:
        if data[col].nunique() > 2:
            # Detect outliers using IQR
            q25, q75 = np.percentile(data[col].dropna(), 25), np.percentile(data[col].dropna(), 75)
            iqr = q75 - q25
            cut_off = iqr * 1.5
            lower, upper = q25 - cut_off, q75 + cut_off

            # Mark outliers as NaN
            data.loc[(data[col] < lower) | (data[col] > upper), col] = np.nan

            # Handle missing values using the chosen method
            if method == "iterative":
                imputer = IterativeImputer(estimator=estimator, random_state=42)
                data[[col]] = imputer.fit_transform(data[[col]])
            elif method == "median":
                data[col].fillna(data[col].median(), inplace=True)
            else:
                data[col].dropna(inplace=True)

    return data


In [20]:
train_set = handle_outliers(train_set.copy(), method="iterative", estimator=DecisionTreeRegressor(random_state=42))
val_set = handle_outliers(val_set.copy(), method="iterative", estimator=DecisionTreeRegressor(random_state=42))

In [21]:
print("Rows before dropping duplicates:", len(train_set))
train_set.drop_duplicates(inplace=True)
print("Rows after dropping duplicates:", len(train_set))

train_set.reset_index(drop=True, inplace=True)

Rows before dropping duplicates: 77628
Rows after dropping duplicates: 77628


In [22]:

FEATURES_TO_DROP2 = [
'NoOfLettersInURL', 
'DomainLength', 
'DomainTitleMatchScore',
'CharContinuationRate',
'TLDLength',
'TLDLegitimateProb',
"NoOfSubDomain",
'NoOfObfuscatedChar',
"ObfuscationRatio",
'NoOfDegitsInURL',
'DegitRatioInURL',
'NoOfEqualsInURL',
'NoOfQMarkInURL',
'NoOfAmpersandInURL',
'NoOfPopUp'
]

In [23]:
train_set['IsHighURLMatchScore'] = train_set['URLTitleMatchScore'].apply(lambda x: 1 if x > 20 else 0)
val_set['IsHighURLMatchScore'] = val_set['URLTitleMatchScore'].apply(lambda x: 1 if x > 20 else 0)

In [24]:
train_set.dtypes

id                            float64
URLLength                     float64
DomainLength                  float64
IsDomainIP                    float64
CharContinuationRate          float64
TLDLegitimateProb             float64
URLCharProb                   float64
TLDLength                     float64
NoOfSubDomain                 float64
HasObfuscation                float64
NoOfObfuscatedChar            float64
ObfuscationRatio              float64
NoOfLettersInURL              float64
LetterRatioInURL              float64
NoOfDegitsInURL               float64
DegitRatioInURL               float64
NoOfEqualsInURL               float64
NoOfQMarkInURL                float64
NoOfAmpersandInURL            float64
NoOfOtherSpecialCharsInURL    float64
SpacialCharRatioInURL         float64
IsHTTPS                         int64
LineOfCode                    float64
LargestLineLength             float64
HasTitle                        int64
DomainTitleMatchScore         float64
URLTitleMatc

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd

class FeatureScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = RobustScaler()
        self.num_columns = None 
        self.url_title_column = 'URLTitleMatchScore'

    def fit(self, X, y=None):
        # Identify numerical columns excluding the 'URLTitleMatchScore' column and columns with nunique=2
        self.num_columns = [
            col for col in X.select_dtypes(include=['int64', 'float64']).columns
            if col != self.url_title_column and X[col].nunique() > 2
        ]
        
        # Fit the RobustScaler to the numerical columns
        self.scaler.fit(X[self.num_columns])
        return self

    def transform(self, X):
        # Copy the DataFrame to avoid modifying the original
        X_transformed = X.copy()
        
        # Apply RobustScaler to the numerical columns
        if self.num_columns:  # Ensure there are columns to scale
            X_transformed[self.num_columns] = self.scaler.transform(X[self.num_columns])
        
        # Apply Log Transformation to 'URLTitleMatchScore'
        if self.url_title_column in X_transformed.columns:
            X_transformed[self.url_title_column] = X_transformed[self.url_title_column].apply(
                lambda x: np.log1p(x) if pd.notna(x) and x >= 0 else np.nan
            )
        
        return X_transformed

In [26]:
from imblearn.over_sampling import SMOTE

def balance_classes(X_t, y_t):
    sm = SMOTE(random_state=42)
    X_t_res, y_t_res = sm.fit_resample(X_t, y_t)
    return X_t_res, y_t_res

In [27]:
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class PCATransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=None):
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components)
        self.num_columns = None
        self.pca_columns = None

    def fit(self, X, y=None):
        # Identify numerical columns
        numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
        
        # Filter for continuous columns (unique values > 2)
        self.num_columns = [col for col in numerical_cols if X[col].nunique() > 2]
        
        # Fit PCA on the selected numerical columns
        self.pca.fit(X[self.num_columns])
        
        # Create column names for PCA components
        self.pca_columns = [f'PCA_{i+1}' for i in range(self.pca.n_components_)]
        return self

    def transform(self, X):
        """
        Apply PCA transformation to the selected columns and return the modified DataFrame.
        """
        X_transformed = X.copy()
        
        # Apply PCA to the selected numerical columns
        pca_result = self.pca.transform(X[self.num_columns])
        
        # Create a DataFrame for PCA components
        pca_df = pd.DataFrame(pca_result, columns=self.pca_columns, index=X.index)
        
        # Drop the original numerical columns and concatenate PCA components
        X_transformed = X_transformed.drop(columns=self.num_columns, errors='ignore')
        X_transformed = pd.concat([X_transformed, pca_df], axis=1)
        
        return X_transformed

In [28]:
from sklearn.pipeline import Pipeline

# Note: You can add or delete preprocessing components from this pipeline

pipe = Pipeline([("pca", PCATransformer()),
                ("dropper",FeatureDropper(FEATURES_TO_DROP2)),
                 ("scaler", FeatureScaler()),
])

train_set = pipe.fit_transform(train_set)
val_set = pipe.transform(val_set)

In [29]:
def match_columns(train, test):
    # Get list of columns in training set
    train_cols = train.columns.tolist()
    
    # Get list of columns in test set
    test_cols = test.columns.tolist()
    
    # Remove any columns in test set that aren't in training set
    for col in test_cols:
        if col not in train_cols:
            test = test.drop(col, axis=1)
    
    # Add any missing columns to test set and fill with 0
    for col in train_cols:
        if col not in test_cols:
            test[col] = 0
    
    # Reorder columns in test set to match training set
    test = test[train_cols]
    
    # Return modified test set
    return test

In [30]:
# Match the columns
val_set = match_columns(train_set, val_set)

In [31]:
train_set.columns = train_set.columns.str.strip()

In [32]:

X_train = train_set.drop(['label'], axis=1)
y_train = train_set['label']

In [33]:
# Perform resampling
X_train, y_train = balance_classes(X_train, y_train)

In [34]:
X_test = val_set.drop(['label'], axis=1)
y_test = val_set['label']

In [35]:
output_path = "X_train_clean.csv"  # Nama file output
X_train.to_csv(output_path, index=False) 

output_path = "y_train_clean.csv"  # Nama file output
y_train.to_csv(output_path, index=False) 

output_path = "X_test_clean.csv"  # Nama file output
X_test.to_csv(output_path, index=False) 

output_path = "y_test_clean.csv"  # Nama file output
y_test.to_csv(output_path, index=False) 

In [36]:
X_train_clean = pd.read_csv('X_train_clean.csv')
y_train_clean = pd.read_csv('y_train_clean.csv')
X_test_clean = pd.read_csv('X_test_clean.csv')
y_test_clean = pd.read_csv('y_test_clean.csv')

In [37]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

class KNN:
    def __init__(self, k=3, metric='mixed', p=1):
        """
        metric='mixed' handles both numerical and categorical data.
        """
        self.k = k
        self.metric = metric
        self.p = p

    def fit(self, X, y, categorical_mask=None):
        """
        Fit the model with training data and labels.
        Parameters:
        - categorical_mask: A boolean array where True indicates a categorical column.
        """
        self.X_train = np.array(X, dtype=object)
        self.y_train = np.array(y, dtype=np.int32)

        if categorical_mask is None:
            self.categorical_mask = np.array([isinstance(v, str) for v in X.iloc[0]])
        else:
            self.categorical_mask = np.array(categorical_mask)

        self.numerical_mask = ~self.categorical_mask

    def _compute_distance(self, x1, x2):
        """
        Compute mixed distance for one sample.
        - Categorical distance: Hamming distance.
        - Numerical distance: Euclidean distance.
        """

        cat_distance = np.sum(x1[self.categorical_mask] != x2[self.categorical_mask])

        num_distance = np.sqrt(
            np.sum((x1[self.numerical_mask].astype(float) - x2[self.numerical_mask].astype(float)) ** 2)
        )

        # Combine distances
        return cat_distance + num_distance

    def _get_neighbors(self, x):
        """
        Find k-nearest neighbors for a single sample.
        """
        distances = [self._compute_distance(x, x_train) for x_train in self.X_train]
        neighbors_idx = np.argsort(distances)[:self.k]
        neighbors_distances = np.array(distances)[neighbors_idx]
        return neighbors_idx, neighbors_distances

    def _predict_single(self, x):
        """
        Predict the label for a single sample using weighted voting.
        """
        neighbors_idx, neighbors_distances = self._get_neighbors(x)
        neighbor_labels = self.y_train[neighbors_idx]

        weights = 1 / (neighbors_distances + 1e-5)
        weighted_votes = {}
        for label, weight in zip(neighbor_labels, weights):
            weighted_votes[label] = weighted_votes.get(label, 0) + weight

        return max(weighted_votes, key=weighted_votes.get)

    def predict(self, X_test):
        """
        Predict for the test set.
        """
        X_test = np.array(X_test, dtype=object)
        predictions = [self._predict_single(x) for x in X_test]
        return np.array(predictions)

    def predict_parallel(self, X_test, num_workers=4):
        """
        Parallelize predictions across multiple CPU cores.
        """
        X_test_split = np.array_split(X_test, num_workers)
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            results = list(executor.map(self.predict, X_test_split))
        return np.concatenate(results)


In [38]:
def classify_features(df):
    categorical_mask = []
    for column in df.columns:
        unique_values = df[column].unique()
        if len(unique_values) == 2 and set(unique_values).issubset({0, 1, 0.0, 1.0}):
            categorical_mask.append(True)
        elif df[column].dtype == 'object' or isinstance(unique_values[0], str):
            categorical_mask.append(True)
        else:
            categorical_mask.append(False)
    return categorical_mask
categorical_mask = classify_features(X_test)

print("Categorical Mask:", categorical_mask)
print(categorical_mask)

Categorical Mask: [True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


In [40]:
categorical_mask = [True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
knn = KNN(k=3, metric='mixed')
knn.fit(X_train_clean, y_train_clean, categorical_mask=categorical_mask)

predictions = knn.predict(X_test)
print("Predictions:", predictions)

TypeError: unhashable type: 'numpy.ndarray'

In [37]:
knn = KNN(k=5, metric='manhattan')
knn.fit(X_train, y_train)
preds = knn.predict(X_test)
# accuracy = knn.evaluate(X_test, y_test)
# print(accuracy)
# Dynamically determine unique classes
unique_classes = np.unique(y_test)
target_names = [f"Class {c}" for c in unique_classes]

# Generate detailed classification report
report = classification_report(y_test, preds, target_names=target_names)
print("Classification Report:")
print(report)

KeyboardInterrupt: 

In [None]:
import optuna

# Define the Optuna objective function
def objective(trial):
    # Suggest values for the hyperparameters
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski'])
    
    # Create and train the model
    model = KNN(
        n_neighbors=n_neighbors,
        metric=metric,
    )
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Return the accuracy as the objective to maximize
    return accuracy_score(y_test, y_pred)

# Create an Optuna study
study = optuna.create_study(direction='maximize')  # We want to maximize accuracy
study.optimize(objective, n_trials=50)  # Adjust n_trials for exploration depth

# Print the best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

[I 2024-12-19 18:43:17,359] A new study created in memory with name: no-name-fded2a58-0fbe-4d59-b5af-63dc819010f4
[I 2024-12-19 18:43:17,374] Trial 0 finished with value: 0.8859649122807017 and parameters: {'n_neighbors': 4, 'weights': 'distance', 'metric': 'minkowski', 'p': 2, 'algorithm': 'kd_tree', 'leaf_size': 30}. Best is trial 0 with value: 0.8859649122807017.
[I 2024-12-19 18:43:17,390] Trial 1 finished with value: 0.9035087719298246 and parameters: {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'kd_tree', 'leaf_size': 10}. Best is trial 1 with value: 0.9035087719298246.
[I 2024-12-19 18:43:17,432] Trial 2 finished with value: 0.8859649122807017 and parameters: {'n_neighbors': 16, 'weights': 'uniform', 'metric': 'minkowski', 'p': 3, 'algorithm': 'brute', 'leaf_size': 20}. Best is trial 1 with value: 0.9035087719298246.
[I 2024-12-19 18:43:17,449] Trial 3 finished with value: 0.9122807017543859 and parameters: {'n_neighbors': 4, 'weights': 'uniform',

Best Parameters: {'n_neighbors': 4, 'weights': 'uniform', 'metric': 'manhattan', 'algorithm': 'ball_tree', 'leaf_size': 40}
Best Accuracy: 0.9210526315789473
