In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/malware-analysis-hackathon/training_network_trails
/kaggle/input/malware-analysis-hackathon/evaluation_os_trails
/kaggle/input/malware-analysis-hackathon/evaluation_network_trails
/kaggle/input/malware-analysis-hackathon/training_os_trails
/kaggle/input/malware-analysis-hackathon/evaluation_hardware_trails
/kaggle/input/malware-analysis-hackathon/training_hardware_trails


In [47]:
#### Read the input

# Locations of the training trails, one file per data source.
training_network_file = "/kaggle/input/malware-analysis-hackathon/training_network_trails"
training_os_file = "/kaggle/input/malware-analysis-hackathon/training_os_trails"
training_hardware_file = "/kaggle/input/malware-analysis-hackathon/training_hardware_trails"

# Locations of the evaluation trails, one file per data source.
test_network_file = "/kaggle/input/malware-analysis-hackathon/evaluation_network_trails"
test_os_file = "/kaggle/input/malware-analysis-hackathon/evaluation_os_trails"
test_hardware_file = "/kaggle/input/malware-analysis-hackathon/evaluation_hardware_trails"

# Load the training frames (network, OS, hardware — in that order).
network_train, os_train, hardware_train = (
    pd.read_csv(path)
    for path in (training_network_file, training_os_file, training_hardware_file)
)

# Load the evaluation frames in the same order.
network_eval, os_eval, hardware_eval = (
    pd.read_csv(path)
    for path in (test_network_file, test_os_file, test_hardware_file)
)

# raw_data = pd.read_csv("/kaggle/input/malware-detection/Malware dataset.csv")
# raw_data.head()

In [48]:
# Remove the endpoint columns from BOTH network frames so the training and
# evaluation feature sets stay identical (the scaler and the model are fitted
# on the training columns and then applied to the evaluation columns).
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of a KeyError.
NETWORK_DROP_COLS = ['Dest_IP', 'Dest_P', 'Src_P']
network_train = network_train.drop(columns=NETWORK_DROP_COLS, errors='ignore')
network_eval = network_eval.drop(columns=NETWORK_DROP_COLS, errors='ignore')

In [49]:
# data['classification'] = data.classification.map({'benign':0, 'malware':1})
# data.head()

In [57]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Preprocessing function
def preprocess_data(df):
    """Impute numeric columns and label-encode non-numeric columns.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called repeatedly on the same raw data.

    Args:
        df: pandas DataFrame to preprocess.

    Returns:
        A ``(processed_df, label_encoders)`` tuple:
          * ``processed_df`` - copy of ``df`` where missing numeric values are
            replaced by the column mean and every non-numeric, non-boolean
            column is replaced by integer codes.
          * ``label_encoders`` - dict mapping each encoded column name to the
            ``LabelEncoder`` fitted on it (usable for ``inverse_transform``).

    Note:
        Encoders are fitted from scratch on every call, so the integer codes
        produced for two different frames are not comparable with each other.
    """
    # Never mutate the caller's DataFrame.
    df = df.copy()

    numeric_cols = df.select_dtypes(include=['number']).columns
    # Everything that is neither numeric nor boolean is treated as categorical
    # (object, category, string, datetime, ...), not just `object` dtype.
    non_numeric_cols = df.select_dtypes(exclude=['number', 'bool']).columns

    # Fill missing values in numeric columns with the column mean.
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Encode categorical columns. Values are cast to str first, so a missing
    # value becomes its own category (the literal string 'nan').
    label_encoders = {}
    for col in non_numeric_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le  # Kept so codes can be mapped back to labels

    return df, label_encoders

In [77]:
# Neural Network model definition
def build_model(input_dim, output_dim):
    """Build and compile a fully connected softmax classifier.

    Architecture: Dense(128) -> Dropout(0.3) -> Dense(64) -> Dropout(0.3)
    -> Dense(32) -> Dropout(0.2) -> Dense(output_dim, softmax), with ReLU
    activations on the hidden layers.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of classes (units in the softmax output layer).

    Returns:
        A compiled Keras ``Sequential`` model using Adam (learning rate 0.001),
        ``sparse_categorical_crossentropy`` loss (integer class labels) and
        accuracy as the reported metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the deprecated `input_dim=` keyword on the first Dense layer.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),  # Dropout layers reduce overfitting
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(output_dim, activation='softmax'),  # One probability per class
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [78]:
# Train and Predict Function
def _encode_eval_features(df_eval, train_features, label_encoders):
    """Return evaluation features shaped and encoded exactly like training.

    Args:
        df_eval: raw evaluation DataFrame (not modified).
        train_features: preprocessed training feature frame (target removed).
        label_encoders: dict of column name -> LabelEncoder fitted on the
            training data.

    Returns:
        A new DataFrame with the same columns, in the same order, as
        ``train_features``.
    """
    # Same columns, same order as training: extra evaluation columns (including
    # a target column, if present) are dropped and columns missing from the
    # evaluation data appear as all-NaN.
    eval_features = df_eval.reindex(columns=train_features.columns).copy()

    for col in eval_features.columns:
        encoder = label_encoders.get(col)
        if encoder is None:
            # Not label-encoded in training: coerce unparseable values to NaN
            # so they are imputed below.
            eval_features[col] = pd.to_numeric(eval_features[col], errors='coerce')
        else:
            # Reuse the codes learned on the training data so a category means
            # the same integer in both frames; unseen categories map to -1.
            codes = {label: code for code, label in enumerate(encoder.classes_)}
            eval_features[col] = (
                eval_features[col].astype(str).map(codes).fillna(-1).astype(int)
            )

    # Impute remaining gaps with training statistics, not evaluation ones.
    return eval_features.fillna(train_features.mean(numeric_only=True))


def train_and_predict(df_train, df_eval, target_column, eval_ids, csv_filename,
                      num_classes=9, epochs=20, batch_size=64):
    """Train a classifier on one dataset and write its evaluation predictions.

    Args:
        df_train: training DataFrame containing ``target_column``.
        df_eval: evaluation DataFrame; a target column is not required.
        target_column: name of the label column in ``df_train``.
        eval_ids: identifiers written to the 'Id' column, one per evaluation row.
        csv_filename: path of the CSV file to write.
        num_classes: size of the softmax output layer (default 9).
        epochs: number of training epochs (default 20).
        batch_size: training batch size (default 64).

    Returns:
        The DataFrame that was written, with columns 'Id' and 'Goal'. 'Goal'
        holds the integer class index with the highest predicted probability.

    Note:
        When the target column is non-numeric, the class indices are the codes
        assigned by the LabelEncoder that ``preprocess_data`` fits on this
        training frame.
        NOTE(review): those codes line up with a shared code -> label table
        only if every training frame contains the same set of labels - confirm.
    """
    # Preprocess a copy so the caller's training frame is never mutated.
    df_train, label_encoders = preprocess_data(df_train.copy())

    # Separate features and target for training
    train_features = df_train.drop(columns=[target_column])
    y_train = df_train[target_column]

    # Encode/align the evaluation data with the *training* encoders and
    # columns instead of fitting independent encoders on it.
    eval_features = _encode_eval_features(df_eval, train_features, label_encoders)

    # Scale the data (statistics come from the training set only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_features)
    X_eval = scaler.transform(eval_features)

    # Build and train the model
    model = build_model(input_dim=X_train.shape[1], output_dim=num_classes)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=0.2)

    # Predict on the evaluation set and keep the most probable class per row
    y_eval_pred = model.predict(X_eval)
    y_eval_pred_classes = y_eval_pred.argmax(axis=1)

    # Create DataFrame for the output
    output_df = pd.DataFrame({
        'Id': eval_ids,
        'Goal': y_eval_pred_classes
    })

    # Save to CSV
    output_df.to_csv(csv_filename, index=False)
    print(f"Predictions saved to {csv_filename}")
    return output_df

In [79]:
# Save one set of predictions (Id, Goal) to "<type_>.csv"
def save_predictions(eval_ids, y_eval_pred, type_):
    """Write an Id/Goal prediction table to ``<type_>.csv``.

    Args:
        eval_ids: values for the 'Id' column.
        y_eval_pred: values for the 'Goal' column, aligned with ``eval_ids``.
        type_: file name stem; ".csv" is appended to form the output path.
    """
    output_filename = f"{type_}.csv"

    # Two-column table, written without the DataFrame index.
    table = {'Id': eval_ids, 'Goal': y_eval_pred}
    pd.DataFrame(table).to_csv(output_filename, index=False)

    print(f"Predictions saved to {output_filename}")


In [74]:
# Combine CSVs Function
def combine_csvs(csv_files, output_filename):
    """Stack several CSV files row-wise and write the result to one CSV.

    Args:
        csv_files: paths of the CSV files to read, in the order their rows
            should appear in the output.
        output_filename: path of the combined CSV to write.
    """
    frames = []
    for path in csv_files:
        frames.append(pd.read_csv(path))

    # ignore_index renumbers the rows; the index is not written out anyway.
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(output_filename, index=False)

    print(f"Combined CSV saved to {output_filename}")

# Evaluation Ids are consecutive integers starting at 1, assigned across the
# three evaluation sets in the order hardware -> network -> OS.
n_hardware = len(hardware_eval)
n_network = len(network_eval)
n_os = len(os_eval)

first_network_id = n_hardware + 1
first_os_id = first_network_id + n_network

eval_ids_h = pd.Series(range(1, first_network_id))
eval_ids_n = pd.Series(range(first_network_id, first_os_id))
eval_ids_o = pd.Series(range(first_os_id, first_os_id + n_os))

# One (training frame, evaluation frame, Ids, output file) entry per dataset.
prediction_jobs = [
    (hardware_train, hardware_eval, eval_ids_h, 'hardware_predictions.csv'),
    (network_train, network_eval, eval_ids_n, 'network_predictions.csv'),
    (os_train, os_eval, eval_ids_o, 'os_predictions.csv'),
]

# Train one model per dataset and write its predictions.
for train_frame, eval_frame, ids, predictions_csv in prediction_jobs:
    train_and_predict(train_frame, eval_frame, 'goal', ids, predictions_csv)

# Combine all per-dataset CSVs into one final CSV
combine_csvs([job[-1] for job in prediction_jobs], 'final_predictions.csv')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predictions saved to hardware_predictions.csv
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predictions saved to network_predictions.csv
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predictions saved to os_predictions.csv
Combined CSV saved to final_predictions.csv


In [76]:
# Lookup table from numeric class code (0-8) to goal name.
goal_mapping = dict(enumerate([
    'backdoor',
    'banker',
    'cryptominer',
    'deceptor',
    'downloader',
    'normal',
    'pua',
    'ransomware',
    'spyware',
]))

# Load the combined predictions, replace the numeric 'Goal' codes with their
# names, and write the labelled table to a new CSV.
combined_df = pd.read_csv("final_predictions.csv").assign(
    Goal=lambda frame: frame['Goal'].map(goal_mapping)
)
combined_df.to_csv('combined_predictons_with_goals.csv', index=False)

In [None]:
# corrMatrix = data.corr()
# sns.heatmap(corrMatrix, annot=True)
# plt.show()