# In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import joblib

# Load Sigma rules data
sigma_rules_data = pd.read_csv("sigma_rules_data.csv")  # Replace "sigma_rules_data.csv" with your dataset path

# Preprocessing
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases read the 'punkt' resource, while recent releases (3.9 and later)
# look up 'punkt_tab' instead, so both are fetched to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a description into a space-separated string of tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in the module-level
    ``stop_words`` set, are dropped.

    Args:
        text: Description to clean. ``None`` and float NaN (what
            ``pd.read_csv`` produces for an empty cell) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when the
        input is missing or no token survives the filter.
    """
    # Missing values would otherwise crash on ``.lower()``; ``text != text``
    # is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    tokens = nltk.word_tokenize(str(text).lower())
    return " ".join(
        token for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Clean every rule description before it is vectorized
sigma_rules_data['processed_description'] = sigma_rules_data['description'].map(preprocess_text)

# Targets are the Sigma rule labels; inputs are TF-IDF features of the cleaned text
y = sigma_rules_data['sigma_rule']
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(sigma_rules_data['processed_description'])

# Fit the classifier on the already-vectorized features.
# The pipeline holds only the classifier step, so the vectorizer has to be
# applied separately to any text passed to the model later.
model = Pipeline(steps=[('clf', LogisticRegression())])  # Example classifier, you can replace it with any other classifier
model.fit(X, y)

# Persist both fitted artifacts to disk
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(model, 'sigma_rule_model.joblib')

# Reload them from disk, as a separate inference script would
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
model = joblib.load('sigma_rule_model.joblib')

def predict_sigma_rule(description):
    """Predict the Sigma rule label for a free-text description.

    The description goes through the same ``preprocess_text`` cleaning used
    for the training data, is vectorized with the module-level
    ``tfidf_vectorizer``, and is classified by the module-level ``model``.

    Args:
        description: Text describing the activity to match against a rule.

    Returns:
        The first (and only) entry of the model's prediction for this input.
    """
    cleaned = preprocess_text(description)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    predictions = model.predict(features)
    return predictions[0]

# Example usage
# Classify one sample description with predict_sigma_rule and print the result.
input_description = "Detects suspicious activity in system logs."
predicted_sigma_rule = predict_sigma_rule(input_description)
# NOTE: the classifier was fitted on the 'sigma_rule' column, so despite the
# "Generated" wording below, the value printed is a label picked from the
# training data rather than newly generated rule text.
print("Generated Sigma Rule:", predicted_sigma_rule)
