In [72]:
import pandas as pd
import pickle
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [73]:
def preprocess_code(code):
    """Normalise a source-code snippet into a flat, lower-case string.

    The input is coerced with ``str()`` and then passed through four regex
    substitutions, each replacing its match with a single space:

    1. ``/* ... */`` block comments (may span lines),
    2. every run of whitespace,
    3. anything between a ``<`` and the next ``>``,
    4. a fixed list of keywords (``package``, ``import``, ``public``, ...).

    Finally the text is lower-cased and surrounding whitespace is trimmed.

    Args:
        code: The snippet to clean; any object is accepted and stringified.

    Returns:
        str: The cleaned snippet.

    NOTE(review): presumably this mirrors the preprocessing applied when the
    pickled vectorizer was fitted -- confirm before changing any pattern.
    """
    text = str(code)

    # The order is significant: whitespace is collapsed *before* the `<...>`
    # pass, so by then the text is a single line and `.` can reach across
    # what used to be line breaks.
    # NOTE(review): `<.*?>` also swallows text between comparison operators
    # (e.g. `a < b ... c > d`), not just `<header.h>` / markup tags.
    # NOTE(review): the keyword pass is case-sensitive and runs before
    # lower-casing, so `Public` / `PHP` survive it.
    substitutions = (
        (r'/\*.*?\*/', re.DOTALL),
        (r'\s+', 0),
        (r'<.*?>', 0),
        (r'\b(?:package|import|public|class|function|php|<\?php|\?>)\b', 0),
    )
    for pattern, flags in substitutions:
        text = re.sub(pattern, ' ', text, flags=flags)

    return text.lower().strip()


In [74]:
# Restore the two pickled artifacts used by every prediction cell below:
# `rf` (the classifier) and `vectorizer` (turns cleaned text into features).
# Paths are relative to the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifact
# files that come from a trusted source.
with open('../static/model/model.pickle', 'rb') as model_handle:
    rf = pickle.load(model_handle)

with open('../static/model/vectorizer.pickle', 'rb') as vectorizer_handle:
    vectorizer = pickle.load(vectorizer_handle)

print("✅ Model and vectorizer loaded successfully!")



✅ Model and vectorizer loaded successfully!


In [75]:
def get_prediction(vectorized_text):
    """Classify one vectorized sample with the loaded model ``rf``.

    Args:
        vectorized_text: Feature input accepted by ``rf.predict``; the
            callers in this notebook pass the vectorizer output for a
            single snippet.

    Returns:
        str: ``'Vulnerable'`` when the predicted label equals 1,
        otherwise ``'Safe'``.
    """
    predicted_label = rf.predict(vectorized_text)
    # The comparison is evaluated for truthiness directly on the predict()
    # result, so this is intended for one sample at a time.
    return 'Vulnerable' if predicted_label == 1 else 'Safe'


In [76]:
# Hand-written smoke-test snippets, keyed by a human-readable label.
# The label states the expected verdict ("Safe ..." / "Vulnerable ...") so the
# printed predictions can be checked by eye; the values are raw source text
# that still has to go through preprocess_code() and the vectorizer.
test_samples = {
    "Safe Python": """
    def add(a, b):
        return a + b

    print(add(2, 3))
    """,
    "Vulnerable C (buffer overflow)": """
    #include <stdio.h>
    #include <string.h>

    int main() {
        char buffer[10];
        strcpy(buffer, "AAAAAAAAAAAAAAAAAAAA");
        return 0;
    }
    """,
    "Vulnerable C (format string)": """
    #include <stdio.h>

    int main(int argc, char *argv[]) {
        printf(argv[1]);
        return 0;
    }
    """,
    "Safe C": """
    #include <stdio.h>

    int main() {
        printf("Hello, World!");
        return 0;
    }
    """,
    "Vulnerable C (gets)": """
    #include <stdio.h>

    int main() {
        char name[50];
        gets(name);
        printf("Hello %s", name);
        return 0;
    }
    """
}


In [77]:
# Run every smoke-test snippet through the pipeline
# (clean -> vectorize -> classify) and print one "<label>: <verdict>" line each.
print("\n🔍 Batch Test Predictions:\n")
for label, snippet in test_samples.items():
    # transform() is given a one-element list: one document per call.
    features = vectorizer.transform([preprocess_code(snippet)])
    prediction = get_prediction(features)
    print(f"{label}: {prediction}")



🔍 Batch Test Predictions:

Safe Python: Safe
Vulnerable C (buffer overflow): Vulnerable
Vulnerable C (format string): Vulnerable
Safe C: Safe
Vulnerable C (gets): Vulnerable


## Optional: Threshold-Based Prediction

In [78]:
def get_prediction_with_threshold(vectorized_text, threshold=0.7):
    """Classify one vectorized sample using a probability cut-off.

    Args:
        vectorized_text: Feature input accepted by ``rf.predict_proba``.
        threshold: Cut-off applied to the score; the sample is reported as
            vulnerable only when the score is strictly greater than this
            value. Defaults to 0.7.

    Returns:
        str: ``'Vulnerable'`` or ``'Safe'``.
    """
    class_probabilities = rf.predict_proba(vectorized_text)
    # Only the first row is inspected; column 1 is taken as the score.
    # NOTE(review): assumes column 1 corresponds to the "vulnerable" label
    # (i.e. rf.classes_ is [0, 1]) -- confirm against the trained model.
    vulnerable_score = class_probabilities[0][1]
    return 'Vulnerable' if vulnerable_score > threshold else 'Safe'


In [79]:
# Same batch run as the earlier prediction cell, but a snippet is only
# reported as vulnerable when its score exceeds the 0.7 cut-off.
print("\n🔍 Batch Test Predictions (with threshold):\n")
for label, snippet in test_samples.items():
    # transform() is given a one-element list: one document per call.
    features = vectorizer.transform([preprocess_code(snippet)])
    prediction = get_prediction_with_threshold(features, threshold=0.7)
    print(f"{label}: {prediction}")



🔍 Batch Test Predictions (with threshold):

Safe Python: Safe
Vulnerable C (buffer overflow): Vulnerable
Vulnerable C (format string): Vulnerable
Safe C: Safe
Vulnerable C (gets): Vulnerable


In [80]:
# NOTE(review): this re-defines get_prediction exactly as it was defined in an
# earlier cell of this notebook; the later definition shadows the earlier one.
# Consider keeping a single definition.
def get_prediction(vectorized_text):
    """Return ``'Vulnerable'`` or ``'Safe'`` for one vectorized sample.

    Args:
        vectorized_text: Feature input accepted by ``rf.predict``.

    Returns:
        str: ``'Vulnerable'`` when the model's predicted label equals 1,
        ``'Safe'`` for anything else.
    """
    model_output = rf.predict(vectorized_text)
    # Guard-clause form: anything other than label 1 is reported as safe.
    if not (model_output == 1):
        return 'Safe'
    return 'Vulnerable'


In [81]:
# Single-snippet example: a C program that copies a 20-character literal into
# a 10-byte buffer with strcpy.
txt = """
#include <stdio.h>
#include <string.h>

int main() {
    char buffer[10];
    strcpy(buffer, "AAAAAAAAAAAAAAAAAAAA");
    return 0;
}
"""

# Clean -> vectorize (one-document list) -> classify, then show the verdict.
vectorized_txt = vectorizer.transform([preprocess_code(txt)])
print(get_prediction(vectorized_txt))


Vulnerable


# 📦 Final Notes on Prediction Pipeline

✅ This notebook implements a complete prediction pipeline using the trained Random Forest model and TF-IDF vectorizer.

### ✨ Key Features

- **Batch Test Predictions**  
  Runs multiple test samples through the pipeline and outputs `Safe` or `Vulnerable` predictions.

- **Threshold-Based Prediction (Optional)**  
  Uses the model's probability score (`predict_proba`) to apply a confidence threshold.  
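  Example: With a threshold of 0.7, a sample is labeled `Vulnerable` only if the model's predicted probability for the vulnerable class is greater than 0.7 (i.e. it is more than 70% confident); otherwise it is labeled `Safe`.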

### 💡 Why Use Threshold-Based Prediction?

- Reduces false positives by requiring high confidence before predicting `Vulnerable`, at the cost of possibly missing vulnerabilities the model is less sure about.
- Helps balance between sensitivity (detecting vulnerabilities) and specificity (avoiding false alarms).
- Allows you to adjust the system for stricter or looser detection based on your use case.

### 📊 Notes on Results

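- **Good performance on Python, C, and web languages** (only the Python and C samples are demonstrated in this notebook; web-language samples are not exercised here)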
- **Buffer overflow and format string vulnerabilities detected well**
- **The `gets()` sample above is detected, but some other C `gets()` cases may need more training data to improve detection**

### 🔧 Recommendations

- Keep tuning the threshold value (`0.5`, `0.6`, `0.7`) based on testing.
- Consider adding more safe and vulnerable C samples to improve edge-case handling.
- Optionally integrate this pipeline into a web app for user-friendly predictions.



