In [1]:
import os
import random

dataset_path = "C:/Users/bhosl/OneDrive/Desktop/sdddi/dataset"

base_snippets = {
    "Python": [
        'print("Hello, World!")',
        'for i in range({n}): print(i)',
        'def square(x): return x * x',
        'import math\nprint(math.{func})',
        'if {a} > {b}: print("Greater")',
        'x = [{n} * i for i in range({m})]',
        'try:\n    1/0\nexcept:\n    print("Error")',
        'class Person:\n    def __init__(self, name):\n        self.name = name',
        'lambda x: x + {n}',
        'with open("file.txt") as f:\n    print(f.read())'
    ],
    "C": [
        '#include <stdio.h>\nint main() { printf("Hi {n}!"); return 0; }',
        'int add(int a, int b) { return a + b; }',
        'for(int i=0; i<{n}; i++) printf("%d", i);',
        'char name[] = "C Lang {n}";',
        'if ({a} > {b}) printf("True");',
        'int x = {a}, y = {b};',
        '#define PI {pi}',
        'while({n}) break;',
        'void greet() { printf("Hey {n}!"); }',
        '#include <math.h>\nprintf("%f", sqrt({n}));'
    ],
    "Java": [
        'public class HelloWorld {{ public static void main(String[] args) {{ System.out.println("Hello {n}"); }} }}',
        'int x = {a} + {b};',
        'for(int i=0; i<{n}; i++) System.out.println(i);',
        'String name = "Java {n}";',
        'if({a} < {b}) System.out.println("Yes");',
        'int[] arr = new int[] {{{a},{b},{n}}};',
        'try {{ int x = 1/0; }} catch(Exception e) {{ System.out.println("Err"); }}',
        'Scanner sc = new Scanner(System.in);',
        'public static void greet() {{ System.out.println("Hey!"); }}',
        'double pi = Math.PI * {n};'
    ],
    "JavaScript": [
        'console.log("Hello {n}!");',
        'let x = {a} + {b};',
        'for(let i=0; i<{n}; i++) console.log(i);',
        'function add(a, b) {{ return a + b + {n}; }}',
        'if ({a} > {b}) console.log("Yes");',
        'const arr = [{a},{b},{n}];',
        'document.write("JS {n}");',
        'let obj = {{name: "Lang {n}"}};',
        'setTimeout(() => console.log("Waited {n}"), 1000);',
        'window.alert("Hey {n}");'
    ],
    "CSS": [
        'body {{ background: #{color}; }}',
        'h1 {{ color: #{color}; }}',
        'p {{ font-size: {n}px; }}',
        '.box{n} {{ padding: {n}px; }}',
        '#main{n} {{ border: {n}px solid #{color}; }}',
        '* {{ margin: {a}px; }}',
        'div:hover {{ color: #{color}; }}',
        'input:focus {{ outline: none; }}',
        'ul li {{ list-style: none; }}',
        'a {{ text-decoration: none; color: #{color}; }}'
    ]
}

def get_random_params():
    return {
        "n": random.randint(1, 100),
        "m": random.randint(1, 20),
        "a": random.randint(1, 50),
        "b": random.randint(1, 50),
        "pi": round(random.uniform(3.1, 3.2), 2),
        "func": random.choice(["pi", "sqrt", "log", "sin"]),
        "color": ''.join(random.choices('0123456789ABCDEF', k=6))
    }

for language, templates in base_snippets.items():
    folder = os.path.join(dataset_path, language)
    os.makedirs(folder, exist_ok=True)
    
    ext = {
        "Python": "py",
        "C": "c",
        "Java": "java",
        "JavaScript": "js",
        "CSS": "css"
    }[language]

    for i in range(50):
        template = random.choice(templates)
        params = get_random_params()
        code = template.format(**params)
        file_path = os.path.join(folder, f"example_{i+1}.{ext}")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(code)

print("250 Code files generated! (50 per language × 5)")


KeyError: ' return a + b; '

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report

In [2]:
dataset_path = "C:/Users/bhosl/OneDrive/Desktop/sdddi/dataset"
data = []

In [3]:
for language in os.listdir(dataset_path):
    language_folder = os.path.join(dataset_path, language)
    if not os.path.isdir(language_folder):
        continue  # Skip if it's not a folder

    # Read every file inside that folder
    for file_name in os.listdir(language_folder):
        file_path = os.path.join(language_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            code = file.read()
            data.append((code, language))  # Store (code, language)

In [4]:
df = pd.DataFrame(data, columns=["code", "language"])

In [5]:
vectorizer = TfidfVectorizer(max_features=1000)  
X = vectorizer.fit_transform(df["code"]).toarray()
y = df["language"]  # Labels (Python, Java, etc.)
feature_names = vectorizer.get_feature_names_out()


In [6]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.51134331],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(300, 109))

In [7]:
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = feature_names[selector.get_support()]

print("\n✅ Top 10 most useful words (features) from the code:")
print(selected_features)


✅ Top 10 most useful words (features) from the code:
['class' 'console' 'float' 'log' 'out' 'print' 'printf' 'println' 'system'
 'void']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [9]:
print("\n Model Performance on Test Data:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


 Model Performance on Test Data:
              precision    recall  f1-score   support

           C       1.00      1.00      1.00        10
         CSS       0.00      0.00      0.00        12
        Java       1.00      1.00      1.00         7
  JavaScript       1.00      0.78      0.88         9
      Python       0.61      1.00      0.76        22

    accuracy                           0.77        60
   macro avg       0.72      0.76      0.73        60
weighted avg       0.66      0.77      0.69        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
def explain_prediction(code_snippet):
    print("\n You Entered:\n", code_snippet)

    # Convert user input to same vector format
    input_vector = vectorizer.transform([code_snippet]).toarray()
    input_selected = selector.transform(input_vector)

    # Predict and get probabilities
    prediction = model.predict(input_selected)[0]
    probabilities = model.predict_proba(input_selected)[0]

    print(f"\n Predicted Language: {prediction}")
    print("\n Confidence for each language:")
    for lang, prob in zip(model.classes_, probabilities):
        bar = "" * int(prob * 20)
        print(f"{lang:<12}: {prob:.4f} {bar}")

In [11]:
def explain_prediction(code_snippet):
    print("\n You Entered:\n", code_snippet)

    # Convert user input to same vector format
    input_vector = vectorizer.transform([code_snippet]).toarray()
    input_selected = selector.transform(input_vector)

    # Predict and get probabilities
    prediction = model.predict(input_selected)[0]
    probabilities = model.predict_proba(input_selected)[0]

    print(f"\n Predicted Language: {prediction}")
    print("\n Confidence for each language:")
    for lang, prob in zip(model.classes_, probabilities):
        bar = "" * int(prob * 20)
        print(f"{lang:<12}: {prob:.4f} {bar}")

    # Highlight important words
    print("\n Important words found in your code:")
    input_words = code_snippet.split()
    important_words = feature_names[selector.get_support()]
    matched = False
    for word in input_words:
        if word in important_words:
            print(f" '{word}' is an important feature")
            matched = True
    if not matched:
        print("ℹ️ No important words matched from selected features.")

In [12]:
def predict_language_from_input():
    user_input = input("\n📝 Paste your code snippet below:\n\n")
    explain_prediction(user_input)

#  Start the prediction process
predict_language_from_input()


 You Entered:
 print("helo")

 Predicted Language: Python

 Confidence for each language:
C           : 0.0019 
CSS         : 0.0020 
Java        : 0.0017 
JavaScript  : 0.0017 
Python      : 0.9927 

 Important words found in your code:
ℹ️ No important words matched from selected features.


In [2]:
# 📦 Import required libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report

# 1️⃣ Load all code files from folders
dataset_path = "C:/Users/bhosl/OneDrive/Desktop/sdddi/dataset"
data = []

# Go through each folder (like Python, Java, etc.)
for language in os.listdir(dataset_path):
    language_folder = os.path.join(dataset_path, language)
    if not os.path.isdir(language_folder):
        continue  # Skip if it's not a folder

    # Read every file inside that folder
    for file_name in os.listdir(language_folder):
        file_path = os.path.join(language_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            code = file.read()
            data.append((code, language))  # Store (code, language)

# Create a table from the collected data
df = pd.DataFrame(data, columns=["code", "language"])

# 2️⃣ Convert code to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to 1000 tokens
X = vectorizer.fit_transform(df["code"]).toarray()
y = df["language"]  # Labels (Python, Java, etc.)
feature_names = vectorizer.get_feature_names_out()

# 3️⃣ Use ANOVA to select the 10 best features for classification
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = feature_names[selector.get_support()]

print("\n✅ Top 10 most useful words (features) from the code:")
print(selected_features)

# 4️⃣ Train a Logistic Regression model to classify languages
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 5️⃣ Show how well the model performs
print("\n📈 Model Performance on Test Data:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 6️⃣ Function to explain the prediction based on user input
def explain_prediction(code_snippet):
    print("\n📥 You Entered:\n", code_snippet)

    # Convert user input to same vector format
    input_vector = vectorizer.transform([code_snippet]).toarray()
    input_selected = selector.transform(input_vector)

    # Predict and get probabilities
    prediction = model.predict(input_selected)[0]
    probabilities = model.predict_proba(input_selected)[0]

    print(f"\n🧠 Predicted Language: {prediction}")
    print("\n📊 Confidence for each language:")
    for lang, prob in zip(model.classes_, probabilities):
        bar = "🟩" * int(prob * 20)
        print(f"{lang:<12}: {prob:.4f} {bar}")

    # Highlight important words
    print("\n🔍 Important words found in your code:")
    input_words = code_snippet.split()
    important_words = feature_names[selector.get_support()]
    matched = False
    for word in input_words:
        if word in important_words:
            print(f"✅ '{word}' is an important feature")
            matched = True
    if not matched:
        print("ℹ️ No important words matched from selected features.")

# 7️⃣ Function to ask user for input and predict language
def predict_language_from_input():
    user_input = input("\n📝 Paste your code snippet below:\n\n")
    explain_prediction(user_input)

# 🚀 Start the prediction process
predict_language_from_input()



✅ Top 10 most useful words (features) from the code:
['class' 'console' 'log' 'out' 'pi' 'print' 'printf' 'println' 'system'
 'void']

📈 Model Performance on Test Data:
              precision    recall  f1-score   support

           C       1.00      1.00      1.00        13
         CSS       0.70      1.00      0.82         7
        Java       1.00      1.00      1.00        10
  JavaScript       1.00      0.91      0.95        11
      Python       1.00      0.78      0.88         9

    accuracy                           0.94        50
   macro avg       0.94      0.94      0.93        50
weighted avg       0.96      0.94      0.94        50


📥 You Entered:
 print("Hello, World!")

🧠 Predicted Language: Python

📊 Confidence for each language:
C           : 0.0195 
CSS         : 0.0268 
Java        : 0.0159 
JavaScript  : 0.0188 
Python      : 0.9190 🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩

🔍 Important words found in your code:
ℹ️ No important words matched from selected features.
