In [3]:
print("Setting up the AI Code Detector (Safe Demo Mode with Feature Summary)...")
# libraries
import pandas as pd
import numpy as np
import ast
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import ipywidgets as widgets
from IPython.display import display, HTML

# Feature extraction
def calculate_nesting(node):
    """Return the depth of block nesting rooted at ``node``.

    A function, class, loop, ``if``, ``try`` or ``with`` statement counts as
    one level. The result is that level plus the deepest level found among
    the statements nested inside it. Any other node (a plain statement, an
    expression, a module, ...) yields 0.

    Args:
        node: An ``ast`` node, typically a statement.

    Returns:
        int: 0 when ``node`` does not open a counted block, otherwise 1 plus
        the maximum depth of the statements it contains.
    """
    block_types = (
        ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef,
        ast.For, ast.AsyncFor, ast.While, ast.If, ast.Try,
        ast.With, ast.AsyncWith,
    )
    if not isinstance(node, block_types):
        return 0

    # Every statement list hanging off this node sits one level deeper.
    children = list(node.body)
    children.extend(getattr(node, 'orelse', []))     # if/else, for/else, while/else, try/else
    children.extend(getattr(node, 'finalbody', []))  # try/finally
    # ``except`` clauses are ExceptHandler nodes rather than statements, so
    # they would be scored 0 if recursed into directly. Their bodies belong
    # to the same level as the ``try`` body, so splice them in here.
    for handler in getattr(node, 'handlers', []):
        children.extend(handler.body)

    return 1 + max((calculate_nesting(child) for child in children), default=0)

def code_feature(code):
    """Extract structural and stylistic features from Python source text.

    Args:
        code: Python source code as a string.

    Returns:
        dict | None: A mapping with the keys ``num_lines``,
        ``Count_FunctionDef``, ``Count_For_While_If``, ``Max_Nesting_Depth``,
        ``Count_TryExcept``, ``Avg_Line_Length``, ``AvgIdentifier_Length``,
        ``Num_Comments`` and ``Comment_Ratio``; or ``None`` when ``code``
        cannot be parsed as Python.
    """
    lines = code.splitlines()

    # Only parsing is expected to fail on user input; keeping the ``try``
    # this narrow stops genuine programming errors (e.g. a NameError) from
    # being reported as "invalid syntax".
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        return None

    nodes = list(ast.walk(tree))
    node_counts = Counter(type(n).__name__ for n in nodes)

    # Variable references are ast.Name (``.id``); function parameters are
    # ast.arg, whose name lives in ``.arg``.
    identifiers = []
    for n in nodes:
        if isinstance(n, ast.Name):
            identifiers.append(n.id)
        elif isinstance(n, ast.arg):
            identifiers.append(n.arg)

    # Blank lines are excluded so they do not drag the average down.
    stripped = (line.strip() for line in lines)
    line_lengths = [len(text) for text in stripped if text]

    # Heuristic counts: every '#' is treated as a comment and every pair of
    # triple quotes as one block comment/docstring.
    single_comments = code.count('#')
    multi_comments = code.count('"""') // 2 + code.count("'''") // 2
    num_comments = single_comments + multi_comments

    feature = {}
    feature['num_lines'] = len(lines)
    feature['Count_FunctionDef'] = node_counts.get('FunctionDef', 0)
    feature['Count_For_While_If'] = (
        node_counts.get('For', 0) + node_counts.get('While', 0) + node_counts.get('If', 0)
    )
    # Deepest block found under any top-level statement; 0 for an empty module.
    feature['Max_Nesting_Depth'] = max(
        (calculate_nesting(stmt) for stmt in tree.body), default=0
    )
    feature['Count_TryExcept'] = node_counts.get('Try', 0)
    # Guard the empty case: np.mean([]) would return NaN and emit a warning.
    feature['Avg_Line_Length'] = np.mean(line_lengths) if line_lengths else 0.0
    feature['AvgIdentifier_Length'] = (
        np.mean([len(name) for name in identifiers]) if identifiers else 0
    )
    feature['Num_Comments'] = num_comments
    feature['Comment_Ratio'] = num_comments / (feature['num_lines'] or 1)
    return feature

# Training data (mock samples) and model fitting

# Tiny hand-written corpus used only to get a fitted model for the demo.
# Every sample must be valid Python: code_feature() returns None for
# anything that does not parse, and such rows are excluded from training.
mock_human = [
    "def sum_list(a): return sum(a)",
    # Compound statements cannot follow ';' on one line, so the if/else
    # goes on its own lines to keep this sample parsable.
    "x=1; y=2\nif x>y: z=x\nelse: z=y",
    "for i in range(10): print(i)",
] * 10

mock_ai = [
    "def calculate_the_optimized_value(input_data):\n    # The function ensures robust processing.\n    try:\n        result = input_data ** 2\n        return result\n    except Exception as e: print(f'Error: {e}')",
    "class DataProcessor:\n    def __init__(self): self.buffer=1024",
] * 10

# Label 0 = human-written sample, 1 = AI-style sample.
df_mock = pd.DataFrame({
    'code': mock_human + mock_ai,
    'label': [0]*len(mock_human) + [1]*len(mock_ai)
})

# Extract the hand-crafted features, keeping only samples that parsed.
extracted_mock = df_mock['code'].map(code_feature)
parsed_mask = extracted_mock.notna()
feature_df_mock = pd.DataFrame(
    extracted_mock[parsed_mask].tolist(), index=df_mock.index[parsed_mask]
)
df_final_mock = pd.concat([df_mock[parsed_mask], feature_df_mock], axis=1).dropna()
if df_final_mock.empty:
    # Fail here with a clear message instead of an opaque vectorizer error.
    raise RuntimeError("No mock sample could be parsed; cannot train the demo model.")

# Bag-of-identifiers TF-IDF over the raw source text.
tfidf = TfidfVectorizer(max_features=200, token_pattern=r'[a-zA-Z_]\w*', stop_words='english')
tfidf_features_mock = tfidf.fit_transform(df_final_mock['code']).toarray()
tfidf_feature_names = [f'tfidf_{i}' for i in range(tfidf_features_mock.shape[1])]
tfidf_df_mock = pd.DataFrame(tfidf_features_mock, columns=tfidf_feature_names, index=df_final_mock.index)

# Final design matrix: hand-crafted features followed by the TF-IDF columns.
X_train_mock = pd.concat([df_final_mock.drop(['code','label'], axis=1).fillna(0), tfidf_df_mock], axis=1)
y_train_mock = df_final_mock['label']
# Remembered so prediction inputs can be aligned to the training layout.
feature_columns = X_train_mock.columns

model = LogisticRegression(solver='liblinear', random_state=42, C=1.0)
model.fit(X_train_mock, y_train_mock)

print("Safe Demo Model Ready with Feature Summary.")

# Prediction function (safe mode + feature summary)
def predict_code_authorship(new_code):
    """Classify ``new_code`` and render a verdict plus a feature summary.

    Everything is written into the module-level ``output`` widget, which is
    cleared first. The function relies on the module-level ``tfidf``,
    ``tfidf_feature_names``, ``feature_columns`` and ``model`` objects
    created during training.

    Args:
        new_code: Python source code to analyse, as a string.

    Returns:
        None. An error line is printed instead of a verdict when
        ``code_feature`` cannot extract features from ``new_code``.
    """
    output.clear_output()
    with output:
        print("--- Running Analysis ---")
        # Use the same extractor the model was trained with, so the feature
        # names line up with ``feature_columns``.
        feature = code_feature(new_code)
        if not feature:
            print(" ERROR: Invalid Python syntax.")
            return
        new_df = pd.DataFrame([feature]).fillna(0)
        new_tfidf = pd.DataFrame(tfidf.transform([new_code]).toarray(), columns=tfidf_feature_names)
        # Align to the training layout: same columns, same order, and 0 for
        # any column that is missing from this sample.
        X_new = pd.concat([new_df, new_tfidf], axis=1).reindex(columns=feature_columns, fill_value=0)
        prob_ai = model.predict_proba(X_new)[0][1]

        # --- Safe demo adjustment ---
        # Short snippets with at most one function are capped at 0.3 so the
        # demo never flags them as AI-written.
        if feature['num_lines'] < 15 and feature['Count_FunctionDef'] <= 1:
            prob_ai = min(prob_ai, 0.3)
        # ----------------------------

        # Verdict
        if prob_ai > 0.85:
            verdict = " AI-GENERATED CODE DETECTED "
            style = 'background-color:#ffcccc; color:black; font-size:1.2em; padding:10px; border:2px solid red;'
        elif prob_ai > 0.60:
            verdict = " HIGH LIKELIHOOD OF AI ASSISTANCE "
            style = 'background-color:#ffffcc; color:black; font-size:1.2em; padding:10px; border:2px solid orange;'
        else:
            verdict = " HUMAN-WRITTEN CODE (LOW AI LIKELIHOOD) "
            style = 'background-color:#ccffcc; color:black; font-size:1.2em; padding:10px; border:2px solid green;'

        print("\n" + "="*50)
        display(HTML(f'<div style="{style}">{verdict}</div>'))
        print(f"Confidence (AI): {prob_ai:.2f}")

        # --- Feature Summary Table ---
        # Keys must match the ones produced by code_feature().
        summary_df = pd.DataFrame({
            'Feature': ['Lines', 'Functions', 'Loops/If', 'Try/Except', 'Max Nesting', 'Comments', 'Comment Ratio', 'Avg Line Len', 'Avg Identifier Len'],
            'Value': [
                feature['num_lines'],
                feature['Count_FunctionDef'],
                feature['Count_For_While_If'],
                feature['Count_TryExcept'],
                feature['Max_Nesting_Depth'],
                feature['Num_Comments'],
                round(feature['Comment_Ratio'], 2),
                round(feature['Avg_Line_Length'], 2),
                round(feature['AvgIdentifier_Length'], 2)
            ]
        })
        display(HTML("<h4>Feature Summary:</h4>"))
        display(summary_df)
        print("="*50)

#Interactive widget
# Output area that predict_code_authorship() clears and writes into.
output = widgets.Output()

# Multi-line text box where the user pastes the code to analyse.
_input_layout = widgets.Layout(height='250px', width='90%')
code_input = widgets.Textarea(
    value='',
    placeholder='Paste your Python code here...',
    description='Code:',
    layout=_input_layout,
)

analyze_button = widgets.Button(description="ANALYZE AND GET VERDICT")


def analyze_click(b):
    """Button callback: analyse whatever is currently in the text box.

    Args:
        b: The clicked button instance supplied by ipywidgets (unused).
    """
    current_code = code_input.value
    predict_code_authorship(current_code)


analyze_button.on_click(analyze_click)

# Render the title first, then the form controls and the result area.
title_html = HTML("<h2>AI Code Detector (Safe Demo Mode + Feature Summary)</h2>")
display(title_html)
display(code_input, analyze_button, output)


Setting up the AI Code Detector (Safe Demo Mode with Feature Summary)...
Safe Demo Model Ready with Feature Summary.


Textarea(value='', description='Code:', layout=Layout(height='250px', width='90%'), placeholder='Paste your Py…

Button(description='ANALYZE AND GET VERDICT', style=ButtonStyle())

Output()