In [18]:
import pandas as pd 
import numpy as np
import sqlite3
import re

In [3]:
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the "simiotic/github-code-snippets" dataset through kagglehub and keep
# the location it reports in `path`.
# NOTE(review): later cells open "snippets-dev.db" by a bare relative name
# rather than via `path` - confirm the database file is reachable from the
# notebook's working directory.
path = kagglehub.dataset_download("simiotic/github-code-snippets")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\sahil\.cache\kagglehub\datasets\simiotic\github-code-snippets\versions\1


In [11]:

# Open the snippets database; `conn` stays open because later cells query it.
conn = sqlite3.connect("snippets-dev.db")
cursor = conn.cursor()

# Show every table recorded in the SQLite schema catalogue.
print(
    "Tables:",
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall(),
)

# Describe the columns of the `snippets` table, one PRAGMA row per column.
columns = cursor.execute("PRAGMA table_info(snippets);").fetchall()

print("Columns in 'snippets':")
for col in columns:
    print(col)

Tables: [('snippets',), ('sqlite_sequence',)]
Columns in 'snippets':
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'code', 'TEXT', 1, None, 0)


In [15]:
# Load the first 1000 snippets into a DataFrame, exposing `code` as `snippet`.
# ORDER BY id pins which rows are selected: without it SQLite leaves the row
# order (and therefore the rows kept by LIMIT) unspecified, so a re-run could
# legitimately sample a different subset.
df = pd.read_sql_query(
    "SELECT id, code AS snippet FROM snippets ORDER BY id LIMIT 1000", conn
)


In [19]:
def extract_features(snippet):
    """Compute simple size and keyword statistics for one code snippet.

    Parameters
    ----------
    snippet : str
        Source text of the snippet.

    Returns
    -------
    dict
        num_lines
            Number of newline-separated lines after stripping surrounding
            whitespace; 0 for an empty or whitespace-only snippet.
        num_chars
            Length of the raw (unstripped) snippet.
        avg_line_length
            Mean length of the counted lines, or 0 when there are none.
        num_keywords
            Whole-word occurrences of def/class/fit/transform/import/return.
        num_functions
            Whole-word occurrences of ``def``.
        num_classes
            Whole-word occurrences of ``class``.
    """
    stripped = snippet.strip()
    # "".split('\n') yields [''], which would report one line for an empty
    # snippet and leave the zero-line guard below unreachable.
    lines = stripped.split('\n') if stripped else []
    num_lines = len(lines)
    num_chars = len(snippet)
    avg_line_length = sum(len(line) for line in lines) / num_lines if num_lines > 0 else 0
    num_keywords = len(re.findall(r"\b(def|class|fit|transform|import|return)\b", snippet))
    # Match whole words only: a plain substring count would also hit
    # identifiers such as "default", "undefined", "classify" or "subclass".
    num_functions = len(re.findall(r"\bdef\b", snippet))
    num_classes = len(re.findall(r"\bclass\b", snippet))

    return {
        "num_lines": num_lines,
        "num_chars": num_chars,
        "avg_line_length": avg_line_length,
        "num_keywords": num_keywords,
        "num_functions": num_functions,
        "num_classes": num_classes,
    }


In [20]:
# One feature record per snippet: the computed statistics first, then the
# originating row id as the final key so the column order stays the same.
features_list = [
    {**extract_features(code), "id": snippet_id}
    for snippet_id, code in zip(df["id"], df["snippet"])
]

features_df = pd.DataFrame(features_list)
print(features_df.head())


   num_lines  num_chars  avg_line_length  num_keywords  num_functions  \
0          1         22             22.0             0              0   
1          1         27             27.0             0              0   

   num_classes  id  
0            0   1  
1            0   2  


In [21]:
def suggest_ml_model(features):
    """Map a snippet's feature record to a suggested ML library label.

    `features` is any mapping-like object (dict or DataFrame row) exposing
    "num_keywords", "num_functions", "num_classes" and "num_lines". Rules are
    tried in order and the first match wins; when none match, the
    "scikit-learn (simple model)" fallback is returned.
    """
    # Ordered (predicate, label) pairs; predicates are evaluated lazily so
    # only the fields needed to reach a decision are read.
    rules = (
        (lambda f: f["num_keywords"] >= 2 and f["num_functions"] >= 1, "scikit-learn"),
        (lambda f: f["num_classes"] >= 1, "TensorFlow"),
        (lambda f: f["num_lines"] > 20, "XGBoost"),
    )
    for matches, label in rules:
        if matches(features):
            return label
    return "scikit-learn (simple model)"
# Label every feature row with a suggestion. axis=1 hands suggest_ml_model one
# row at a time, and the result is stored in place as a new "suggested_model"
# column on features_df.
features_df["suggested_model"] = features_df.apply(suggest_ml_model, axis=1)
print(features_df.head())


   num_lines  num_chars  avg_line_length  num_keywords  num_functions  \
0          1         22             22.0             0              0   
1          1         27             27.0             0              0   

   num_classes  id              suggested_model  
0            0   1  scikit-learn (simple model)  
1            0   2  scikit-learn (simple model)  
