In [14]:
from datasets import load_dataset
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib


In [3]:
dataset = load_dataset("code_search_net", "python",trust_remote_code=True)

In [4]:
ml_snippets =[]
for example in dataset["train"]:
    snippet = example["func_code_string"]
    if any(keyword in snippet for keyword in [
        "fit","LinearRegression","xgboost","tensorflow","torch","keras","predict"

    ]):
        ml_snippets.append(snippet)
    if len(ml_snippets) >= 2000:
        break
print("ML-related snippets collected:",len(ml_snippets))

print(ml_snippets[0])

ML-related snippets collected: 2000
def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    """
    Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree
    :param verbose: verbosity of training
    :return: returns knn classifier that was t

In [22]:
def extract_features(snippets):
    lines = snippets.strip().split('\n')
    num_lines = len(lines)
    num_chars = len(snippets)
    avg_line_length = sum(len(line)for line in lines ) / num_lines if num_lines > 0 else 0

    num_keywords = len(re.findall(r"\b(def|class|fit|predict|compile|train|model|transform)\b",snippet))
    num_functions =snippet.count("def")
    num_classes = snippet.count("class")

    uses_linear_regression = "LinearRegression" in snippet
    uses_xgboost = "xgboost" in snippet
    uses_tensorflow = "tensorflow" in snippet
    uses_keras = "keras" in snippet
    uses_torch = "torch." in snippet or "torch.nn" in snippet or "import torch" in snippet
    uses_sklearn = "sklearn" in snippet
    uses_pandas = "pandas" in snippet or "pd.DataFrame" in snippet
    return {
        "num_lines": num_lines,
        "num_chars": num_chars,
        "avg_line_length": avg_line_length,
        "num_keywords": num_keywords,
        "num_functions": num_functions,
        "num_classes": num_classes,
        "uses_linear_regression": uses_linear_regression,
        "uses_xgboost": uses_xgboost,
        "uses_tensorflow": uses_tensorflow,
        "uses_keras": uses_keras,
        "uses_torch": uses_torch,
        "uses_sklearn": uses_sklearn,
        "uses_pandas": uses_pandas
    }

In [23]:
features_list = []

for snippet in ml_snippets:
    features = extract_features(snippet)
    features_list.append(features)

features_df = pd.DataFrame(features_list)
features_df.head()

Unnamed: 0,num_lines,num_chars,avg_line_length,num_keywords,num_functions,num_classes,uses_linear_regression,uses_xgboost,uses_tensorflow,uses_keras,uses_torch,uses_sklearn,uses_pandas
0,63,2670,41.396825,5,2,10,False,False,False,False,False,False,False
1,40,2209,54.25,5,1,8,False,False,False,False,False,False,False
2,29,1132,38.068966,2,2,0,False,False,False,False,False,False,False
3,11,462,41.090909,2,1,1,True,False,False,False,False,False,False
4,12,499,40.666667,4,1,1,True,False,False,False,False,False,False


In [24]:
def suggest_ml_model(row):
    if row["uses_linear_regression"]:
        return "Linear Regression (scikit-learn)"
    elif row["uses_xgboost"]:
        return "XGBoost"
    elif row["uses_tensorflow"]:
        return "TensorFlow"
    elif row["uses_keras"]:
        return "Keras"
    elif row["uses_torch"]:
        return "PyTorch"
    elif row["uses_sklearn"]:
        return "Scikit-learn (general)"
    else:
        return "Other / Unknown"

# Apply to entire DataFrame
features_df["suggested_model"] = features_df.apply(suggest_ml_model, axis=1)

# Show value counts to check label distribution
features_df["suggested_model"].value_counts()


suggested_model
Other / Unknown                     1253
PyTorch                              385
TensorFlow                           170
Keras                                 89
Scikit-learn (general)                56
XGBoost                               28
Linear Regression (scikit-learn)      19
Name: count, dtype: int64

In [29]:
x = features_df.drop(columns=["suggested_model"])
y = features_df["suggested_model"]

le = LabelEncoder()
y_encoded = le.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42,stratify=y_encoded)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))

joblib.dump(clf, "ml_model_suggestion_rf.pkl")
joblib.dump(le, "label_encoder.pkl")

                                  precision    recall  f1-score   support

                           Keras       1.00      1.00      1.00        18
Linear Regression (scikit-learn)       1.00      0.75      0.86         4
                 Other / Unknown       1.00      1.00      1.00       251
                         PyTorch       1.00      1.00      1.00        77
          Scikit-learn (general)       0.92      1.00      0.96        11
                      TensorFlow       1.00      1.00      1.00        34
                         XGBoost       1.00      1.00      1.00         5

                        accuracy                           1.00       400
                       macro avg       0.99      0.96      0.97       400
                    weighted avg       1.00      1.00      1.00       400



['label_encoder.pkl']

In [30]:
print("torch" in pytorch_code)
print("uses_torch:", extract_features(pytorch_code)["uses_torch"])


True
uses_torch: False


In [32]:
# PyTorch code example
pytorch_code = '''
import torch
import torch.nn as nn
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(10, 1)

    def forward(self, x):
        return self.fc(x)

model = Net()
optimizer = optim.Adam(model.parameters(), lr=0.001)
'''

# Extract features
pytorch_features = pd.DataFrame([extract_features(pytorch_code)])

# Predict
pred = clf.predict(pytorch_features)
pred_label = le.inverse_transform(pred)[0]

print("🔮 Suggested ML Model / Library:", pred_label)


🔮 Suggested ML Model / Library: Other / Unknown
