In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import io
import os
import math
import statistics
import tokenize
import keyword
from tempfile import NamedTemporaryFile
import argparse


In [None]:
# Path to the raw CSV of Python code snippets; it is read into `df` by the next cell
file_path = "./data/test_dataset_python.csv"

In [None]:
# Load the raw dataset and preview its first rows
df = pd.read_csv(file_path)
df.head()

In [None]:
def extract_features(code_snippet):
    """
    Extract lexical features from a given code snippet.

    The snippet is written to a temporary ``.py`` file because every
    extractor is called with a file path. The temporary file is always
    removed before returning, even when writing it or running one of the
    extractors raises.

    NOTE(review): the extractor functions (``numTokens``, ``lineLength``,
    ``stdevLineLength``, ...) are not defined in the visible cells of this
    notebook -- presumably they come from LexicalParser.py; confirm they
    are in scope before calling this function.

    Parameters
    ----------
    code_snippet : str
        Source code to analyse.

    Returns
    -------
    dict
        Maps each feature name to the value returned by its extractor.
    """
    # delete=False so the file survives closing and can be re-opened by path
    tmp = NamedTemporaryFile(delete=False, mode='w', suffix='.py')
    try:
        with tmp:
            tmp.write(code_snippet)
        tmp_path = tmp.name

        # Ensure the number of characters is calculated based on the code snippet directly
        # (newlines are not counted)
        number_of_characters = len(code_snippet.replace('\n', ''))

        # Call feature extraction functions directly
        features = {}
        features['log_of_numTokens'] = numTokens(tmp_path, number_of_characters)
        features['avg_line_length'] = lineLength(tmp_path)
        features['stdDev_line_length'] = stdevLineLength(tmp_path)
        features['log_of_numComments'] = numComments(tmp_path, number_of_characters)
        features['log_of_numFunctions'] = numFunctions(tmp_path, number_of_characters)
        features['dict_log_of_numKeywords'] = numKeywords(tmp_path, number_of_characters)
        features['dict_TF_wordUnigrams'] = tfwordUnigram(tmp_path)
        features['average_params'] = avgParams(tmp_path)
        features['stdDev_numParams'] = stdevNumParams(tmp_path)
        features['log_of_python_keywords'] = pythonkeywords(tmp_path, number_of_characters)
        features['log_of_numLiterals'] = numLiterals(tmp_path, number_of_characters)

        return features
    finally:
        # Clean up the temporary file on success and on failure alike
        os.remove(tmp.name)


In [None]:
# Import necessary libraries
import pandas as pd
import subprocess
import tempfile
import os

# Assuming df is your original DataFrame

# Function to run lexical parser and get the output vector
def run_lexical_parser(file_path):
    """
    Run ``./LexicalParser.py`` on a source file and return its feature vector.

    The parser's standard output is expected to be a bracketed,
    comma-separated list of numbers such as ``[0.1, 2.0, -3]``.

    Parameters
    ----------
    file_path : str
        Path of the source file handed to the parser as its only argument.

    Returns
    -------
    list of float
        The parsed numbers, in the order they were printed. An empty
        bracket pair (``[]``) yields an empty list.

    Raises
    ------
    RuntimeError
        If the parser process exits with a non-zero status; the message
        carries the captured standard error.
    ValueError
        If the output is not enclosed in square brackets or contains an
        entry that is not a number.
    """
    # Construct the command to run the lexical parser, assuming the script is named LexicalParser.py
    # Update the path to LexicalParser.py as needed
    command = ['python3', './LexicalParser.py', file_path]

    # Run the command and capture the output
    result = subprocess.run(command, capture_output=True, text=True)

    # Surface parser failures with their stderr instead of failing later on empty stdout
    if result.returncode != 0:
        raise RuntimeError(
            f"LexicalParser.py failed for {file_path!r} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    raw_output = result.stdout.strip()

    # Only strip the first/last character when they really are the list brackets
    if not (raw_output.startswith('[') and raw_output.endswith(']')):
        raise ValueError(
            f"Unexpected LexicalParser.py output for {file_path!r}: {raw_output!r}"
        )

    inner = raw_output[1:-1].strip()
    if not inner:
        return []

    # Convert each comma-separated entry to float, trimming surrounding whitespace
    return [float(num.strip()) for num in inner.split(',')]


In [None]:
# Names given to positions 0..12 of the parser's output vector
# (update these names based on your actual features)
FEATURE_NAMES = [
    'log_of_numTokens',
    'avg_line_length',
    'stdDev_line_length',
    'log_of_numFunctions',
    'average_params',
    'stdDev_numParams',
    'log_of_python_keywords',
    'log_of_numLiterals',
    'elif',
    'if',
    'else',
    'for',
    'while',
]

# Initialize a list to hold new rows for the enhanced dataset
new_rows = []

# Iterate over the dataset
for index, row in df.iterrows():
    code = row['func_code_string']  # Assuming this is the column with code snippets
    repo_name = row['repository_name']

    # Save code to a temporary file; delete=False so the parser subprocess can open it by path
    tmp = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.py')
    tmp_path = tmp.name
    try:
        with tmp:
            tmp.write(code)

        # Run lexical parser and get features
        features = run_lexical_parser(tmp_path)
    finally:
        # Clean up the temporary file even when writing or parsing fails
        os.remove(tmp_path)

    # Fail with a clear message rather than silently producing a short row
    if len(features) < len(FEATURE_NAMES):
        raise ValueError(
            f"LexicalParser returned {len(features)} features for row {index}; "
            f"expected at least {len(FEATURE_NAMES)}"
        )

    # Map the features to their respective names (extra trailing values are ignored)
    features_dict = dict(zip(FEATURE_NAMES, features))

    # Prepare a new row for the enhanced dataset
    new_row = {
        'repo_name': repo_name,
        'code': code,
        **features_dict
    }

    # Append the new row to the list
    new_rows.append(new_row)

# Create a new DataFrame with the enhanced dataset
enhanced_df = pd.DataFrame(new_rows)

# Display the first few rows of the enhanced DataFrame
print(enhanced_df.head())

# Optionally, save the new DataFrame to a CSV file
enhanced_df.to_csv('./data/enhanced_dataset.csv', index=False)


# Train model - Random Forest

In [4]:
import pandas as pd
import numpy as np

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.metrics import classification_report, accuracy_score

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the enhanced dataset written by the feature-extraction step.
# Note: this rebinds `df`, replacing the raw dataset loaded at the top of the notebook.
df = pd.read_csv('./data/enhanced_dataset.csv')
df.head()

In [None]:
# Summarise column dtypes and non-null counts before modelling
df.info()

In [None]:
# Everything except the label and the raw source text is used as a model feature
X = df.drop(columns=['repo_name', 'code'])
y = df['repo_name']

# Map each distinct repository label to an integer code
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# The LSTM is trained against one-hot targets derived from the integer codes
y_categorical = to_categorical(y_encoded)

In [None]:
# One seeded 80/20 split shared by both models: the same row partition is applied
# to every array passed in, so the Random Forest (integer labels) and the LSTM
# (one-hot labels) see identical train/test rows.
(
    X_train, X_test,            # Random Forest features
    y_train, y_test,            # Random Forest integer labels
    X_train_lstm, X_test_lstm,  # LSTM features
    y_train_lstm, y_test_lstm,  # LSTM one-hot labels
) = train_test_split(X, y_encoded, X, y_categorical, test_size=0.2, random_state=42)

In [None]:
# Fit a seeded 100-tree random forest on the training split (fit returns the estimator itself)
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred_rf = rf_classifier.predict(X_test)

# Per-class precision / recall / F1 on the held-out split
print("Random Forest Classifier Report")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


### Confidence Representation

In [None]:
# Requires the trained `rf_classifier` and the held-out `X_test` from the cells above

# Class-membership probabilities, one row per test sample
probabilities = rf_classifier.predict_proba(X_test)

# Confidence of a prediction = probability of its most likely class
max_probabilities = np.max(probabilities, axis=1)

# Express the confidence as a percentage
confidence_percentages = 100 * max_probabilities

# Pair every test sample's original row index with its prediction confidence
confidence_columns = {
    'Test Sample Index': X_test.index,
    'Prediction Confidence (%)': confidence_percentages,
}
test_samples_with_confidence = pd.DataFrame(confidence_columns)

test_samples_with_confidence

## Validating the Random Forest model

### Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy of the forest over the full feature matrix
cv = 5  # Number of folds
scores = cross_val_score(rf_classifier, X, y_encoded, cv=cv, scoring='accuracy', n_jobs=-1)

# Report each fold, then the average
print(f'Accuracy scores for {cv}-fold cross-validation:')
for fold_number, fold_accuracy in enumerate(scores, start=1):
    print(f"Fold {fold_number}: {fold_accuracy:.4f}")
print(f"Mean accuracy: {scores.mean():.4f}")

### Precision-Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Recreate the seeded 80/20 hold-out split and refit the forest on its training part
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)
rf_classifier.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the positive-class score
# NOTE(review): this and precision_recall_curve assume a binary target -- confirm
class_probabilities = rf_classifier.predict_proba(X_test)
y_scores = class_probabilities[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Draw precision against recall
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', label='Random Forest')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()

### ROC Curve and AUC

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve, from the scores computed in the precision-recall cell
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the chance diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

### Feature Importance

In [None]:
# One importance value per feature column, most important first
feature_importances = (
    pd.DataFrame({'importance': rf_classifier.feature_importances_}, index=X.columns)
    .sort_values('importance', ascending=False)
)

print(feature_importances)

### Stratified K-Fold Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

# Configure the cross-validation procedure.
# An integer `cv` passed to cross_val_score with a classifier already uses an
# unshuffled StratifiedKFold, so an unshuffled splitter here would just repeat the
# earlier 5-fold run. Shuffling (with a fixed seed for reproducibility) makes this
# an independent check that does not depend on the row order of the dataset.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(rf_classifier, X, y_encoded, scoring='accuracy', cv=cv, n_jobs=-1)

print(f'Accuracy scores for stratified {cv.get_n_splits()}-fold cross-validation:')
print(scores)
print(f"Mean accuracy: {scores.mean():.4f}")

## Save the model

In [None]:
from joblib import dump

# Save the model to disk at the path the demonstration section loads it from
# (./models/), creating the directory when it does not exist yet
model_filename = './models/test_random_forest_model.joblib'
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
dump(rf_classifier, model_filename)

## Demonstration

In [1]:
from joblib import load

# Load the model from disk
# NOTE(review): joblib.load unpickles the file, so only load model files you trust
rf_classifier_loaded = load("./models/test_random_forest_model.joblib")

In [2]:
def predict_with_confidence(model, features):
    """
    Print the class predicted for one sample and the model's confidence in it.

    Confidence is the largest class probability returned by
    ``model.predict_proba``, expressed as a percentage.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    features : sequence of numbers
        Feature values of a single sample, in the column order the model
        was trained on.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    features = np.array(features).reshape(1, -1)

    # A model fitted on a DataFrame records its column names in `feature_names_in_`;
    # scikit-learn warns when such a model is later given a bare array. Re-attach
    # the names when they are available and match the sample's width.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == features.shape[1]:
        features = pd.DataFrame(features, columns=list(feature_names))

    # Obtain prediction probabilities and the predicted class
    probabilities = model.predict_proba(features)
    predicted_class = model.predict(features)[0]  # model.predict returns an array, get the first item

    max_probability = np.max(probabilities)
    confidence_percentage = max_probability * 100

    # Print the predicted class along with the confidence of the prediction
    print(f"Predicted class: {predicted_class} with confidence: {confidence_percentage:.2f}%")


In [7]:
# Example usage: two hand-entered feature vectors, scored one after the other
example_features_0 = [-1.611416152446210, 38.92307692307690, 18.517836169765000, -6.226536669287470, 2.0, 0.0, -5.53338948872752, -4.1470951276076300, 0.0, 0.0, 0.0, 0.0, 0.0]
example_features_1 = [-2.0555360208262800, 49.6, 27.762905960443400, -6.899723107284870, 2.0, 0.0, -5.5134287461649800, -5.801110818616760, 0.0, -6.899723107284870, -6.899723107284870, 0.0, 0.0]
for example_features in (example_features_0, example_features_1):
    predict_with_confidence(rf_classifier_loaded, example_features)

Predicted class: 0 with confidence: 100.00%
Predicted class: 1 with confidence: 97.00%




In [8]:
""" Code 0 - ID 1
def show_clock_output_clock_time_timezone(self, **kwargs):
        \"""Auto Generated Code
        \"""
        config = ET.Element("config")
        show_clock = ET.Element("show_clock")
        config = show_clock
        output = ET.SubElement(show_clock, "output")
        clock_time = ET.SubElement(output, "clock-time")
        timezone = ET.SubElement(clock_time, "timezone")
        timezone.text = kwargs.pop('timezone')

        callback = kwargs.pop('callback', self._callback)
        return callback(config)
"""


""" Code 1 - ID 2536
def _get_asset_id_with_enclosure(self, enclosure_id):
        \"""Create an Asset with an enclosed foreign object.

        This is here to support AssetCompositionSession.set_asset. May need
        to add this in other objects to support other osid.Containable objects.
        return: (osid.id.Id) - the id of the new Asset

        \"""
        mgr = self._get_provider_manager('REPOSITORY')
        query_session = mgr.get_asset_query_session_for_repository(self._catalog_id, proxy=self._proxy)
        query_form = query_session.get_asset_query()
        query_form.match_enclosed_object_id(enclosure_id)
        query_result = query_session.get_assets_by_query(query_form)
        if query_result.available() > 0:
            asset_id = query_result.next().get_id()
        else:
            create_form = self.get_asset_form_for_create([ENCLOSURE_RECORD_TYPE])
            create_form.set_enclosed_object(enclosure_id)
            asset_id = self.create_asset(create_form).get_id()
        return asset_id
"""

' Code 1 - ID 2536\ndef _get_asset_id_with_enclosure(self, enclosure_id):\n        """Create an Asset with an enclosed foreign object.\n\n        This is here to support AssetCompositionSession.set_asset. May need\n        to add this in other objects to support other osid.Containable objects.\n        return: (osid.id.Id) - the id of the new Asset\n\n        """\n        mgr = self._get_provider_manager(\'REPOSITORY\')\n        query_session = mgr.get_asset_query_session_for_repository(self._catalog_id, proxy=self._proxy)\n        query_form = query_session.get_asset_query()\n        query_form.match_enclosed_object_id(enclosure_id)\n        query_result = query_session.get_assets_by_query(query_form)\n        if query_result.available() > 0:\n            asset_id = query_result.next().get_id()\n        else:\n            create_form = self.get_asset_form_for_create([ENCLOSURE_RECORD_TYPE])\n            create_form.set_enclosed_object(enclosure_id)\n            asset_id = self.create_

In [9]:
# Testing changing the code style
"""
prompt: change this code somehow, keep the exact functionality, but not recognizable as it is on a code stylometry analysis. change its style:
"""

"""  
def config_and_invoke_callback(self, **params):
    \""" Changed code - Repo 0 - ID 1
    Refactored Configuration Setup
    \"""
    root_config = ET.Element("configuration")
    clock_display = ET.Element("clockDisplay")
    root_config.append(clock_display)
    clock_output = ET.SubElement(clock_display, "outputSection")
    time_display = ET.SubElement(clock_output, "timeDisplay")
    local_timezone = ET.SubElement(time_display, "localTimezone")
    local_timezone.text = params.get('timezone')

    # Remove 'timezone' to avoid passing it further accidentally
    params.pop('timezone', None)
    # Get the callback function, default to internal if not provided
    invoke_callback = params.get('callback', self.default_callback)
    # Remove 'callback' to clean up params
    params.pop('callback', None)

    return invoke_callback(root_config)
"""
# Feature vector for the restyled snippet quoted in the string above
# (presumably LexicalParser.py output, in the model's training column order -- TODO confirm)
changed_code_0 = [-1.8310298730568912, 40.25, 22.313260159541198, -6.690842277418564, 2.0, 0.0, -5.592229988750454, -4.388257184424518, 0.0, 0.0, 0.0, 0.0, 0.0]
predict_with_confidence(rf_classifier_loaded, changed_code_0)

Predicted class: 0 with confidence: 95.00%




In [10]:
"""
def fetch_asset_identifier_by_enclosure(self, enclosure_identifier):
    \""" Changed code - Repo 1 - ID 2536
    Generates an asset ID for a given enclosure ID.

    Useful in scenarios where asset compositions need to be managed or adjusted.
    Ensures compatibility with various osid.Containable objects as required.
    
    :return: The unique identifier of the newly associated or created asset.
    \"""
    repository_manager = self.obtain_repository_manager('REPOSITORY')
    asset_query_session = repository_manager.obtain_asset_query_session_for_repository(self.catalogue_id, proxy=self.proxy_setting)
    asset_search_criteria = asset_query_session.construct_asset_query()
    asset_search_criteria.constrain_by_enclosure(enclosure_identifier)
    found_assets = asset_query_session.query_assets_using_criteria(asset_search_criteria)
    
    if found_assets.has_next():
        new_asset_id = found_assets.next_item().identify()
    else:
        asset_creation_form = self.initiate_asset_creation([ENCLOSURE_RECORD_TYPE])
        asset_creation_form.define_enclosure(enclosure_identifier)
        new_asset_id = self.commit_new_asset(asset_creation_form).identify()
    return new_asset_id
"""

# Feature vector for the renamed-identifier variant quoted in the string above
# (presumably LexicalParser.py output, in the model's training column order -- TODO confirm)
changed_code_1_1 = [-2.2378349092458842, 53.68181818181818, 34.58651865578947, -7.074116816197362, 2.0, 0.0, -5.687822455077471, -6.380969635637417, 0.0, -7.074116816197362, -7.074116816197362, 0.0, 0.0]
predict_with_confidence(rf_classifier_loaded, changed_code_1_1)

Predicted class: 1 with confidence: 97.00%




In [11]:
""" 
def retrieve_asset_id(self, enclosure_key):
    \""" Changed code - Repo 1 - ID 2536
    Retrieves or creates an asset linked to a specific enclosure, ensuring
    compatibility and manageability within asset compositions.

    :param enclosure_key: Unique identifier for the enclosure.
    :return: Identifier of the associated or newly created asset.
    \"""
    asset_id = self._search_existing_asset(enclosure_key)
    if not asset_id:
        asset_id = self._create_and_fetch_asset_id(enclosure_key)
    return asset_id

def _search_existing_asset(self, enclosure_key):
    repository_service = self._fetch_repository_service('REPOSITORY')
    asset_finder = repository_service.start_asset_search(self.catalog_id, self.proxy)
    asset_finder.set_enclosure_criteria(enclosure_key)
    found_assets = asset_finder.execute_search()
    
    return found_assets.first().id if found_assets.count() > 0 else None

def _create_and_fetch_asset_id(self, enclosure_key):
    asset_builder = self._initiate_asset_creation_process([ENCLOSURE_RECORD_TYPE])
    asset_builder.assign_enclosure(enclosure_key)
    return self._finalize_asset_creation(asset_builder).id

def _fetch_repository_service(self, service_name):
    return self._access_provider_manager(service_name)

def _initiate_asset_creation_process(self, record_types):
    creation_interface = self.prepare_asset_creation_interface(record_types)
    return creation_interface

def _finalize_asset_creation(self, builder):
    return builder.complete_creation()
"""

# Feature vector for the variant split into several helper functions, quoted in the string above
# (presumably LexicalParser.py output, in the model's training column order -- TODO confirm)
changed_code_1_2 = [-1.9040404277304552, 41.42857142857143, 27.68763337695742, -5.487559366186565, 2.0, 0.0, -5.487559366186565, -6.18070654674651, 0.0, -6.586171654854675, -7.27931883541462, 0.0, 0.0]
predict_with_confidence(rf_classifier_loaded, changed_code_1_2)

Predicted class: 1 with confidence: 86.00%




In [12]:
""" Changed code - Repo 1 - ID 2536
class AssetManager:
    def __init__(self, repository_identifier, proxy):
        self.repository_identifier = repository_identifier
        self.proxy = proxy

    def _obtain_repository_manager(self, service_type='REPOSITORY'):
        # Simulates fetching a service manager for repositories; abstracted for clarity.
        return MockRepositoryManager(service_type)

    def _query_for_asset(self, enclosure_id):
        with self._repository_session_context(self.repository_identifier, self.proxy) as session:
            asset_query = session.formulate_asset_query()
            asset_query.filter_by_enclosure(enclosure_id)
            return session.query_assets(asset_query)

    def _repository_session_context(self, repository_id, proxy):
        # Context manager to encapsulate session management for querying or creating assets.
        manager = self._obtain_repository_manager()
        return manager.session_for_repository(repository_id, proxy)

    def _create_asset_if_missing(self, enclosure_id, assets):
        if assets:
            return assets[0].id  # Assuming assets is a list-like object with asset objects.
        else:
            with self._repository_session_context(self.repository_identifier, self.proxy) as session:
                asset_creation_form = session.asset_creation_template([ENCLOSURE_RECORD_TYPE])
                asset_creation_form.enclosure = enclosure_id
                new_asset = session.create_asset(asset_creation_form)
                return new_asset.id

    def get_or_create_asset_id_by_enclosure(self, enclosure_id):
        existing_assets = self._query_for_asset(enclosure_id)
        return self._create_asset_if_missing(enclosure_id, existing_assets)

# Mock classes to simulate behavior of repository manager and session, for illustrative purposes.
class MockRepositoryManager:
    def __init__(self, service_type):
        self.service_type = service_type

    def session_for_repository(self, repository_id, proxy):
        return MockSession(repository_id, proxy)

class MockSession:
    def __init__(self, repository_id, proxy):
        self.repository_id = repository_id
        self.proxy = proxy

    def __enter__(self):
        # Initialize session resources
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Clean up session resources
        pass

    def formulate_asset_query(self):
        # Returns a mock query object
        return MockQuery()

    def query_assets(self, query):
        # Simulate asset querying
        return []

    def asset_creation_template(self, record_types):
        # Returns a form for asset creation
        return MockCreationForm()

    def create_asset(self, creation_form):
        # Simulate asset creation
        return MockAsset()

class MockQuery:
    def filter_by_enclosure(self, enclosure_id):
        pass

class MockCreationForm:
    def __init__(self):
        self.enclosure = None

class MockAsset:
    @property
    def id(self):
        return 'mock_asset_id'


"""

# Feature vector for the class-based variant quoted in the string above
# (presumably LexicalParser.py output, in the model's training column order -- TODO confirm)
changed_code_1_3 = [-1.7534533703673156, 35.05952380952381, 28.02251727574284, -5.0974923381895225, 2.1666666666666665, 0.8574929257125442, -5.790639518749468, -6.889251807417577, 0.0, -7.987864096085687, -7.987864096085687, 0.0, 0.0]
predict_with_confidence(rf_classifier_loaded, changed_code_1_3)

Predicted class: 1 with confidence: 59.00%




# Train model - LSTM

In [None]:
import numpy as np

# No scaling is applied here; X is assumed to be normalized already if that is needed.
# The LSTM takes 3D input [samples, timesteps, features], so insert a single-timestep
# axis between the sample axis and the feature axis.
X_train_lstm_reshaped = np.expand_dims(X_train_lstm.values, axis=1)
X_test_lstm_reshaped = np.expand_dims(X_test_lstm.values, axis=1)

In [None]:
# One 50-unit LSTM layer feeding a softmax with one output per one-hot class
model = Sequential([
    LSTM(50, input_shape=X_train_lstm_reshaped.shape[1:]),
    Dense(y_categorical.shape[1], activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 10 epochs; the held-out test split doubles as the per-epoch validation data
model.fit(
    X_train_lstm_reshaped,
    y_train_lstm,
    epochs=10,
    batch_size=32,
    verbose=2,
    validation_data=(X_test_lstm_reshaped, y_test_lstm),
)

# Evaluation: collapse softmax outputs and one-hot labels back to class indices
y_pred_lstm = model.predict(X_test_lstm_reshaped)
y_pred_lstm_classes = y_pred_lstm.argmax(axis=1)
y_test_classes = y_test_lstm.argmax(axis=1)

print("\nLSTM Classifier Report")
print(classification_report(y_test_classes, y_pred_lstm_classes))