In [11]:
!pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd
import openpyxl
import sys
sys.path.append("..")
from utils.github_utils import GithubUtils
from utils.java_code_cleaner import JavaCodeCleaner
from utils.code_smells import CodeSmells
from utils.dataset_utils import *

In [2]:
dataset_file_path = "./input/mlcq_dataset.csv"

#! Open & Sort MLCQ_Dataset
# NOTE: on_bad_lines="skip" silently drops rows that fail to parse.
originalDf = pd.read_csv(
    dataset_file_path, on_bad_lines="skip"
).sort_values(by=["sample_id", "reviewer_id", "smell"])

#! Digitalize Severity Value
severity_digital_values = {
    "none": 0,
    "minor": 1,
    "major": 2,
    "critical": 3
}

# Series.map gives a numeric column directly, so the replace()+astype(int)
# round-trip (and the pandas warning it emitted) is no longer needed.
# Labels missing from the mapping become NaN and are reported explicitly
# instead of surfacing as an opaque conversion error.
severity_codes = originalDf["severity"].map(severity_digital_values)
unknown_severities = originalDf.loc[severity_codes.isna(), "severity"].unique()
if len(unknown_severities) > 0:
    raise ValueError(
        f"Unexpected severity labels in {dataset_file_path}: {list(unknown_severities)}"
    )
originalDf["severity"] = severity_codes.astype(int)

#! Filter Class Instances
classOnlyDf = originalDf[originalDf["type"] == "class"]

#! Group By CodeName & Smell
# One output row per (code_name, smell): metadata is taken from the first review,
# severities are summed and the non-null reviewer ids are counted.
# NOTE(review): the key does not include the repository, so classes that share a
# code_name across repositories are merged into one row - confirm this is intended.
groupedBySmellDf = classOnlyDf.groupby(["code_name", "smell"], as_index=False).agg(
    repository=("repository", "first"),
    commit_hash=("commit_hash", "first"),
    file_path=("path", "first"),
    github_url=("link", "first"),
    severity_sum=("severity", "sum"),
    reviewers_count=("reviewer_id", "count")
)

# Mean severity per review (vectorised instead of a row-wise apply)
groupedBySmellDf["smelliness_degree"] = (
    groupedBySmellDf["severity_sum"] / groupedBySmellDf["reviewers_count"]
)

# Remove Extra #L**-L** from GithubURL
# Cut at the fragment marker and drop the trailing slash, so a link without the
# "/#L.." suffix keeps its file name instead of losing its last path segment.
groupedBySmellDf["github_url"] = (
    groupedBySmellDf["github_url"]
    .str.split("#", n=1).str[0]
    .str.rstrip("/")
)

preprocessedDf = groupedBySmellDf

display(preprocessedDf)

  originalDf["severity"] = originalDf["severity"].replace(


Unnamed: 0,code_name,smell,repository,commit_hash,file_path,github_url,severity_sum,reviewers_count,smelliness_degree
0,Foo.Mumble,blob,git@github.com:eclipse/org.aspectj.git,370f291c359cd159c5f3f0abd6e9e53e81234a07,/asm/testdata/simple-coverage/Foo.java,https://github.com/eclipse/org.aspectj/blob/37...,0,1,0.000000
1,Foo.Mumble,data class,git@github.com:eclipse/org.aspectj.git,370f291c359cd159c5f3f0abd6e9e53e81234a07,/asm/testdata/simple-coverage/Foo.java,https://github.com/eclipse/org.aspectj/blob/37...,0,1,0.000000
2,GetObject,blob,git@github.com:awsdocs/amazon-s3-developer-gui...,038ef11c43698df29de7bb128522dc19a1b645ae,/code_examples/java_examples/S3Examples/GetObj...,https://github.com/awsdocs/amazon-s3-developer...,0,1,0.000000
3,GetObject,data class,git@github.com:awsdocs/amazon-s3-developer-gui...,038ef11c43698df29de7bb128522dc19a1b645ae,/code_examples/java_examples/S3Examples/GetObj...,https://github.com/awsdocs/amazon-s3-developer...,0,1,0.000000
4,ICEDocCleaner,blob,git@github.com:eclipse/ice.git,3f6e0265f5b476ff90a660397ce83992944142c4,/utils/ICEDocCleaner/src/ICEDocCleaner.java,https://github.com/eclipse/ice/blob/3f6e0265f5...,2,3,0.666667
...,...,...,...,...,...,...,...,...,...
4393,tv.sage.SageRuntimeException,data class,git@github.com:google/sagetv.git,a35e3a450b4c0134cb097b9e7de76dca08eb6654,/java/tv/sage/SageRuntimeException.java,https://github.com/google/sagetv/blob/a35e3a45...,4,3,1.333333
4394,userguide.clients.EchoNonBlockingClient,blob,git@github.com:apache/axis2-java.git,372582df483eb7991f85b6d0e765aec62339cdb7,/modules/samples/userguide/src/userguide/clien...,https://github.com/apache/axis2-java/blob/3725...,0,1,0.000000
4395,userguide.clients.EchoNonBlockingClient,data class,git@github.com:apache/axis2-java.git,372582df483eb7991f85b6d0e765aec62339cdb7,/modules/samples/userguide/src/userguide/clien...,https://github.com/apache/axis2-java/blob/3725...,0,1,0.000000
4396,websocket.drawboard.Client,blob,git@github.com:apache/tomcat.git,a9c1a0661198d9ba37c1facd8385fe05d538c4ad,/webapps/examples/WEB-INF/classes/websocket/dr...,https://github.com/apache/tomcat/blob/a9c1a066...,0,1,0.000000


In [3]:
#! Cols names:
id_col = "id"
github_url_col = "github_url"
github_raw_url_col = "github_raw_url"
is_blob_col = CodeSmells.isBlob
is_data_class_col = CodeSmells.isDataClass

#! An instance is labelled smelly when its smelliness degree exceeds this value
smelliness_threshold = 0.5

#! Value of the "smell" column -> label column of the intermediate dataset
smell_label_cols = {
    "blob": is_blob_col,
    "data class": is_data_class_col,
}

# Binary label for every (code_name, smell) row
labelledDf = preprocessedDf.assign(
    is_smelly=(preprocessedDf["smelliness_degree"] > smelliness_threshold).astype(int)
)

# One row per code_name and one column per smell, matched by name.
# Pairing rows by index parity breaks as soon as a class has only one of the two
# smells reviewed: every following label would be attributed to the wrong class.
# A smell that was never reviewed for a class is labelled 0.
smell_labels_df = (
    labelledDf
    .pivot(index="code_name", columns="smell", values="is_smelly")
    .reindex(columns=list(smell_label_cols))
    .fillna(0)
    .astype(int)
    .rename(columns=smell_label_cols)
)

# URL of the first row of each class, aligned with the label rows
instance_urls = (
    labelledDf.groupby("code_name")["github_url"].first()
    .reindex(smell_labels_df.index)
)

intermediate_dataset = {
    # A sequential Id
    id_col: range(1, len(smell_labels_df) + 1),
    # Github normal URL of the instance
    github_url_col: instance_urls.to_numpy(),
    # Github Raw URL of the instance
    github_raw_url_col: [
        GithubUtils.convert_blob_to_raw_url(url) for url in instance_urls
    ],
    # 1 If Instance has a Blob Smell, Otherwise 0
    is_blob_col: smell_labels_df[is_blob_col].to_numpy(),
    # 1 If Instance has a DataClass Smell, Otherwise 0
    is_data_class_col: smell_labels_df[is_data_class_col].to_numpy(),
}

intermediate_dataset_df = pd.DataFrame(intermediate_dataset)

# Exactly one output row per distinct class
assert len(intermediate_dataset_df) == preprocessedDf["code_name"].nunique()


#! Save Intermediate Dataset to a CSV file
intermediate_dataset_csv_path = "./output/mlcq_intermediate_dataset.csv"
intermediate_dataset_df.to_csv(intermediate_dataset_csv_path, index=False)


#! Save Intermediate Dataset to a XLSX file
intermediate_dataset_xlsx_path = "./output/mlcq_intermediate_dataset.xlsx"
intermediate_dataset_df.to_excel(intermediate_dataset_xlsx_path, index=False, engine='openpyxl')

In [None]:
intermediate_dataset_path = "./output/mlcq_intermediate_dataset.csv"

#! Open Intermediate Dataset
intermediateDf = pd.read_csv(
    intermediate_dataset_path, on_bad_lines="skip")

display(intermediateDf)

#! Download, clean & label the source code of every instance
mld_items = []
failed_raw_urls = []  # raw URLs for which clone_code returned nothing
for _, row in intermediateDf.iterrows():
    source_code = GithubUtils.clone_code(row["github_raw_url"])
    if not source_code:
        # Keep track of skipped instances instead of dropping them silently
        failed_raw_urls.append(row["github_raw_url"])
        continue

    processed_source_code = JavaCodeCleaner.clean(source_code)

    # Read the label columns through the same constants used to write them
    code_smells = []
    if row[CodeSmells.isBlob] == 1:
        code_smells.append(CodeSmells.Blob)
    if row[CodeSmells.isDataClass] == 1:
        code_smells.append(CodeSmells.DataClass)

    item = JsonDatasetUtils.create_item(
        code_smells, processed_source_code
    )
    mld_items.append(item)

print(f"Collected {len(mld_items)} items; {len(failed_raw_urls)} downloads failed.")

Unnamed: 0,id,github_url,github_raw_url,isBlob,isDataClass
0,1,https://github.com/eclipse/org.aspectj/blob/37...,https://raw.githubusercontent.com/eclipse/org....,0,0
1,2,https://github.com/awsdocs/amazon-s3-developer...,https://raw.githubusercontent.com/awsdocs/amaz...,0,0
2,3,https://github.com/eclipse/ice/blob/3f6e0265f5...,https://raw.githubusercontent.com/eclipse/ice/...,1,0
3,4,https://github.com/oracle/oci-java-sdk/blob/76...,https://raw.githubusercontent.com/oracle/oci-j...,0,0
4,5,https://github.com/alibaba/java-dns-cache-mani...,https://raw.githubusercontent.com/alibaba/java...,1,0
...,...,...,...,...,...
2194,2195,https://github.com/google/j2objc/blob/471504a7...,https://raw.githubusercontent.com/google/j2obj...,0,0
2195,2196,https://github.com/eclipse/xtext-xtend/blob/20...,https://raw.githubusercontent.com/eclipse/xtex...,0,0
2196,2197,https://github.com/google/sagetv/blob/a35e3a45...,https://raw.githubusercontent.com/google/saget...,0,1
2197,2198,https://github.com/apache/axis2-java/blob/3725...,https://raw.githubusercontent.com/apache/axis2...,0,0


200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Code Clone
200: SUCCESS Cod

In [32]:
import json

mld_json_file_path = "./output/mlcq_mld_sc.json"

#! Save the collected items to a JSON file
# Serialise before opening the file: opening in 'w' mode truncates it, so a
# serialisation error would otherwise leave an empty or partial JSON file behind.
mld_json_payload = json.dumps(mld_items)

with open(mld_json_file_path, 'w', encoding='utf-8') as mld_json:
    mld_json.write(mld_json_payload)
    mld_json.write('\n')