In [20]:
import os
import requests
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Read the full sample sheet and keep only the rows whose 'smell' is "data class".
csv_file_path = '/Users/mac/Desktop/Code_Smell_Detection/dataset/MLCQCodeSmellSamples.csv'
df = pd.read_csv(csv_file_path)
filtered_df = df.loc[df['smell'].eq('data class')]

# Write the subset next to the other data_class artifacts (index column omitted).
filtered_df.to_csv(
    '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/filtered_dataset_data_class.csv',
    index=False,
)

num_rows = len(filtered_df)
print(f"Number of rows in the filtered dataset: {num_rows}")
print("Filtered CSV has been saved as 'filtered_dataset_data_class.csv'")


Number of rows in the filtered dataset: 4078
Filtered CSV has been saved as 'filtered_dataset_data_class.csv'


In [None]:
def convert_to_raw_url(blob_url):
    """Translate a GitHub "blob" page URL into its raw-content URL.

    The host ``github.com`` is swapped for ``raw.githubusercontent.com`` and
    the ``/blob/`` marker is removed. Any ``#...`` fragment (for example a
    ``#L10-L20`` line anchor) is discarded, along with trailing slashes.

    Args:
        blob_url: URL string that contains both ``github.com`` and ``/blob/``.

    Returns:
        The corresponding raw URL as a string.

    Raises:
        ValueError: If ``blob_url`` is not a string or lacks either marker.
    """
    if not isinstance(blob_url, str):
        # Missing cells read by pandas arrive as float NaN, not as text.
        raise ValueError(f"Invalid GitHub blob URL: {blob_url}")

    # Drop the fragment first so ".../File.java/#L1-L9" and
    # ".../File.java#L1-L9" are treated the same way.
    without_fragment = blob_url.split('#', 1)[0]

    if 'github.com' not in without_fragment or '/blob/' not in without_fragment:
        raise ValueError(f"Invalid GitHub blob URL: {blob_url}")

    # Replace only the first occurrence of each marker: a later "/blob/" is a
    # real directory inside the repository and must stay in the path.
    raw_url = without_fragment.replace('github.com', 'raw.githubusercontent.com', 1)
    raw_url = raw_url.replace('/blob/', '/', 1)
    return raw_url.rstrip('/')



In [None]:
def download_and_extract_lines(raw_url, start_line, end_line, save_path):
    """Download a text file and save an inclusive range of its lines.

    Args:
        raw_url: URL fetched with an HTTP GET (10 second timeout).
        start_line: 1-based number of the first line to keep.
        end_line: 1-based number of the last line to keep (inclusive).
        save_path: Destination file, written as UTF-8. Missing parent
            directories are created.

    Returns:
        True when the requested lines were written to ``save_path``.
        False on any failure: request error, a response whose Content-Type
        is not ``text/plain``, a line range outside the file, or any other
        exception. Failures are printed, never raised.
    """
    try:
        # Line numbers read from a DataFrame may be numpy or float values;
        # slicing needs plain ints. A NaN raises ValueError, handled below.
        start_line, end_line = int(start_line), int(end_line)

        print(f"Downloading from: {raw_url}")
        print(f"Extracting lines {start_line} to {end_line}")
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(raw_url, headers=headers, allow_redirects=True, timeout=10)
        response.raise_for_status()

        # Anything other than plain text (e.g. an HTML error page) is rejected.
        if 'text/plain' not in response.headers.get('Content-Type', ''):
            print(f"Unexpected content type: {response.headers.get('Content-Type')}")
            print(f"Response preview: {response.text[:500]}")
            return False

        file_content = response.text.splitlines()
        if start_line < 1 or end_line > len(file_content) or start_line > end_line:
            raise ValueError(
                f"Invalid start_line ({start_line}) or end_line ({end_line}) for file: {raw_url}"
            )
        extracted_lines = file_content[start_line - 1:end_line]

        # A bare file name has no directory part, and os.makedirs('') fails.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(save_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(extracted_lines))
        print(f"Downloaded and extracted lines {start_line}-{end_line}: {raw_url} -> {save_path}")
        return True

    except requests.exceptions.RequestException as e:
        # Report the URL that was actually requested (this function's own argument).
        print(f"HTTP error while accessing {raw_url}: {e}")
    except ValueError as ve:
        print(f"Value error: {ve}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return False

In [None]:
# Download every "data class" sample and keep only its start_line..end_line range.
dataset_path = "/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/dataset"
csv_file_path = '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/filtered_dataset_data_class.csv'  
df = pd.read_csv(csv_file_path)
filtered_df = df[df['smell'] == 'data class']
os.makedirs(dataset_path, exist_ok=True)

ids = []      # ids of the samples whose snippet was saved during this run
classes = []  # snippet text, aligned index-for-index with `ids`

for _, row in filtered_df.iterrows():
    try:
        file_url = row['link']
        raw_url = convert_to_raw_url(file_url)
        file_id = row['id']
        start_line = row['start_line']
        end_line = row['end_line']
        print(file_url)
        print(raw_url)

        # Each snippet is stored directly in dataset_path as "<id>.java".
        file_name = f"{file_id}.java"
        file_path = os.path.join(dataset_path, file_name)

        # Trust the helper's return value rather than os.path.exists(): a file
        # left over from an earlier run must not count as a fresh success.
        downloaded = download_and_extract_lines(raw_url, start_line, end_line, file_path)

        if downloaded:
            ids.append(file_id)
            # The helper writes UTF-8, so read the snippet back as UTF-8 too.
            with open(file_path, "r", encoding="utf-8") as file:
                classes.append(file.read())
        else:
            print(f"No snippet saved for ID {file_id} at {file_path}")

    except Exception as e:
        # One bad row (invalid URL, unreadable file, ...) must not stop the batch.
        print(f"Error processing row with ID {row['id']}: {e}")

print("Files added to the 'metrics' folder.")
print(f"Total files processed: {len(ids)}")


In [22]:
# Load the data_class label sheet and discard its 'method' column.
df_data_class = pd.read_csv(
    '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/data_class.csv'
).drop(columns=['method'])
df_data_class.head()

Unnamed: 0,sample_id,severity
0,8077,critical
1,5553,critical
2,9341,critical
3,10419,critical
4,12232,critical


In [23]:
def save_model(file_name, model):
    """Serialize ``model`` with pickle and write it to ``file_name``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_name, mode='wb') as sink:
        pickle.dump(model, sink)

In [24]:
# Read the class-level metrics CSV stored in the ck_results folder.
df_metrics = pd.read_csv(
    '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/ck_results/metricsclass.csv'
)
df_metrics.head()

Unnamed: 0,file,class,type,cbo,cboModified,fanin,fanout,wmc,dit,noc,...,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty
0,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayReceiverParser,class,5,5,0,5,2,2,0,...,1,0,1,0,0,0,0,22,0,0
1,/Users/mac/Desktop/Code_Smell_Detection/datase...,TestRun$Anonymous1,anonymous,2,2,0,2,6,1,0,...,1,1,1,3,0,0,0,22,-1,0
2,/Users/mac/Desktop/Code_Smell_Detection/datase...,SyntaxTreeBuilderVisitor,interface,143,143,0,143,143,1,0,...,0,0,0,0,0,0,0,158,1,0
3,/Users/mac/Desktop/Code_Smell_Detection/datase...,CouchbaseClusterUtil,class,1,1,0,1,2,1,0,...,1,0,1,0,0,0,0,7,17,0
4,/Users/mac/Desktop/Code_Smell_Detection/datase...,AdminUserCredentials,interface,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,4,1,0


In [25]:
# Show the column labels of both frames before they are joined.
for _label, _frame in (
    ("Columns:", df_data_class),
    ("Columns in df_metrics:", df_metrics),
):
    print(_label, _frame.columns)

Columns: Index(['sample_id', 'severity'], dtype='object')
Columns in df_metrics: Index(['file', 'class', 'type', 'cbo', 'cboModified', 'fanin', 'fanout', 'wmc',
       'dit', 'noc', 'rfc', 'lcom', 'lcom*', 'tcc', 'lcc', 'totalMethodsQty',
       'staticMethodsQty', 'publicMethodsQty', 'privateMethodsQty',
       'protectedMethodsQty', 'defaultMethodsQty', 'visibleMethodsQty',
       'abstractMethodsQty', 'finalMethodsQty', 'synchronizedMethodsQty',
       'totalFieldsQty', 'staticFieldsQty', 'publicFieldsQty',
       'privateFieldsQty', 'protectedFieldsQty', 'defaultFieldsQty',
       'finalFieldsQty', 'synchronizedFieldsQty', 'nosi', 'loc', 'returnQty',
       'loopQty', 'comparisonsQty', 'tryCatchQty', 'parenthesizedExpsQty',
       'stringLiteralsQty', 'numbersQty', 'assignmentsQty',
       'mathOperationsQty', 'variablesQty', 'maxNestedBlocksQty',
       'anonymousClassesQty', 'innerClassesQty', 'lambdasQty',
       'uniqueWordsQty', 'modifiers', 'logStatementsQty'],
      dtype='o

In [26]:
# The digits before ".java" at the end of each path become the sample id (as text).
df_metrics['sample_id'] = (
    df_metrics['file']
    .str.extract(r'/(\d+)\.java$', expand=False)
    .astype(str)
)

# Join keys must share a dtype, so the label ids are converted to strings as well.
df_data_class['sample_id'] = df_data_class['sample_id'].astype(str)

# Inner join: only samples present in both frames are kept.
df_merged = df_data_class.merge(df_metrics, on='sample_id', how='inner')

print(df_merged)


     sample_id  severity                                               file  \
0         5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
1         9341  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
2        12232  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3        10234  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
4        10234  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
...        ...       ...                                                ...   
3312     14935     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3313     13914     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3314     11483     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3315     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3316     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   

                                         class     

In [28]:
# Save df_merged as a pickle file (not CSV).
merged_class_pkl = '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/merged/merged_metrics_class.pkl'
# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(merged_class_pkl), exist_ok=True)
df_merged.to_pickle(merged_class_pkl)
# Report the name of the file that was actually written.
print(f"Merged dataset saved as '{os.path.basename(merged_class_pkl)}'")

Merged dataset saved as 'merged_dataset.pkl'


In [29]:
# Read the method-level metrics CSV stored in the ck_results folder.
df_metrics_method = pd.read_csv(
    '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/ck_results/metricsmethod.csv'
)
df_metrics_method.head()

Unnamed: 0,file,class,method,constructor,line,cbo,cboModified,fanin,fanout,wmc,...,assignmentsQty,mathOperationsQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,hasJavaDoc
0,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayReceiverParser,getBeanClass/1[Element],False,7,2,0,0,0,1,...,0,0,0,0,0,0,5,4,0,True
1,/Users/mac/Desktop/Code_Smell_Detection/datase...,GatewayReceiverParser,"doParse/3[Element,ParserContext,BeanDefinition...",False,15,3,0,0,0,1,...,1,0,0,0,0,0,10,4,0,True
2,/Users/mac/Desktop/Code_Smell_Detection/datase...,TestRun$Anonymous1,getFailureMessage/0,False,61,1,0,0,0,1,...,1,1,0,0,0,0,10,1,0,False
3,/Users/mac/Desktop/Code_Smell_Detection/datase...,TestRun$Anonymous1,test/0,False,44,0,0,0,0,4,...,0,0,3,0,0,0,14,1,0,False
4,/Users/mac/Desktop/Code_Smell_Detection/datase...,TestRun$Anonymous1,init/1[SWTBot],False,56,1,0,0,0,1,...,0,0,0,0,0,0,6,1,0,False


In [30]:

# Extract 'sample_id' (the digits before ".java" at the end of the path) from 'file'
df_metrics_method['sample_id'] = df_metrics_method['file'].str.extract(r'/(\d+)\.java$')

# Convert both sample_id columns to the same type (string in this case).
# The label frame is converted here too, so this cell does not depend on an
# earlier cell having already done it; repeating the conversion is harmless.
df_metrics_method['sample_id'] = df_metrics_method['sample_id'].astype(str)
df_data_class['sample_id'] = df_data_class['sample_id'].astype(str)

# Merge the DataFrames (inner join: only ids present in both are kept)
df_merged = pd.merge(df_data_class, df_metrics_method, on='sample_id', how='inner')

# Display the merged DataFrame
print(df_merged)

      sample_id  severity                                               file  \
0          5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
1          5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
2          5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
3          5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
4          5553  critical  /Users/mac/Desktop/Code_Smell_Detection/datase...   
...         ...       ...                                                ...   
19442     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
19443     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
19444     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
19445     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   
19446     15154     minor  /Users/mac/Desktop/Code_Smell_Detection/datase...   

                                       

In [31]:
# Save df_merged as a pickle file (not CSV).
merged_method_pkl = '/Users/mac/Desktop/Code_Smell_Detection/dataset/data_class/metrics/merged/merged_metrics_method.pkl'
# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(merged_method_pkl), exist_ok=True)
df_merged.to_pickle(merged_method_pkl)
# Report the name of the file that was actually written.
print(f"Merged dataset saved as '{os.path.basename(merged_method_pkl)}'")

Merged dataset saved as 'merged_metrics_method.csv'
