In [None]:
# Import required libraries
import os
import csv
from pydriller import RepositoryMining
from radon.raw import analyze
from radon.metrics import h_visit
from radon.metrics import h_visit_ast
from radon.complexity import sorted_results
from pydriller.git_repository import GitRepository

# Column order of python_dataset.csv: commit/file identity, PyDriller's per-file
# complexity and token count, radon raw line counts, then radon Halstead totals.
fields = ['CommitID','filename','complexity','token_count','loc','lloc','sloc','comments',
          'multi','blank','code_comment','h1','h2','N1','N2','vocabulary','length',
          'calculated_length','volume', 'difficulty','effort','time','bugs']

# TODO: set this to the repository to mine before running the notebook.
REPO_PATH = 'path/to/repo'
# NOTE(review): `repo_updated` was read by os.walk below but never assigned in
# this notebook; presumably it is the local checkout of the same repository --
# confirm, and point it elsewhere if the working tree lives in another folder.
repo_updated = REPO_PATH

# Index the working tree once (file name -> every path with exactly that name)
# instead of re-walking the whole tree for each modification of each commit.
# Exact-name matching also stops 'a.py' from matching 'data.py' or 'a.pyc',
# which the previous substring test (`filename in file`) allowed.
python_paths_by_name = {}
for root, _dirs, names in os.walk(repo_updated):
    for name in names:
        if name.endswith('.py'):
            python_paths_by_name.setdefault(name, []).append(os.path.join(root, name))

# newline='' is required by the csv module; without it, platforms that
# translate '\n' end up with an empty line after every row.
with open('python_dataset.csv', 'w', newline='') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=fields)
    writer.writeheader()
    miner = RepositoryMining(REPO_PATH, only_modifications_with_file_types=['.py'])
    for commit in miner.traverse_commits():
        hash_val = commit.hash
        for modification in commit.modifications:
            filename = modification.filename
            if not filename.endswith('.py'):
                continue
            token_count = modification.token_count
            complexity = modification.complexity
            # One row per on-disk file that carries the modified file's name.
            # NOTE(review): the radon metrics are computed from the file as it
            # currently exists on disk, so every commit touching a file gets the
            # same static metrics; `modification.source_code` looks like the
            # per-commit alternative -- confirm which one is intended.
            for file_path in python_paths_by_name.get(filename, ()):
                with open(file_path) as source_file:
                    content = source_file.read()
                file_analyze = analyze(content)    # raw line counts
                file_ast = h_visit(content)        # Halstead metrics
                halstead = file_ast.total
                writer.writerow({
                    'CommitID': hash_val,
                    'filename': file_path,
                    'complexity': complexity,
                    'token_count': token_count,
                    'loc': file_analyze.loc,
                    'lloc': file_analyze.lloc,
                    'sloc': file_analyze.sloc,
                    'comments': file_analyze.comments,
                    'multi': file_analyze.multi,
                    'blank': file_analyze.blank,
                    'code_comment': file_analyze.loc + file_analyze.comments,
                    'h1': halstead.h1,
                    'h2': halstead.h2,
                    'N1': halstead.N1,
                    'N2': halstead.N2,
                    'vocabulary': halstead.vocabulary,
                    'length': halstead.length,
                    'calculated_length': halstead.calculated_length,
                    'volume': halstead.volume,
                    'difficulty': halstead.difficulty,
                    'effort': halstead.effort,
                    'time': halstead.time,
                    'bugs': halstead.bugs,
                })
# The with-block has already closed the file, so no explicit close() is needed.
print("writing completed")

In [None]:
from pydriller.git_repository import GitRepository
from pydriller import RepositoryMining
# TODO: set this to the repository to analyse; it must be the same repository
# that was mined to produce python_dataset.csv so the commit hashes line up.
REPO_PATH = 'path/to/repo'
gr = GitRepository(REPO_PATH)

# Hashes of the commits that last touched the lines changed by a "fix" commit.
# A set is used so that a commit blamed by several fixes is recorded once.
buggy_hashes = set()
miner = RepositoryMining(REPO_PATH, only_modifications_with_file_types=['.py'])
for commit in miner.traverse_commits():
    # Case-insensitive so that messages such as "Fix ..." or "FIXED ..." count too.
    if "fix" not in commit.msg.lower():
        continue
    fix_commit = gr.get_commit(commit.hash)
    blamed = gr.get_commits_last_modified_lines(fix_commit)
    if isinstance(blamed, dict):
        # Dict form: file path -> set of commit hashes. Iterating the dict
        # directly would collect the file paths, which never match a CommitID.
        for hashes in blamed.values():
            buggy_hashes.update(hashes)
    else:
        # Otherwise the result is treated as a plain collection of hashes.
        buggy_hashes.update(blamed)

# Later cells expect a list named `buggy_list`; sorted for a reproducible order.
buggy_list = sorted(buggy_hashes)

In [None]:
# Load the mined dataset and display the rows whose commit is flagged as buggy
import pandas as pd
df = pd.read_csv("python_dataset.csv")
flagged_rows = df['CommitID'].isin(buggy_list)
df[flagged_rows]

In [None]:
# Create new dataframe restricted to the mined columns, in their written order
df = pd.DataFrame(df, columns = ['CommitID','filename','complexity','token_count','loc','lloc','sloc','comments',
          'multi','blank','code_comment','h1','h2','N1','N2','vocabulary','length',
          'calculated_length','volume', 'difficulty','effort','time','bugs'])

# Label datapoints: True when the row's commit hash is one of the flagged
# commits. Series.isin against a set replaces the per-row scan of the whole
# list, giving the same booleans without the rows x hashes cost.
df["defect"] = df["CommitID"].isin(set(buggy_list))

# Write the labelled data back over the dataset CSV.
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if no consumer of this file relies on that column.
df.to_csv('python_dataset.csv')

In [None]:
# Load only the numeric metric columns and the defect label from the dataset
filepath = 'python_dataset.csv'
column = ['complexity','token_count','loc','lloc','sloc','comments',
          'multi','blank','code_comment','h1','h2','N1','N2','vocabulary','length',
          'calculated_length','volume', 'difficulty','effort','time','bugs','defect']
df1 = pd.read_csv(filepath, usecols= column, index_col=False)

# Remove rows where LOC or complexity is 0 or missing (either one is enough).
# A boolean mask is used because `pd.np` no longer exists in pandas 2.x; the
# result matches the former replace(0, NaN) -> dropna(subset=col) -> fillna(0).
col = ['loc','complexity']
has_code = df1[col].notna().all(axis=1) & df1[col].ne(0).all(axis=1)
# Remaining gaps in other columns become 0 before the integer cast.
# NOTE(review): astype(int) truncates any fractional metric values and turns
# the boolean label into 0/1 -- confirm the truncation is intended.
df1 = df1.loc[has_code].fillna(0).astype(int)

#Write labelled data to CSV
df1.to_csv('python_01.csv')