In [76]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
pathn="Homework"

def parse_xml_file(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    problems = root.findall("problem")

    problem_data = {}
    for problem in problems:
        problem_id = problem.find("problem_class").get("id")
        problem_data[problem_id] = problem_data.get(problem_id, 0) + 1

    return problem_data


def process_assignment(assignment_path):
    assignment_name = os.path.basename(assignment_path)
    student_folders = [f for f in os.listdir(assignment_path) if f.lower().startswith("student")]
    problem_names = set()
    student_data = {}
    for student_folder in student_folders:
        student_id = int(student_folder.split(" ")[1])
        student_files = os.listdir(os.path.join(assignment_path, student_folder))
        problem_counts = {}
        for file in student_files:
            if file.endswith(".xml"):
                xml_path = os.path.join(assignment_path, student_folder, file)
                problem_data = parse_xml_file(xml_path)
                for problem_name, count in problem_data.items():
                    problem_counts[problem_name] = problem_counts.get(problem_name, 0) + count
                    problem_names.add(problem_name)
        student_data[student_id] = problem_counts
    return assignment_name, student_data, problem_names


def create_table(assignment_paths):
    problem_names = set()
    table_data = {}
    for assignment_path in assignment_paths:
        assignment_name, student_data, assignment_problem_names = process_assignment(assignment_path)
        problem_names.update(assignment_problem_names)
        for student_id, problem_counts in student_data.items():
            if student_id not in table_data:
                table_data[student_id] = {}
            for problem_name, count in problem_counts.items():
                table_data[student_id][problem_name] = table_data[student_id].get(problem_name, 0) + count

    table_df = pd.DataFrame.from_dict(table_data, orient="index")
    table_df.index.name = "student"
    table_df = table_df.fillna(0)
    table_df = table_df.astype(int)
    table_df = table_df.reindex(sorted(table_df.columns), axis=1)

    # Add a new column that sums up all the previous columns
    table_df["Total"] = table_df.sum(axis=1)

    return table_df



if __name__ == "__main__":
    assignment_paths = [f for f in os.listdir(".") if f.startswith(pathn)]
    table = create_table(assignment_paths)
    print(table)

            AccessStaticViaInstance  AnonymousClassComplexity  \
student                                                         
2        2                        0                         0   
4        1                        0                         2   
6        1                        0                         0   
8        1                        0                         0   
9        1                        0                         0   
10       1                        0                         1   
11       2                        0                         1   
12       1                        0                         1   
13       3                        0                         0   
14       4                        0                         2   
18       2                        0                         3   
19       1                        0                         1   
20       1                        0                         0   
22       1               

In [77]:
# Compute the sum of each column
sums = table.sum()

max_columns = sums.nlargest(10).index.tolist()

# Print the results
print('Sum of each column:')
print(sums)
print('Top 10 columns with the highest sum:', max_columns)

Sum of each column:
                                   33
AccessStaticViaInstance             3
AnonymousClassComplexity           15
AnonymousClassMethodCount           5
BooleanMethodIsAlwaysInverted       2
                                 ... 
UnusedReturnValue                   6
UtilityClass                       20
WrongPackageStatement             187
unused                            542
Total                            3650
Length: 80, dtype: int64
Top 10 columns with the highest sum: ['Total', 'MissingJavadoc', 'unused', 'SingleClassImport', 'CanBeFinal', 'WrongPackageStatement', 'MultipleReturnPointsPerMethod', 'MethodWithMultipleLoops', 'FieldMayBeFinal', 'ChainedMethodCall']


In [78]:
codeSmells=table[max_columns]
codeSmells

Unnamed: 0_level_0,Total,MissingJavadoc,unused,SingleClassImport,CanBeFinal,WrongPackageStatement,MultipleReturnPointsPerMethod,MethodWithMultipleLoops,FieldMayBeFinal,ChainedMethodCall
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,270,102,71,14,16,11,7,9,10,0
4,199,34,37,40,14,10,5,7,12,4
6,165,51,15,12,12,8,8,9,10,3
8,81,32,15,2,3,4,5,3,0,3
9,124,38,37,4,8,7,5,5,2,2
10,139,23,24,36,10,8,5,3,4,3
11,143,66,17,4,4,8,9,3,0,4
12,149,18,8,7,20,9,8,4,1,4
13,145,33,2,7,12,12,9,7,6,2
14,239,47,28,18,31,11,10,6,0,0


In [80]:
codeSmells.to_csv('{}.csv'.format(pathn))

In [75]:
for i in range(20):
    if i<10:
        pathn="Assignment-0{}".format(i)
    else:
        pathn="Assignment-{}".format(i)
    if __name__ == "__main__":
        assignment_paths = [f for f in os.listdir(".") if f.startswith(pathn)]
        table = create_table(assignment_paths)
    
    sums = table.sum()

    max_columns = sums.nlargest(10).index.tolist()
    codeSmells=table[max_columns]
    print("saving....")
    print(pathn)
    codeSmells.to_csv('{}.csv'.format(pathn))

saving....
Assignment-00
saving....
Assignment-01
saving....
Assignment-02
saving....
Assignment-03
saving....
Assignment-04
saving....
Assignment-05
saving....
Assignment-06
saving....
Assignment-07
saving....
Assignment-08
saving....
Assignment-09
saving....
Assignment-10
saving....
Assignment-11
saving....
Assignment-12
saving....
Assignment-13
saving....
Assignment-14
saving....
Assignment-15
saving....
Assignment-16
saving....
Assignment-17
saving....
Assignment-18
saving....
Assignment-19


In [89]:
import pandas as pd
import os

# Read the codesmells.csv file
codesmells_df = pd.read_csv('codesmells.csv')

# Iterate over the assignment files from 03 to 19
for assignment_num in range(3, 20):
    if assignment_num == 6:
        continue
    assignment_file = f"Assignment-{assignment_num:02}.csv"
    assignment_path = os.path.join('csvs', assignment_file)
    
    # Read the Assignment-XX.csv file
    assignment_df = pd.read_csv(assignment_path)
    
    # Extract the "student" and "Total" columns from Assignment-XX.csv
    assignment_data = assignment_df[['student', 'Total']]
    
    assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)

    
    # Rename the "Total" column to "Assignment-XX_cs"
    assignment_data = assignment_data.rename(columns={'Total': f'Assignment-{assignment_num:02}_CS'})
    
    # Merge the assignment_data with codesmells_df using the common "student" column
    codesmells_df = pd.merge(codesmells_df, assignment_data, left_on='Name', right_on='student', how='left')
    
    # Drop the redundant "student" column
    codesmells_df = codesmells_df.drop('student', axis=1)

# Save the updated codesmells.csv file
codesmells_df.to_csv('codesmells_updated.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + a

In [97]:
import pandas as pd
import os

# Read the codesmells.csv file
codesmells_df = pd.read_csv('codesmells.csv')

# Iterate over the assignment files from 03 to 19
for assignment_num in range(3, 20):
    if assignment_num == 6:
        continue
    assignment_file = f"Assignment-{assignment_num:02}.csv"
    assignment_path = os.path.join('csvs', assignment_file)
    
    # Read the Assignment-XX.csv file
    assignment_df = pd.read_csv(assignment_path)
    
    # Extract the "student" and "Total" columns from Assignment-XX.csv
    assignment_data = assignment_df[['student', 'Total']]
    
    assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)

    print('changed')
    # Rename the "Total" column to "Assignment-XX_cs"
    assignment_data = assignment_data.rename(columns={'Total': f'Assignment-{assignment_num:02}_CS'})
    
    # Merge the assignment_data with codesmells_df using the common columns
    codesmells_df = pd.merge(codesmells_df, assignment_data, left_on='Name', right_on='student', how='left')
    print('merging')
    # Drop the redundant "student" column
    codesmells_df = codesmells_df.drop('student', axis=1)

# Save the updated codesmells.csv file
codesmells_df.to_csv('codesmells_updated.csv', index=False)



changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging
changed
merging


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + a

In [98]:
import pandas as pd
import os

# Read the codesmells.csv file
codesmells_df = pd.read_csv('codesmells.csv')

# Iterate over the assignment files from 03 to 19
for assignment_num in range(3, 20):
    if assignment_num == 6:
        continue
    assignment_file = f"Assignment-{assignment_num:02}.csv"
    assignment_path = os.path.join('csvs', assignment_file)
    
    # Read the Assignment-XX.csv file
    assignment_df = pd.read_csv(assignment_path)
    
    # Extract the "student" and "Total" columns from Assignment-XX.csv
    assignment_data = assignment_df[['student', 'Total']]
    
    assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)

    
    # Rename the "Total" column to "Assignment-XX_CS"
    assignment_data = assignment_data.rename(columns={'Total': f'Assignment-{assignment_num:02}_CS'})
    
    # Merge the assignment_data with codesmells_df using the common "Name" and "student" columns
    codesmells_df = pd.merge(codesmells_df, assignment_data, left_on='Name', right_on='student', how='left', suffixes=('_codesmells', f'_{assignment_num:02}_assignment'))
    
    # Drop the redundant "student" column
    codesmells_df = codesmells_df.drop('student', axis=1)

# Save the updated codesmells.csv file
codesmells_df.to_csv('codesmells_updated2.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + assignment_data['student'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assignment_data['student'] = 'Student-' + a

In [105]:
import pandas as pd
import os

# Read the codesmells.csv file
codesmells_df = pd.read_csv('codesmells.csv')

# Read the Homework-4.csv file
homework_df = pd.read_csv('csvs/Homework-4.csv')

# Extract the "student" and "Total" columns from Homework-4.csv
homework_data = homework_df[['student', 'Total']]

homework_data['student'] = 'Student-' + homework_data['student'].astype(str)

# Rename the "Total" column to "Homework-4_CS"
homework_data = homework_data.rename(columns={'Total': 'Homework-4_CS'})

# Merge the homework_data with codesmells_df using the common "Name" and "student" columns
codesmells_df = pd.merge(codesmells_df, homework_data, left_on='Name', right_on='student', how='left')

# Drop the redundant "student" column
codesmells_df = codesmells_df.drop('student', axis=1)

# Save the updated codesmells.csv file
codesmells_df.to_csv('codesmells_updated3.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homework_data['student'] = 'Student-' + homework_data['student'].astype(str)


In [None]:
i have some data