# RQ4: How frequently do developers use the conversion functions?


## Data Preparation

In [None]:
from pathlib import Path

import pandas as pd
# Read the CSV directly from the compressed archive. Doing this in Python
# (instead of `!unzip` followed by `!rm`) works without shell tools and never
# leaves an extracted copy behind if a later step in this cell fails.
import zipfile

data_dir = Path('../../data')
with zipfile.ZipFile(data_dir / 'sfconvertbot_pr_metadata.csv.zip') as archive:
    with archive.open('sfconvertbot_pr_metadata.csv') as csv_file:
        df = pd.read_csv(csv_file)
print("Number of PRs: ", len(df))

# remove rows with missing values
df = df.dropna()
print("Number of PRs after removing missing values: ", len(df))

# remove rows with invalid JSONs (a usable payload must start with '{')
df = df[df['discussion_metadata'].str.startswith('{')]
print("Number of PRs after removing invalid JSONs: ", len(df))

# Work on an explicit copy: the frame above is a filtered slice, and adding a
# column to such a slice can trigger pandas' SettingWithCopyWarning.
df = df.copy()
# add a date column: keep only the part of `time` before the 'T' or ' '
# separator and parse it as a datetime
df['date'] = pd.to_datetime(df['time'].str.split('T').str[0].str.split(' ').str[0])
# keep only PRs dated on or before Oct 31 2024 (inclusive); copy again so that
# later cells can add columns to `df` without touching a slice
df = df[df['date'] <= pd.Timestamp(2024, 10, 31)].copy()
print("Number of PRs after filtering by date: ", len(df))

## Identify Relevant Discussions

Criteria:
1. PR has at least two non-empty comments.
2. PR has at least two different authors (not counting the SFConvertBot) engaging in the discussion.
3. The average comment size is more than 4 words.

In [76]:
from nb_utils import RESULTS_DIR
from tqdm import tqdm
import json

# Thresholds for the inclusion criteria.
min_num_authors = 2
min_num_comments = 2
# NOTE(review): the criteria listed for this section require an average comment
# size of more than 4 words, but this threshold is 0, so the third criterion
# never excludes a PR. Left at 0 to preserve the reported results; confirm
# which value is intended.
min_avg_comment_size = 0

output_file = RESULTS_DIR / 'sfconvertbot_pr_metadata_filtered.csv'
# columns (and their order) of the exported CSV
output_columns = ['source', 'title', 'url', 'num_comments', 'num_authors', 'avg_comment_size', 'authors']


def is_sfconvertbot(name):
    """Return True when `name` equals 'sfconvertbot', ignoring case."""
    return name.lower() == 'sfconvertbot'


# Collect one record per qualifying PR and build the dataframe once at the end;
# appending rows to a dataframe one by one is slow and, when nothing qualifies,
# leaves the frame without the columns selected for export.
records = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # parse as JSON (it should be a string)
    discussion = json.loads(row['discussion_metadata'])['discussion']
    original_author = discussion['author']

    # distinct participants, not counting the SFConvertBot
    authors = set()
    if not is_sfconvertbot(original_author["name"]):
        authors.add(original_author["name"])

    # NOTE(review): every non-hidden comment is counted; the comment text is
    # not checked for emptiness, although the criteria speak of non-empty comments.
    num_words, num_non_empty_comments = 0, 0
    for event in discussion['events']:
        if event['type'] != 'comment' or event['data']['hidden']:
            continue
        num_non_empty_comments += 1
        event_author_name = event['author']["name"]
        if not is_sfconvertbot(event_author_name):
            authors.add(event_author_name)
        num_words += len(event['data']['latest']['raw'].split())

    # guard the division so a comment threshold of 0 cannot raise ZeroDivisionError
    avg_comment_size = num_words / num_non_empty_comments if num_non_empty_comments else 0.0

    should_include = (num_non_empty_comments >= min_num_comments and
                      len(authors) >= min_num_authors and
                      avg_comment_size >= min_avg_comment_size)
    if not should_include:
        continue

    records.append({
        'source': 'SFConvertBot PRs',
        'title': f"{discussion['title']} by {original_author['name']} ({row['model_id']})",
        'url': row['pr_url'],
        'num_comments': num_non_empty_comments,
        'num_authors': len(authors),
        'avg_comment_size': avg_comment_size,
        # sorted so the exported CSV is identical from run to run
        'authors': ";".join(sorted(authors)),
    })

# explicit columns keep the export well-formed even when no PR qualifies
df_filtered = pd.DataFrame(records, columns=output_columns)
# save the filtered dataframe
df_filtered.to_csv(output_file, index=False)
print("Number of PRs after filtering: ", len(df_filtered))
print("Saved to: ", output_file)

100%|██████████| 45072/45072 [00:04<00:00, 10550.82it/s]

Number of PRs after filtering:  15
Saved to:  ../../results/sfconvertbot_pr_metadata_filtered.csv





## Extracting Conversion Functions

In [None]:
from tqdm import tqdm
import json

# Errors expected from malformed or incomplete metadata: invalid JSON
# (json.JSONDecodeError is a ValueError), missing keys, or values of an
# unexpected type. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
_METADATA_ERRORS = (ValueError, KeyError, TypeError)

# Build each new column as a list and attach it once after the loop, so all
# three columns exist afterwards even if parsing fails for every row.
conflicts_column, status_column, status_changes_column = [], [], []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Extract status changes from discussion metadata
    status_changes = []
    try:
        discussion = json.loads(row['discussion_metadata'])
        for event in discussion['discussion']['events']:
            if event['type'] == 'status-change':
                status_changes.append(event['data']['status'])
    except _METADATA_ERRORS:
        # best effort: keep whatever status changes were collected so far
        pass

    # Defaults when the header metadata cannot be used: missing status, no conflicts
    status, conflicts = float('nan'), ""
    try:
        header_metadata = json.loads(row['header_metadata'])
        status = header_metadata['discussion']['status']
        # check if there are conflicting files
        conflicts = ";".join(header_metadata['discussion']['filesWithConflicts'])
    except _METADATA_ERRORS:
        # best effort: a status read before the failure is still kept
        pass

    conflicts_column.append(conflicts)
    status_column.append(status)
    # track status changes as a ';'-separated string
    status_changes_column.append(";".join(status_changes))

df['conflicts'] = conflicts_column
df['status'] = status_column
df['status_changes'] = status_changes_column
df


In [None]:

import matplotlib.pyplot as plt

# Number of PRs per status, most frequent first
status_distribution = df['status'].value_counts()

# Plot the status distribution horizontally.
# Colors are assigned by frequency rank (the order of value_counts), not by status name.
fig, ax = plt.subplots()
status_distribution.plot(kind='barh', color=['black', 'green', 'red'], ax=ax)

# Add title and labels: in a horizontal bar chart the categories are on the
# y-axis and the counts on the x-axis
ax.set_title('Pull Request Status Distribution')
ax.set_xlabel('# Pull Requests')
ax.set_ylabel('Status')
plt.show()

# show pie chart in its own figure, titled like the bar chart
fig, ax = plt.subplots()
status_distribution.plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title('Pull Request Status Distribution')
plt.show()

## Merge Conflict Analysis

In [None]:
# PRs whose 'conflicts' column is non-empty, i.e. at least one conflicting file was recorded
conflicts = df[df['conflicts'] != ""]

num_prs = len(df)
num_conflicts = len(conflicts)
num_without_conflicts = num_prs - num_conflicts


def _percentage(part, total):
    """Return `part` as a percentage of `total`, or 0.0 when `total` is 0."""
    return part / total * 100 if total else 0.0


print("Number of PRs: ", num_prs)
print("Number of PRs with conflicts: ", num_conflicts)
print("Percentage of PRs with conflicts: ", _percentage(num_conflicts, num_prs))
print("Number of PRs without conflicts: ", num_without_conflicts)
print("Percentage of PRs without conflicts: ", _percentage(num_without_conflicts, num_prs))

# status distribution restricted to PRs with conflicts
conflicts_status_distribution = conflicts['status'].value_counts()

if conflicts_status_distribution.empty:
    # nothing to draw; plotting an empty series would raise
    print("No PRs with conflicts to plot.")
else:
    # Horizontal bar chart: categories on the y-axis, counts on the x-axis.
    # Colors are assigned by frequency rank, not by status name.
    fig, ax = plt.subplots()
    conflicts_status_distribution.plot(kind='barh', color=['black', 'green', 'red'], ax=ax)
    ax.set_title('Pull Request Status Distribution with Conflicts')
    ax.set_xlabel('# Pull Requests')
    ax.set_ylabel('Status')
    plt.show()

    # show pie chart in its own figure
    fig, ax = plt.subplots()
    conflicts_status_distribution.plot.pie(autopct='%1.1f%%', ax=ax)
    ax.set_title('Pull Request Status Distribution with Conflicts')
    plt.show()
