In [None]:
import json
import pandas as pd
import numpy  as np
import csv
import datetime
from urllib.request import urlopen

# Snapshot of "now", taken once so that every cell below agrees on the current year.
today = datetime.datetime.today()

# Show all data: lift pandas' display truncation for both columns and rows.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

# 'committee-info.json' contains, per committee, information such as roster_count,
# description, established, roster, etc. It can be downloaded from
# 'https://whimsy.apache.org/public/' and read from a local copy instead:
# committees_json = open('committee-info.json').read()

# or get the data directly from 'https://whimsy.apache.org/public/committee-info.json'
url = 'https://whimsy.apache.org/public/committee-info.json'
# The context manager closes the HTTP response as soon as the payload has been read.
with urlopen(url) as response:
    committees_json = response.read()
committees_info = json.loads(committees_json)
committees = committees_info['committees']



Create a dataframe that contains the members' information for each committee: committee name, name, id, date

In [None]:
# Build one frame per committee from its roster (index = roster key, columns
# 'name' and 'date') and concatenate them all in a single call. Growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row on each pass.
roster_frames = []
for item in committees:
    committee_info = pd.DataFrame.from_dict(committees[item]['roster'],orient='index',columns=['name','date'])
    committee_info['committee'] = item
    committee_info['description'] = committees[item]['description']
    roster_frames.append(committee_info)

committer = pd.concat(roster_frames,axis = 0)

# Parse the roster dates and split out the calendar parts used for grouping below.
committer['date'] = pd.to_datetime(committer['date'])
committer['year'] = committer['date'].dt.year
committer['month'] = committer['date'].dt.month
# Distinct committee names (set iteration order, i.e. not sorted).
committeeList = list(set(committer['committee']))



In [None]:
def completeYearColumn(df, end_year=None):
    """Fill in the years a committee has no row for, using a 'new' count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns 'year', 'new' and 'committee'. The committee
        value of the first row is used for every output row.
    end_year : int, optional
        Last year (inclusive) of the generated range. Defaults to the year of
        the module-level ``today``.

    Returns
    -------
    pandas.DataFrame
        Columns 'year', 'new', 'committee' with one row per year from the
        smallest year in ``df`` through ``end_year``. For a year present in
        ``df`` the first 'new' value found for it is used; otherwise 0.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError('completeYearColumn() needs at least one row')
    if end_year is None:
        end_year = today.year

    # int() keeps range() working when the 'year' column holds floats.
    year_list_new = list(range(int(df['year'].min()), int(end_year) + 1))

    # First 'new' value seen per year, collected in one pass instead of
    # filtering the frame once for every year in the range.
    first_new_by_year = {}
    for year, new in zip(df['year'], df['new']):
        first_new_by_year.setdefault(year, new)

    new_df = pd.DataFrame(columns=['year','new','committee'])
    new_df['year'] = year_list_new
    new_df['new'] = [first_new_by_year.get(year, 0) for year in year_list_new]
    new_df['committee'] = df['committee'].iloc[0]

    return new_df

def addTotalColumn(df):
    """Return a copy of ``df`` with a running 'total' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'committee', 'year' and 'new'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus 'total', where each row's total is
        the sum of 'new' over all rows whose year is less than or equal to
        that row's year. ``df`` itself is left unchanged.
    """
    dataframe = df.copy()
    # Sum 'new' per year, then accumulate over the (sorted) years: the value at
    # year y is the sum of 'new' for every year <= y.
    cumulative_by_year = dataframe.groupby('year')['new'].sum().cumsum()
    dataframe['total'] = dataframe['year'].map(cumulative_by_year)

    return dataframe

def dataframeToList(df):
    """Flatten a DataFrame into a list of rows preceded by a header row.

    Returns ``[column_names, row_0, row_1, ...]``: the first element is the
    list of column labels, followed by one list per row of ``df``.
    """
    header = list(df.columns)
    rows = np.array(df).tolist()
    return [header, *rows]


def groupByYear(raw_df):
    """Split a frame by year into per-year axis/value lists.

    ``raw_df`` needs the columns 'year', 'committee' and 'total'. The result
    maps each distinct year to ``{'yAxis': [...], 'value': [...]}``, holding
    that year's 'committee' and 'total' values in row order,
    e.g. ``{2019: {'yAxis': [...], 'value': [...]}}``.
    """
    def _axes_for(year):
        # Rows belonging to this year only.
        rows = raw_df[raw_df['year'] == year]
        return {
            'yAxis': list(rows['committee']),
            'value': list(rows['total']),
        }

    return {year: _axes_for(year) for year in set(raw_df['year'])}


def BoxplotData(raw_df):
    """Build the per-committee, per-year growth data.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        One row per roster entry with at least the columns 'committee',
        'year' and 'name'.

    Returns
    -------
    (list, dict)
        ``data_list`` is the combined frame converted by ``dataframeToList``
        (header row followed by data rows); ``result_dict`` is the same frame
        passed through ``groupByYear``.
    """
    # Count the roster entries per (committee, year); the count becomes 'new'.
    df = raw_df.groupby(['committee','year'])['name'].count().reset_index(name='new')

    # groupby walks the committees in sorted order, so the row order of the
    # result is the same on every run (iterating a set of strings is not).
    # The frames are collected first and concatenated once, instead of
    # re-copying the growing result on every iteration.
    frames = []
    for _, sub_df in df.groupby('committee'):
        # Add the missing years
        sub_df = completeYearColumn(sub_df)
        # Add a total column
        sub_df = addTotalColumn(sub_df)
        frames.append(sub_df)
    result_df = pd.concat(frames,axis=0,ignore_index=True)

    result_dict = groupByYear(result_df)

    # turn the dataframe to a list
    data_list = dataframeToList(result_df)

    return data_list, result_dict

boxplot, bar = BoxplotData(committer)


Summarise the committee information, including the year the committee was created, description, name, total number of people to date, etc.

In [None]:
from pandas import NA

# Extracts the creation time of the committee  e.g. "12/2000, reestablished 10/2002" => 12/2000
def extractDate(x):
    """Return the text before the first ',' or ';' in ``x``.

    Returns ``NA`` when ``x`` is not a non-empty string (None, NaN, '' ...).
    """
    # A missing value may arrive as a float NaN, which is truthy: a bare
    # `if x:` would let it through to .replace() and raise AttributeError.
    if not isinstance(x, str) or not x:
        return NA
    return x.replace(',',';').split(';')[0]

# Build the summary frame from the parsed JSON (committees_info) rather than
# from `committees`: this statement rebinds `committees` to a DataFrame, so
# reading from it would make the cell fail when it is run a second time.
committees = (
    pd.DataFrame.from_dict(
        committees_info['committees'],
        orient='index',
        columns=[ "display_name","established","roster_count"],
    )
    # The dict keys (committee ids) become a regular 'committee_name' column.
    .rename_axis('committee_name')
    .reset_index()
)
# Keep only the first date of entries such as "12/2000, reestablished 10/2002".
committees['established'] = pd.to_datetime(committees['established'].map(extractDate))
committees['description'] = [committees_info['committees'][name]['description'] for name in committees['committee_name']]

Process the data to graph the annual growth in committee size: yAxis, xAxis, value=[[x1,y1,size1],[x1,y2,size2]]

Since the total number of committees is too large for the graph to display, the data is filtered: committees whose maximum annual growth is less than 15 are removed from the display.

In [None]:
# Merge the year and committee columns to create a new column for easy grouping of data


from unittest import result


def dataFilter(raw_df, min_size=15):
    """Drop every committee whose largest 'size' value is below ``min_size``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame with the columns 'committee' and 'size'.
    min_size : number, optional
        A committee is kept when its maximum 'size' is not below this value.
        Defaults to 15.

    Returns
    -------
    pandas.DataFrame
        The rows of ``raw_df`` that belong to the kept committees, in their
        original order. ``raw_df`` itself is not modified.
    """
    # Maximum 'size' of each row's committee, aligned row by row with raw_df.
    largest = raw_df.groupby('committee')['size'].transform('max')
    # `~(largest < min_size)` rather than `largest >= min_size`, so that rows
    # without a comparable maximum (NaN) are kept instead of dropped.
    keep = ~(largest < min_size)
    # Select by position: dropping by index label would also remove unrelated
    # rows whenever index labels repeat.
    return raw_df.loc[keep.to_numpy()]

def chartData(raw_df):
    """Convert (committee, year, size) rows into axis lists plus indexed points.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame with the columns 'committee', 'year' and 'size'.

    Returns
    -------
    dict
        'yAxis': the distinct committees, sorted;
        'xAxis': the distinct years, sorted;
        'size':  one ``[yAxis index, xAxis index, size]`` triple per row of
                 ``raw_df``, in row order.
    """
    # Sorted so that the axis order (and therefore every index stored in
    # 'size') is the same on every run; iterating a set of strings is not.
    yAxis_value = sorted(set(raw_df['committee']))
    xAxis_value = sorted(set(raw_df['year']))

    # Position lookups by dict instead of list.index() for every row.
    y_position = {committee: i for i, committee in enumerate(yAxis_value)}
    x_position = {year: i for i, year in enumerate(xAxis_value)}

    size_value = [
        [y_position[committee], x_position[year], size]
        for committee, year, size in zip(raw_df['committee'], raw_df['year'], raw_df['size'])
    ]

    return {
        'yAxis':yAxis_value,
        'xAxis':xAxis_value,
        'size':size_value
    }

def getTag(str):
    """Return the upper-cased first character of ``str``.

    Returns '' when ``str`` is missing (None / NaN) or is an empty string.
    The parameter name shadows the built-in ``str``; it is kept so that
    existing keyword callers continue to work.
    """
    # `not str` covers '', where str[0] would raise IndexError.
    if pd.isna(str) or not str:
        return ''
    return str[0].upper()
    
def addTag(raw_df):
    """Return a copy of ``raw_df`` with a 'tag' column.

    'tag' is ``getTag`` applied to each value of the 'committee' column.
    """
    # assign() builds a new frame, so the caller's DataFrame does not
    # silently gain a 'tag' column as a side effect.
    return raw_df.assign(tag=raw_df['committee'].apply(getTag))

def getData(raw_df):
    """Count the rows per (committee, year) and shape the counts with chartData.

    ``raw_df`` needs the columns 'committee', 'year' and 'name'; the non-null
    'name' count of each group is passed on as the 'size' column.
    """
    names_per_group = raw_df.groupby(['committee','year'])['name'].count()
    # reset_index(name=...) turns the grouped index levels back into columns
    # and names the count column in one step.
    sized = names_per_group.reset_index(name='size')
    return chartData(sized)

def ScatterData(raw_df):
    """Build one chart-data dict per tag.

    The frame is tagged with ``addTag``; for every distinct value of the
    resulting 'tag' column, the rows carrying that tag are passed to
    ``getData``. Returns ``{tag: getData(rows with that tag)}``.
    """
    tagged = addTag(raw_df)
    return {
        tag: getData(tagged[tagged['tag'] == tag])
        for tag in set(tagged['tag'])
    }


scatter = ScatterData(committer)


Subchart: monthly growth graph for the selected committee 

In [None]:

def completeMonthColumn(df):
    """Build, per committee, a gap-free monthly series of 'add' counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'committee', 'year', 'month' and 'add'.
        It is not modified.

    Returns
    -------
    dict
        One entry per name in the module-level ``committeeList``:
        'xAxis'  - 'year-month' labels (month not zero-padded) for every month
                   from January of the committee's first year in ``df``
                   through December of ``today.year``;
        'values' - the 'add' count for each label, 0 where ``df`` has no row;
        'description', 'established' - copied from ``committees_info``.
        A committee without any row in ``df`` gets empty 'xAxis'/'values'.
    """
    # Cast to int before formatting: when the year/month columns are floats,
    # str() yields labels such as '2004.0-3.0' that never match the generated
    # 'year-month' labels, and every count would silently become 0.
    labelled = df.assign(
        year_month=df['year'].astype(int).astype(str) + '-' + df['month'].astype(int).astype(str)
    )
    result_dict = {}

    for committee in committeeList:
        sub_df = labelled[labelled['committee'] == committee]

        # First 'add' value per label, so each month is a dict lookup rather
        # than a list scan plus a frame filter.
        add_by_label = {}
        for label, added in zip(sub_df['year_month'], sub_df['add']):
            add_by_label.setdefault(label, added)

        if sub_df.empty:
            # No rows for this committee: min() would be NaN and int() would fail.
            date_list_new = []
        else:
            first_year = int(sub_df['year'].min())
            date_list_new = [str(year) + '-' + str(month) for year in range(first_year,today.year + 1) for month in range(1,13)]

        add_list = [add_by_label.get(label, 0) for label in date_list_new]

        result_dict[committee] = {
            'xAxis':date_list_new,
            'values':add_list,
            'description':committees_info['committees'][committee]["description"],
            'established':committees_info['committees'][committee]["established"]
        }

    return result_dict
            
def lineData(raw_df):
    """Count the rows per (committee, year, month) and fill in the missing months.

    The non-null 'name' count of each group is handed to
    ``completeMonthColumn`` as the 'add' column; its result is returned as is.
    """
    monthly_counts = (
        raw_df.groupby(['committee','year','month'])['name']
        .count()
        .reset_index(name='add')
    )
    return completeMonthColumn(monthly_counts)


committee_detail_dict = lineData(committer)

Fetch the podlings data

In [None]:
podlings_url = 'https://whimsy.apache.org/public/public_podlings.json'
# The context manager closes the HTTP response as soon as the payload has been read.
with urlopen(podlings_url) as response:
    podlings_json = response.read()
podlings_info = json.loads(podlings_json)
Podlings = podlings_info['podling']
# The data could also be written to a JSON file for later lookup

In [None]:
# Empty placeholder frame.
# NOTE(review): nothing in the visible cells reads `podlings` afterwards - confirm before removing.
podlings = pd.DataFrame()

def getY_M(date_str, fallback=None):
    """Reduce a '-'-separated date string to its 'year-month' prefix.

    Parameters
    ----------
    date_str : str or missing value
        Date such as '2019-03-15'. Only the first two '-'-separated fields
        are kept.
    fallback : datetime.datetime, optional
        Date to use when ``date_str`` is missing (None / NaN). Defaults to the
        module-level ``today``.

    Returns
    -------
    str
        'year-month' taken from ``date_str``, or the fallback formatted as
        '%Y-%m'. Both branches produce the same shape, so a column built with
        this function can be parsed with one consistent date format.
    """
    if pd.isna(date_str):
        if fallback is None:
            fallback = today
        return fallback.strftime('%Y-%m')
    fields = date_str.split('-')
    return '-'.join([fields[0],fields[1]])


# One row per podling, restricted to the four source columns used below, plus
# 'startmonth' / 'endmonth': the start and end dates reduced by getY_M and
# parsed into timestamps.
podlings_status_info = (
    pd.DataFrame.from_dict(Podlings,orient='index')[['name','status','startdate','enddate']]
    .assign(
        startmonth=lambda frame: pd.to_datetime(frame['startdate'].map(getY_M)),
        endmonth=lambda frame: pd.to_datetime(frame['enddate'].map(getY_M)),
    )
)

# Every year from the earliest podling start through the current year.
_first_podling_year = int(podlings_status_info['startmonth'].min().year)
_podling_years = range(_first_podling_year,today.year + 1)

# 'year-month' labels (month not zero-padded) and plain year labels, as strings.
podlings_dateList = [str(year) + '-' + str(month) for year in _podling_years for month in range(1,13)]
Podlings_yearList = [str(year) for year in _podling_years]



# dataMin = podlings_status_info['startdate'].min().strftime('%Y-%m')
# dateList=[]


In [None]:
# def getCurrent(date_str,raw_df):
#     year = date_str.split('-')[0]
#     month = date_str.split('-')[1]
#     df = raw_df[(raw_df['startmonth'].dt.date <= datetime.date(int(year),int(month),1)) & (raw_df['endmonth'].dt.date >= datetime.date(int(year),int(month),1))]
#     return df.count()['name'].item()

# def getGraduated(date_str,raw_df):
#     year = date_str.split('-')[0]
#     month = date_str.split('-')[1]
#     df = raw_df[(raw_df['status'] == 'graduated') & (raw_df['endmonth'].dt.date <= datetime.date(int(year),int(month),1))]
#     return - df.count()['name'].item()

# def getRetired(date_str,raw_df):
#     year = date_str.split('-')[0]
#     month = date_str.split('-')[1]
#     df = raw_df[(raw_df['status'] == 'retired') & (raw_df['endmonth'].dt.date <= datetime.date(int(year),int(month),1))]
#     return - df.count()['name'].item()

# def status(date_str,raw_df):
#     year = date_str.split('-')[0]
#     month = date_str.split('-')[1]
#     return {
#         'new':list(raw_df[raw_df['startmonth'].dt.date == datetime.date(int(year),int(month),1)]['name']),
#         'graduate':list(raw_df[(raw_df['status'] == 'graduated') & (raw_df['endmonth'].dt.date == datetime.date(int(year),int(month),1))]['name']),
#         'retire':list(raw_df[(raw_df['status'] == 'retired') & (raw_df['endmonth'].dt.date == datetime.date(int(year),int(month),1))]['name'])
#     }
# def BarData_DATE():
#     BarData={}
#     BarData['dateList'] = podlings_dateList
#     BarData['current'] =[]
#     BarData['graduated'] = []
#     BarData['retired'] = []
#     BarData['status']={}
#     for date in podlings_dateList:
#         BarData['current'].append(getCurrent(date,podlings_status_info))
#         BarData['graduated'].append(getGraduated(date,podlings_status_info))
#         BarData['retired'].append(getRetired(date,podlings_status_info))
#         BarData['status'][date] = status(date,podlings_status_info)
#     return BarData

# BarData_DATE = BarData_DATE()

In [None]:
def getCurrent(year,raw_df):
    """Names of the rows whose 'startmonth' falls in ``year``.

    ``year`` may be an int or a numeric string; 'startmonth' must be a
    datetime column.
    """
    started_in_year = raw_df['startmonth'].dt.year.eq(int(year))
    return raw_df.loc[started_in_year, 'name'].tolist()

def getGraduated(year,raw_df):
    """Names of the rows with status 'graduated' whose 'endmonth' falls in ``year``.

    ``year`` may be an int or a numeric string; 'endmonth' must be a
    datetime column.
    """
    is_graduated = raw_df['status'].eq('graduated')
    ended_in_year = raw_df['endmonth'].dt.year.eq(int(year))
    return raw_df.loc[is_graduated & ended_in_year, 'name'].tolist()

def getRetired(year,raw_df):
    """Names of the rows with status 'retired' whose 'endmonth' falls in ``year``.

    ``year`` may be an int or a numeric string; 'endmonth' must be a
    datetime column.
    """
    is_retired = raw_df['status'].eq('retired')
    ended_in_year = raw_df['endmonth'].dt.year.eq(int(year))
    return raw_df.loc[is_retired & ended_in_year, 'name'].tolist()


def BarData_YEAR():
    """Collect the per-year podling figures for every year in ``Podlings_yearList``.

    Returns a dict with:
      'yearList'  - ``Podlings_yearList`` itself;
      'new'       - number of names returned by ``getCurrent`` for each year;
      'graduated' - negated number of names from ``getGraduated`` for each year;
      'retired'   - negated number of names from ``getRetired`` for each year;
      'status'    - per year, the three name lists themselves.
    All lookups run against the module-level ``podlings_status_info``.
    """
    new_counts = []
    graduated_counts = []
    retired_counts = []
    names_by_year = {}

    for year in Podlings_yearList:
        started = getCurrent(year,podlings_status_info)
        graduated = getGraduated(year,podlings_status_info)
        retired = getRetired(year,podlings_status_info)

        new_counts.append(len(started))
        # Stored as negative numbers.
        graduated_counts.append(-len(graduated))
        retired_counts.append(-len(retired))

        names_by_year[year] = {
            'new':started,
            'graduated':graduated,
            'retired':retired
        }

    return {
        'yearList':Podlings_yearList,
        'new':new_counts,
        'graduated':graduated_counts,
        'retired':retired_counts,
        'status':names_by_year
    }

# NOTE: this rebinds the name BarData_YEAR from the function to its result, so
# the function can only be called again after this cell's `def` is re-executed.
BarData_YEAR = BarData_YEAR()

Store the data


In [None]:
# Bundle every computed data set under one dict, keyed by data-set name.
committer_dic = dict(
    scatter=scatter,
    boxplot=boxplot,
    bar=bar,
    committee_detail=committee_detail_dict,
    BarData_YEAR=BarData_YEAR,
)

file_name = 'committer.json'

# Serialize the bundle as JSON into the working directory.
with open(file_name,'w') as file_obj:
    json.dump(committer_dic,file_obj)