# Predicting the type of stasis based on GitHub data

The notebook contains the source code for predicting different types of stasis
1. The data for two projects is also provided along with the notebook
2. The code has two parts - the first part processes the data and generates labels - non-stasis, non-approval, non-convergence and non-contribution
3. The second part generates multiple data points for each GitHub issue and trains the model on them
4. The model is evaluated on the test data

In [None]:
import os
import argparse
import numpy as np
import pandas as pd
import sys
from datetime import datetime
# Workspace setup for processing all actions of the development process.
# Directory that holds the input CSVs and receives every file the notebook
# writes. Keep the trailing slash: the raw-data path is built from it by plain
# string concatenation.
MY_WORKSPACE_DIR = "/content/drive/My Drive/DNT_Data/"

# Mount Google Drive at /content/drive so that MY_WORKSPACE_DIR is reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

## Processing issue data from GitHub
1. Adds additional information such as the time since the first event of the issue and the time since the previous event
2. Adds an event number for each contribution
3. Removes all contributions made after an issue is closed

In [None]:

# Read the raw issue events (one row per action on an issue).
node_file = MY_WORKSPACE_DIR + 'issue_devdata_flask.csv'  # or issue_dev_data_rasa.csv
timestamp_col = 'date'
# Layout of the raw timestamps in the CSV: day/month/year hour:minute.
DATE_FORMAT = '%d/%m/%Y %H:%M'


def dateparse(x):
    """Parse one raw timestamp value; kept so existing references keep working."""
    return datetime.strptime(str(x), DATE_FORMAT)


# The timestamps are parsed after loading, with an explicit format: the
# `date_parser` argument of read_csv is deprecated since pandas 2.0, and a fixed
# format rules out day/month guessing.
df_dev = pd.read_csv(node_file)
df_dev[timestamp_col] = pd.to_datetime(df_dev[timestamp_col], format=DATE_FORMAT)

def extract_timestamp_features(group):
    """Add elapsed-time and ordering columns to the events of one issue.

    Three columns are added to the rows of *group*:

    * ``timesincelast``  - days between an event and the event before it
      (0 for the earliest event),
    * ``timesincestart`` - days between an event and the earliest event,
    * ``event_nr``       - 1-based position of the event in chronological order.

    The timestamp column name is read from the module-level ``timestamp_col``.
    Returns the group sorted by that column in ascending order.
    """
    def _as_days(delta):
        return float(delta / np.timedelta64(1, 'D'))  # D for days

    zero = pd.Timedelta(seconds=0)

    # Newest first: the row below each event is then its chronological
    # predecessor, and the last row is the earliest event of the issue.
    newest_first = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    stamps = newest_first[timestamp_col]

    gap_to_previous = (stamps - stamps.shift(-1)).fillna(zero)
    newest_first["timesincelast"] = gap_to_previous.apply(_as_days)

    gap_to_first = (stamps - stamps.iloc[-1]).fillna(zero)
    newest_first["timesincestart"] = gap_to_first.apply(_as_days)

    oldest_first = newest_first.sort_values(timestamp_col, ascending=True, kind='mergesort')
    oldest_first["event_nr"] = range(1, len(oldest_first) + 1)
    return oldest_first

relevant_action = 'closed'


def cut_before_action(group):
    """Truncate an issue's events right after the first ``relevant_action``.

    Rows up to and including the first row whose ``action`` equals the
    module-level ``relevant_action`` are kept. When no row matches, the group
    is returned unchanged.
    """
    hits = np.flatnonzero(np.asarray(group['action'] == relevant_action))
    if hits.size == 0:
        return group
    return group[:hits[0] + 1]



# Missing user ids are replaced by the literal 'NA'.
df_dev['userid'] = df_dev['userid'].fillna('NA')

# Per-issue time features; whatever index groupby/apply produces is discarded.
df_dev = (
    df_dev.reset_index(drop=True)
    .groupby('id')
    .apply(extract_timestamp_features)
    .reset_index(drop=True)
)

# Order by issue and time, then keep each issue only up to its first close event.
df_dev = (
    df_dev.sort_values(['id', timestamp_col], kind='mergesort')
    .groupby('id')
    .apply(cut_before_action)
    .reset_index(drop=True)
)
print(df_dev.groupby('id').count())

### Identify the processing time for each issue

In [None]:
# Total processing time per issue: whole days between its first and last event.
df_dev = df_dev.sort_values([timestamp_col], ascending=True, kind='mergesort')
# Aggregators are given by name; passing the min/max builtins to agg is deprecated in pandas.
dt_first_last_timestamps = df_dev.groupby('id')[timestamp_col].agg(['min', 'max'])
dt_first_last_timestamps.columns = ["start_time", "end_time"]
dt_first_last_timestamps['total_time'] = (
    dt_first_last_timestamps['end_time'] - dt_first_last_timestamps['start_time']
).dt.days
# Only total_time is needed from here on. drop() returns a new frame, so the
# result has to be assigned for the call to have any effect.
dt_first_last_timestamps = dt_first_last_timestamps.drop(['start_time', 'end_time'], axis=1)
# The right-hand side is indexed by 'id', which is what `on='id'` matches against.
df_dev = df_dev.merge(dt_first_last_timestamps['total_time'], on='id', how='left')


### Create labeling functions

We use the following parameters 
Issues that take longer than the median time to close are considered to be in some form of stasis; by default this is labelled non-approval
1. Low number of contributors and low number of contributions - non-contribution
2. High number of contributors and high number of contributions - non-convergence
3. Time taken to merge a pull-request or close the issue - non-approval
4. Store the file and then use it for the next stage of prediction

In [None]:

print(df_dev.groupby('id').count())
# Keep only issues whose events span more than one day.
df_dev = df_dev[df_dev['total_time'] > 1]
print(df_dev.groupby('id').count())
# Quantiles are taken on the one numeric column of interest: DataFrame.quantile
# over the whole frame raises on the non-numeric columns in pandas >= 2.0.
# The quantiles are computed over event rows, not over issues.
ttime = df_dev['total_time'].quantile([0.50, 0.75])
print(ttime)

# Number of events per issue.
df_num_actions = df_dev.groupby('id').size().reset_index(name='counts')
# Number of distinct contributors per issue.
df_contributors = df_dev.groupby('id')['userid'].nunique().reset_index(name='num_users')

# NOTE: the low/high thresholds used for labelling are fixed constants in the
# labelling cell; deriving them from quantiles of `counts` and `num_users`
# (0.5/0.75 and 0.25/0.75 respectively) was tried and left disabled.

df_dev.reset_index(inplace=True, drop=True)
df_dev = df_dev.merge(df_num_actions[['id', 'counts']], on='id', how='outer')
df_dev = df_dev.merge(df_contributors[['id', 'num_users']], on='id', how='outer')

df_dev

In [None]:
# Histogram of the `approve_date` column, to inspect its distribution.
# NOTE(review): the labelling step compares this column against a numeric
# threshold, so it presumably holds a duration in days rather than a date - confirm.
df_dev['approve_date'].hist()

### Generate the pre-processed file

In [None]:
# Assign a class label to every event row:
#   0 = no stasis (default), 1 = non-approval, 2 = non-contribution, 3 = non-convergence
low_contrib = 3    # "few contributors": at most this many distinct users
low_act = 5        # "few contributions": at most this many events
high_contrib = 5   # "many contributors": at least this many distinct users
high_act = 12      # "many contributions": at least this many events
stasis_days = 75   # total_time above this marks the issue as stalled
approval_days = 15  # approve_date at or above this marks the issue as non-approval

# Stalled issues start out as non-approval (1); everything else is 0.
df_dev["class_label"] = (df_dev['total_time'] > stasis_days).astype(int)

# All updates go through df.loc[mask, column]: the chained form
# df[column].loc[mask] = value may write to a temporary copy and is silently
# ineffective under pandas copy-on-write.
df_dev.loc[df_dev['approve_date'] >= approval_days, 'class_label'] = 1  # non-approval

df_dev.loc[
    (df_dev['counts'] <= low_act)
    & (df_dev['num_users'] <= low_contrib)
    & (df_dev['class_label'] == 1),
    'class_label',
] = 2  # non-contribution

df_dev.loc[
    (df_dev['counts'] >= high_act)
    & (df_dev['num_users'] >= high_contrib)
    & (df_dev['class_label'] == 1),
    'class_label',
] = 3  # non-convergence


print(df_dev.groupby('class_label').count())
print(df_dev.groupby('id').count())

# Persist the labelled events for the prediction stage.
df_dev.to_csv(os.path.join(MY_WORKSPACE_DIR, "process_devprocess_flask.csv"), sep=";", index=False)

### Utility functions 
1. Read the data file
2. Add checkpoint data
3. Generate sub-execution data

In [None]:
from datetime import datetime, timedelta

def read_dataset(filename):
    """Load a pre-processed, ``;``-separated event file into a DataFrame.

    ``userid``, ``action`` and ``date`` are read as strings and the time and
    position columns as floats; ``date`` is then converted to datetimes.
    """
    text_columns = ('userid', 'action', 'date')
    float_columns = ('timesincelast', 'timesincestart', 'event_nr', 'total_time')

    column_types = dict.fromkeys(text_columns, "object")
    column_types.update(dict.fromkeys(float_columns, "float"))

    frame = pd.read_csv(filename, sep=";", dtype=column_types)
    frame['date'] = pd.to_datetime(frame['date'])
    return frame

# Checkpoint horizon in days, read by add_checkpoint.
# generate_subcontribution_data rebinds it before each pass.
num_days = 1


def add_checkpoint(group):
    """Append a synthetic ``check_point`` event to the events of one issue.

    The new row is a copy of the last row of *group* with
    ``action`` set to ``'check_point'``, ``date`` set to the first event's date
    plus the module-level ``num_days``, ``timesincestart`` set to ``num_days``
    and ``timesincelast`` set to the days between the last event and the
    checkpoint. It keeps the index label of the row it was copied from.

    Returns a new DataFrame; *group* itself is not modified.
    """
    # Explicit copy so the edits below never touch (or warn about) `group`.
    checkpoint = group.iloc[[-1]].copy()
    last_event_date = group['date'].iloc[-1]
    checkpoint_date = group['date'].iloc[0] + timedelta(days=num_days)

    checkpoint['action'] = 'check_point'
    checkpoint['date'] = checkpoint_date
    checkpoint['timesincestart'] = float(num_days)
    # Scalar arithmetic: no float() call on a one-element Series.
    checkpoint['timesincelast'] = float(
        (checkpoint_date - last_event_date) / np.timedelta64(1, 'D')
    )

    # pd.concat instead of DataFrame.append, which no longer exists in pandas 2.x.
    return pd.concat([group, checkpoint])

def generate_subcontribution_data(data, min_length, max_length, time_span=9):
    """Expand every issue into a series of time-bounded prefixes ("sub-issues").

    The result contains

    * a base trace per issue: its first ``min_length`` events, ``issue_nr`` 1,
      original ``id``;
    * for every horizon ``nr_days`` in
      ``range(min_length + time_span, max_length + 1, time_span)``: the events
      with ``timesincestart <= nr_days`` (at most ``nr_days`` of them per issue)
      followed by a ``check_point`` row from ``add_checkpoint``, with ``id``
      rewritten to ``"<id>_<nr_days>"`` and ``issue_nr`` set to ``nr_days``.

    ``orig_id`` always holds the original issue id and ``case_length`` is capped
    at ``max_length``.

    Side effects: adds a ``case_length`` column to *data*, rebinds the
    module-level ``num_days`` (read by ``add_checkpoint``) and prints each horizon.
    """
    # add_checkpoint takes its horizon from this module-level variable.
    global num_days

    data['case_length'] = data.groupby('id')['action'].transform(len)

    # Explicit copies: new columns are added to these selections below.
    long_enough = data['case_length'] >= min_length
    dt_contribs = data[long_enough].groupby('id').head(min_length).copy()
    dt_contribs["issue_nr"] = 1
    dt_contribs["orig_id"] = dt_contribs['id']

    for nr_days in range(min_length + time_span, max_length + 1, time_span):
        within_horizon = data['timesincestart'] <= nr_days
        tmp = data[within_horizon].groupby('id').head(nr_days).copy()
        tmp["orig_id"] = tmp['id']
        tmp['id'] = tmp['id'].apply(lambda x: "%s_%s" % (x, nr_days))
        tmp["issue_nr"] = nr_days

        num_days = nr_days
        print(num_days)
        # group_keys=False keeps 'id' a plain column instead of also turning it
        # into an index level, which would make later groupby('id') ambiguous.
        tmp = tmp.groupby('id', group_keys=False).apply(add_checkpoint)
        dt_contribs = pd.concat([dt_contribs, tmp], axis=0)

    dt_contribs['case_length'] = dt_contribs['case_length'].clip(upper=max_length)

    return dt_contribs

def split_data(data, train_ratio, split="temporal", seed=20):
    """Split events into train and test sets by whole issues.

    Issues are ordered by the date of their first event (``split="temporal"``)
    or shuffled with NumPy's global RNG seeded by *seed* (``split="random"``);
    any other value leaves them in group-key order. The first
    ``int(train_ratio * n_issues)`` issues form the train set, the rest the
    test set.

    Returns ``(train, test)``, each sorted by ``date`` in ascending order.
    """
    first_seen = data.groupby('id')['date'].min().reset_index()

    if split == "temporal":
        first_seen = first_seen.sort_values('date', ascending=True, kind="mergesort")
    elif split == "random":
        np.random.seed(seed)
        shuffled = np.random.permutation(first_seen.index)
        first_seen = first_seen.reindex(shuffled)

    n_train = int(train_ratio * len(first_seen))
    train_ids = first_seen['id'].tolist()[:n_train]
    in_train = data['id'].isin(train_ids)

    def _by_date(frame):
        return frame.sort_values('date', ascending=True, kind='mergesort')

    return (_by_date(data[in_train]), _by_date(data[~in_train]))

def get_label(data):
    """Return the first non-null ``class_label`` of every ``id`` as a Series indexed by id."""
    labels_by_id = data.groupby('id')['class_label']
    return labels_by_id.first()

def get_issue_lengths(data):
    """Return the last non-null ``issue_nr`` of every ``id`` as a Series indexed by id."""
    issue_nr_by_id = data.groupby('id')["issue_nr"]
    return issue_nr_by_id.last()

### Feature encoding 

In [None]:
#encoder to encode it using aggregation - you may end up with too many resources.

from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np
from time import time
import sys

class AggregateTransformer(TransformerMixin):
    """Encode each case as one row of aggregated event features.

    For every value of ``case_id_col`` the transformer produces

    * one column per dummy-encoded value of the ``cat_cols``: the number of
      occurrences in the case, or their per-case maximum when ``boolean=True``;
    * when ``num_cols`` is non-empty: ``<col>_mean``, ``<col>_max`` and
      ``<col>_sum`` per numeric column, plus ``actcount``, the number of events
      in the case.

    The output column layout is frozen by the first ``transform`` call and
    reused afterwards: columns missing from later data are added as zeros and
    unseen columns are dropped. ``fit`` deliberately does not reset that layout,
    so calling ``fit_transform`` on later data still yields the first layout.
    """

    def __init__(self, case_id_col, cat_cols, num_cols, boolean=False, fillna=True):
        self.case_id_col = case_id_col
        self.cat_cols = cat_cols
        self.num_cols = num_cols

        self.boolean = boolean
        self.fillna = fillna

        # Output column layout; set by the first transform() call.
        self.columns = None

        self.fit_time = 0
        self.transform_time = 0

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the column layout is learned in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Aggregate the events in *X* into one feature row per case.

        Returns a DataFrame indexed by ``case_id_col``. Also records the elapsed
        wall-clock time in ``transform_time``.
        """
        start = time()
        has_numeric = len(self.num_cols) > 0

        # Events per case. The Series is named explicitly, because an unnamed
        # Series would surface in the concat below as a column labelled 0.
        dt_contrib = X.groupby(self.case_id_col).size().rename('actcount')

        # transform numeric cols
        if has_numeric:
            dt_numeric = X.groupby(self.case_id_col)[self.num_cols].agg(['mean', 'max', 'sum'])
            dt_numeric.columns = ['_'.join(col).strip() for col in dt_numeric.columns.values]

        # transform cat cols
        dt_transformed = pd.get_dummies(X[self.cat_cols])
        dt_transformed[self.case_id_col] = X[self.case_id_col]
        per_case = dt_transformed.groupby(self.case_id_col)
        dt_transformed = per_case.max() if self.boolean else per_case.sum()

        # concatenate; the event count is only attached together with numeric features
        if has_numeric:
            dt_transformed = pd.concat([dt_transformed, dt_numeric, dt_contrib], axis=1)

        # fill missing values with 0-s
        if self.fillna:
            dt_transformed = dt_transformed.fillna(0)

        if self.columns is None:
            self.columns = dt_transformed.columns
        else:
            # One reindex aligns to the stored layout: missing columns are added
            # as 0 and unknown ones dropped, without inserting column by column.
            dt_transformed = dt_transformed.reindex(columns=self.columns, fill_value=0)

        self.transform_time = time() - start
        return dt_transformed

    def get_feature_names(self):
        """Return the stored output columns (``None`` before the first transform)."""
        return self.columns

### Experiments - Predicting stasis
To run this step, the preprocessed files must already have been generated.
The following steps are performed:
1. The data is split into train and test sets
2. The sub-contributions for each issue are extracted
3. Generate the data points by encoding features
4. Data imbalance is addressed by using SMOTE
5. Grid search helps tune the parameters
6. Get the best params and the trained model


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.pipeline import FeatureUnion, Pipeline
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE



file_name = os.path.join(MY_WORKSPACE_DIR, "process_devprocess_flask.csv")
data = read_dataset(file_name)
# Strip square brackets from the user ids.
data['userid'] = data['userid'].map(lambda x: str(x).replace('[', '').replace(']', ''))
print(data.groupby('class_label').count())

train_ratio = 0.70

random_state = 25

# Search space for the optional grid search further down.
parameters = {'learning_rate': [0.25, 0.5, 0.75],
              'subsample': [0.5, 0.75],
              'max_depth': [4, 6, 10],
              'colsample_bytree': [0.25, 0.75],
              'min_child_weight': [1, 3, 5]}


# Earlier issues go to train, later ones to test.
train, test = split_data(data, train_ratio, split="temporal")

# Expand both sets into sub-issues (prefix length 1, horizons up to 100 days).
dt_test_contrib = generate_subcontribution_data(test, 1, 100)
dt_train_contrib = generate_subcontribution_data(train, 1, 100)

dt_train_contrib.to_csv(os.path.join(MY_WORKSPACE_DIR, "subissues_flask.csv"), sep=";", index=False)

# Accumulators filled by the evaluation cell.
preds_all = []
test_y_all = []
nr_events_all = []

dt_train_bucket = dt_train_contrib
dt_test_bucket = dt_test_contrib
# One label per sub-issue id; the encoded features below are grouped by the same id.
train_y = get_label(dt_train_bucket)
test_y = get_label(dt_test_contrib)
unique, counts = np.unique(train_y, return_counts=True)
print(unique, counts)
unique, counts = np.unique(test_y, return_counts=True)
print(unique, counts)


atx = AggregateTransformer('id', ['action', 'userid'], ['timesincelast', 'timesincestart'])

#Flask - 0.8596004575521757
#Flask - {'colsample_bytree': 0.75, 'learning_rate': 0.5, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.75}
# NOTE(review): the recorded best parameters list colsample_bytree=0.75, while
# the classifier below is built with 0.25 - confirm which value is intended.
cls = xgb.XGBClassifier(objective='multi:softprob',
                        num_class=4,
                        n_estimators=100,
                        learning_rate=0.5,
                        subsample=0.75,
                        max_depth=10,  #10
                        colsample_bytree=0.25,  #0.25
                        min_child_weight=1,  #1
                        seed=random_state)


# The first transform call also fixes the feature layout reused for the test data.
x_train = atx.fit_transform(dt_train_bucket)
print('Training', x_train)
# Balance the classes by oversampling; seeded so that repeated runs produce the
# same synthetic samples.
oversample = SMOTE(random_state=random_state)
x_new, y_new = oversample.fit_resample(x_train, train_y)
x_newdf = pd.DataFrame(x_new, columns=x_train.columns)

##########Required for grid search
#print(x_train.columns)
#cv = GridSearchCV(cls, parameters, scoring = 'f1_macro',cv=3, n_jobs= -1)
#cv.fit(x_newdf, y_new)
#print(cv.best_score_)
#print(cv.best_params_)
#########################
#cls = cv.best_estimator_

cls.fit(x_newdf, y_new)
# Predictions are made separately for each sub-issue in the evaluation cell.
test_all_grouped = dt_test_contrib.groupby('id')
 # predict separately for each issue

### process the results based on days passed since creation of issue

In [None]:


for _, group in test_all_grouped:
    pr_len = get_issue_lengths(group)                    
    test_y_all.extend(get_label(group))
    x_test = atx.fit_transform(group)
    x_testdf = pd.DataFrame(x_test, columns=atx.get_feature_names())
    pred = cls.predict(x_testdf)
    #print(pred)
    preds_all.extend(pred)
    nr_events_all.extend(pr_len)

num_days = []
f1_list = []
f1_label=[]

      
dt_results = pd.DataFrame({"actual": test_y_all, "predicted": preds_all, "nr_events": nr_events_all})

for nr_events, group in dt_results.groupby("nr_events"):
    if len(set(group.actual)) < 2:
        print(nr_events, " macof1", np.nan)
    else:
        score=f1_score(group.actual, np.round(group.predicted), average='weighted')
        cls_report = classification_report(group.actual, np.round(group.predicted), output_dict=True)
        print( nr_events, "macrof1", score )
        num_days.append(nr_events)
        f1_list.append(score)
        f1_label.append("weighted")
        print(cls_report)
        for label,val in cls_report.items():
          if type(val)==dict:
            f1_list.append(val['f1-score'])
            f1_label.append(label)
            num_days.append(nr_events)

#print(f1_score(dt_results.actual, np.round(dt_results.predicted)), average='weighted'   )

dt_plot = pd.DataFrame({"num_days": num_days, "label": f1_label, "score": f1_list})


import seaborn as sns # for data visualization
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
 
# Draw line plot of 
sns.lineplot(x = "num_days", y = "score", data = dt_plot, hue = "label",
            style = "label", palette = "hot", dashes = False, legend="brief",)
 
plt.title("F1-score of prediction", fontsize = 20) # for title
plt.xlabel("Number of days", fontsize = 15) # label for x-axis
plt.ylabel("f1-score", fontsize = 15) # label for y-axis
plt.show()
