# Features from GitHub mined data
<!-- Sharif Ahmed -->

Data Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


In [None]:
# Load the mined artefacts: pull requests (prs), pull-request comments (prcs),
# issues (iss), issue comments (iscs) and the commit log (commits).
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so only
# load pickle files that were produced by our own mining step.
prs = pd.read_pickle('../data/mined/all/cass_prs.pkl')
prcs = pd.read_pickle('../data/mined/all/cass_pr_comments.pkl')
iss = pd.read_pickle('../data/mined/all/cass_issues.pkl')
iscs = pd.read_pickle('../data/mined/all/cass_issue_comments.pkl')
commits = pd.read_csv('../data/mined/all/commits.csv')

#### Data Transformation

In [None]:
# Normalise every timestamp column to timezone-aware UTC.
# utc=True converts aware values and localises naive ones, so it also works
# for columns that are naive or entirely missing, on which a bare
# .dt.tz_convert('UTC') raises TypeError.
for c in ['updated_at', 'created_at']:
    for frame in (iss, prs, iscs, prcs):
        frame[c] = pd.to_datetime(frame[c], utc=True)
for c in ['closed_at']:
    # Only issues and pull requests carry a closing timestamp.
    for frame in (iss, prs):
        frame[c] = pd.to_datetime(frame[c], utc=True)

# Only pull requests can be merged.
c = 'merged_at'
prs[c] = pd.to_datetime(prs[c], utc=True)

In [None]:
def expandUserInfo(df):
    """Flatten the nested ``user`` record into scalar columns.

    Adds ``user_type``, ``user_login`` and ``user_id`` to *df* in place, each
    read from the matching key of the dict stored in ``df['user']``.  Rows
    whose ``user`` value is not a dict (for example ``None`` or NaN) get a
    missing value instead of raising ``TypeError``.

    Args:
        df: DataFrame with a ``user`` column holding dicts that contain the
            keys ``type``, ``login`` and ``id``.

    Returns:
        The same DataFrame object, with the three columns added.

    Raises:
        KeyError: If a ``user`` dict lacks one of the three keys.
    """
    users = df['user']
    for field in ('type', 'login', 'id'):
        # ``key=field`` binds the current field at definition time.
        df['user_' + field] = users.apply(
            lambda u, key=field: u[key] if isinstance(u, dict) else None
        )
    return df
iss = expandUserInfo(iss)
iscs = expandUserInfo(iscs)
prs = expandUserInfo(prs)
prcs = expandUserInfo(prcs)

In [None]:
def expandReactions(df):
    """Replace the nested ``reactions`` record with one column per reaction.

    Each entry of ``df['reactions']`` is expected to be a dict of reaction
    counts.  The counts listed in ``rc`` become columns named
    ``react_<key>``; any other key in the dict is discarded.  Entries that
    are not dicts (for example NaN) yield missing values in every reaction
    column.

    The frame is built directly from the list of records rather than with
    ``Series.apply(pd.Series)``, which creates one Series per row and is far
    slower on large frames.

    Args:
        df: DataFrame with a ``reactions`` column.  It is not modified.

    Returns:
        A new DataFrame with ``reactions`` removed and the ``react_*``
        columns appended, on the same index as *df*.
    """
    rc = ['total_count', '+1', '-1', 'laugh', 'hooray', 'confused', 'heart', 'rocket', 'eyes']
    records = [r if isinstance(r, dict) else {} for r in df['reactions']]
    counts = pd.DataFrame(records, index=df.index, columns=rc).add_prefix('react_')
    return pd.concat([df.drop(columns=['reactions']), counts], axis=1)
iss = expandReactions(iss)
iscs = expandReactions(iscs)
# NO reaction # prs = expandReactions(prs)
prcs = expandReactions(prcs)

#### Labels

In [None]:
iss['label_counts']=iss.labels.apply(len)
prs['label_counts']=prs.labels.apply(len)

In [None]:
iss['label_names']=iss.labels.apply(lambda ls: [l['name'] for l in ls])
prs['label_names']=prs.labels.apply(lambda ls: [l['name'] for l in ls])

In [None]:
prs.drop(columns=['labels'],inplace=True)
iss.drop(columns=['labels'],inplace=True)


## Non Coding

In [None]:
import re

# Keywords that mark an issue as not being about code.
_NON_CODING_KEYWORDS = (
    "documentation", "guide", "manual", "user interface", "UX", "design", "layout",
    "styling", "accessibility", "translation", "legal", "licensing", "planning",
    "roadmap", "meeting", "discussion", "question",
)

# One whole-word, case-insensitive pattern for all keywords, compiled once.
# Matching case-insensitively (instead of lower-casing the text only) is what
# lets the upper-case keyword "UX" match at all.
_NON_CODING_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(k) for k in _NON_CODING_KEYWORDS) + r')\b',
    re.IGNORECASE,
)


def is_non_coding_issue(title, description, labels):
    """Tell whether an issue mentions any non-coding keyword.

    The title, the description and every label name are searched for the
    keywords as whole words, ignoring case.

    Args:
        title: Issue title; ignored when it is not a string.
        description: Issue body; ignored when it is not a string
            (for example ``None`` or NaN).
        labels: Iterable of label names; non-string entries are ignored.

    Returns:
        ``True`` if at least one keyword occurs, ``False`` otherwise.
    """
    parts = [title, description, *labels]
    combined_text = ' '.join(part for part in parts if isinstance(part, str))
    return _NON_CODING_PATTERN.search(combined_text) is not None

In [None]:
iss['is_noncoding']=iss.apply(lambda r: is_non_coding_issue(r['title'], r['body'], r['label_names']), axis =1)

In [None]:
iss[iss['is_noncoding']][['title','label_names','project','milestone','author_association', 'active_lock_reason',
       'draft', 'pull_request', 'body']]

## is BugFixing?

In [None]:
prs['is_bug'] = prs.label_names.apply(lambda ls : "bug" in ' '.join(ls).lower())

In [None]:
iss['is_bug'] = iss.label_names.apply(lambda ls : "bug" in ' '.join(ls).lower())

In [None]:
prs.is_bug.value_counts(1),iss.is_bug.value_counts(1)

In [None]:
iss['tmp-lbl'] = iss.label_names.apply(lambda ls:'; '.join(ls))

In [None]:
prs['tmp-lbl'] = prs.label_names.apply(lambda ls:'; '.join(ls))

## Newcomer Support

In [None]:
iss['is_newcomer_suport']=(iss['tmp-lbl'].str.contains('good', na=False, case=False) | iss['tmp-lbl'].str.contains('help', na=False, case=False))


In [None]:
iss[iss['is_newcomer_suport']][['title','label_names','project']]

## Duplicate / Deduplication

- Label substrings used as signals
    - deduplication -> `duplicate`
    - newcomer support -> `good`, `help` (e.g. "help wanted")



In [None]:
iss['is_dup_labeled']=iss['tmp-lbl'].str.contains("duplicate", na=False, case=False)
iss['is_release']=iss['tmp-lbl'].str.contains("release", na=False, case=False)

iss['is_dup_labeled'].value_counts(), iss['is_release'].value_counts()

Now look at the issue comments (`iscs`) instead of the issues (`iss`).

In [None]:
# GitHub's own duplicate marker is written "Duplicate of #N" (capital D), so
# match case-insensitively; na=False turns comments without a body into False
# instead of NaN, keeping the column strictly boolean.
iscs['is_dup_discussed']=iscs.body.str.contains("duplicate of", case=False, na=False)
iscs['is_dup_discussed'].value_counts()

Let's see whether the PR comments (`prcs`) contain a similar number of duplicate discussions:

In [None]:
prs['tmp-lbl'].str.contains("duplicate", na=False, case=False).sum()

In [None]:
# na=False keeps the mask boolean (a NaN body would make the indexing raise);
# case=False also catches GitHub's "Duplicate of #N" spelling.
prcs[prcs.body.str.contains("duplicate of", case=False, na=False)][['body','project']]#.tolist()

## Comment features

In [None]:
import textstat
def readability_score(s):
    """Return the Flesch reading-ease score of *s*.

    Args:
        s: Text to score.  Anything that is not a string (``None``, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The value of ``textstat.flesch_reading_ease(s)`` for strings,
        ``None`` for everything else.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses.
    if not isinstance(s, str):
        return None
    return textstat.flesch_reading_ease(s)


In [None]:
iss['readability']=iss.body.apply(readability_score)
iscs['readability']=iscs.body.apply(readability_score)
prs['readability']=prs.body.apply(readability_score)
prcs['readability']=prcs.body.apply(readability_score)

iss['title_readability']=iss.title.apply(readability_score)
prs['title_readability']=prs.title.apply(readability_score)

In [None]:
iscs['readability'].value_counts(1)

In [None]:
import sys
from binpickle import load

sys.path.append('../models')

from models.SentiCR import  SentiCR
senti_cr=SentiCR.SentiCR()

from models.ToxiCR import ToxiCR

sys.path.append('../models/holdon')
um = load('../models/holdon/CRCusefulness.model')

def crc_fts(data,col='comment'):
    """Score every comment in ``data[col]`` with the three comment models.

    Adds three columns to *data* in place, in this order:

    * ``cr_senti``: first element of
      ``senti_cr.get_sentiment_polarity(text)``, computed row by row;
    * ``is_toxic``: flattened output of the ToxiCR classifier's
      ``get_toxicity_probability`` for all comments at once;
    * ``is_useful``: flattened output of ``um.predict`` for all comments
      at once.

    A new ToxiCR classifier is constructed and initialised on every call.

    Args:
        data: DataFrame holding the comments.
        col: Name of the column with the comment text.

    Returns:
        The same DataFrame, with the three columns added.
    """
    def _polarity(text):
        return senti_cr.get_sentiment_polarity(text)[0]

    data['cr_senti'] = data[col].apply(_polarity)

    toxicity_model = ToxiCR.ToxiCR(
        ALGO="RF",
        count_profanity=True,
        remove_keywords=True,
        split_identifier=True,
        embedding="tfidf",
        load_pretrained=True,
    )
    toxicity_model.init_predictor()
    toxicity = toxicity_model.get_toxicity_probability(data[col].tolist())
    data['is_toxic'] = toxicity.ravel()

    usefulness = um.predict(data[col].tolist())
    data['is_useful'] = usefulness.ravel()

    return data

In [None]:
prcs=crc_fts(prcs,'body')
iscs=crc_fts(iscs,'body')

### comment quality

In [None]:
iscs[['project','user_type','author_association','readability', 'cr_senti', 'is_toxic', 'is_useful']].groupby(['project','user_type','author_association']).mean().sort_values(by=['project','is_useful'],ascending=False)

In [None]:
prcs[['project','user_type','author_association','readability', 'cr_senti', 'is_toxic', 'is_useful']].groupby(['project','user_type','author_association']).mean().sort_values(by=['project','is_useful'],ascending=False)

In [None]:
prcs[['pull_request_review_id', 'id', 'start_line', 'original_start_line',
       'line', 'original_line', 'original_position', 'position',
       'in_reply_to_id', 'user_id', 'react_total_count', 'react_+1',
       'react_-1', 'react_laugh', 'react_hooray', 'react_confused',
       'react_heart', 'react_rocket', 'react_eyes', 'readability', 'cr_senti',
       'is_toxic', 'is_useful']].corr()['is_toxic'].sort_values()

# BOT users/ content

In [None]:
# ``'bot' in u`` on the user dict tests its *keys*, so it was always False;
# look for the substring in the login instead.  Both checks are literal
# (regex=False, so "[bot]" is not read as a character class) and treat
# missing values as "not a bot".
bot_u = iscs.user_login.str.contains('bot', regex=False, na=False)
bot_cont = iscs.body.str.contains('[bot]', regex=False, na=False)

In [None]:
bot_type = iscs.user_type=='Bot'

In [None]:
bot_u.mean(),bot_cont.mean(), bot_type.mean()

In [None]:
iscs[bot_cont][['user_type','user_login','body']].body.tolist()

In [None]:
iss[iss.user_type=='Bot']

Temporal Features

In [None]:
def get_temporal_features(idf, icdf, idfK='number',icdfK='issue_num'):
    """Add closure time and first-comment delays to issues or pull requests.

    Comments are matched to their parent on ``project`` plus the thread
    number (``idf[idfK]`` == ``icdf[icdfK]``).  For every parent row the
    delay between its ``created_at`` and the earliest matching comment is
    stored in three columns:

    * ``1st_comment``: earliest comment by anyone;
    * ``1st_response``: earliest comment whose ``user`` differs from the
      parent's ``user``;
    * ``1st_self_comment``: earliest comment whose ``user`` equals the
      parent's ``user``.

    Parents without a matching comment get ``NaT``.

    The per-thread results are joined back on ``(project, number)``.
    Assigning a Series indexed by thread number straight into the frame
    would align it with the frame's row labels instead, putting delays on
    the wrong rows or dropping them.

    Args:
        idf: Issues or pull requests; needs the columns ``project``,
            ``user``, ``created_at``, ``closed_at`` and *idfK*.
            Modified in place.
        icdf: Comments; needs the columns ``project``, ``user``,
            ``created_at`` and *icdfK*.  ``icdf[icdfK]`` is converted to
            ``int`` in place.
        idfK: Column of *idf* holding the thread number.
        icdfK: Column of *icdf* holding the thread number.

    Returns:
        The tuple ``(idf, icdf)``.
    """
    idf['closure_duration'] =idf.closed_at - idf.created_at
    icdf[icdfK] = icdf[icdfK].apply(int)

    opened = idf[['project', idfK, 'user', 'created_at']].rename(
        columns={idfK: '_key', 'user': '_author', 'created_at': '_opened_at'})
    posted = icdf[['project', icdfK, 'user', 'created_at']].rename(
        columns={icdfK: '_key', 'user': '_commenter', 'created_at': '_posted_at'})

    # One row per (parent, comment) pair; parents without comments drop out
    # here and end up as NaT when the results are joined back below.
    pairs = opened.merge(posted, on=['project', '_key'], how='inner')
    pairs['_delay'] = pairs['_posted_at'] - pairs['_opened_at']
    by_author = pairs['_author'] == pairs['_commenter']

    subsets = {
        '1st_comment': pairs,
        '1st_response': pairs[~by_author],
        '1st_self_comment': pairs[by_author],
    }
    row_keys = opened[['project', '_key']]
    for column, subset in subsets.items():
        firsts = subset.groupby(['project', '_key'], as_index=False)['_delay'].min()
        # A left join keeps the row order of idf, so the values can be
        # written back positionally whatever idf's index looks like.
        aligned = row_keys.merge(firsts, on=['project', '_key'], how='left')
        idf[column] = pd.to_timedelta(aligned['_delay']).to_numpy()

    return idf, icdf

In [None]:

iss, iscs = get_temporal_features(iss,iscs)

In [None]:
prcs['pr_num']= prcs.pull_request_url.apply(lambda s: int(s.split('/')[-1]))

In [None]:
prs, prcs = get_temporal_features(prs,prcs,icdfK='pr_num')

In [None]:
def get_agg_comment_features(idf, icdf, idfK='number',icdfK='issue_num'):
    """Summarise the comments of every thread and attach them to *idf*.

    Comments are grouped by ``project`` and ``icdf[icdfK]``.  Per group the
    distinct user types and author associations are collected, distinct
    user ids are counted, reaction counts (and ``is_dup_discussed``) are
    summed, ``readability`` and ``cr_senti`` take the median, and
    ``is_toxic`` and ``is_useful`` the mean.  ``is_dup_discussed`` is left
    out whenever *icdfK* is not ``'issue_num'``.

    Args:
        idf: Issues or pull requests; ``closure_duration`` is (re)computed
            on it in place.
        icdf: Comments; ``icdf[icdfK]`` is converted to ``int`` in place.
        idfK: Column of *idf* holding the thread number.
        icdfK: Column of *icdf* holding the thread number.

    Returns:
        ``(merged, icdf)``, where *merged* is a new frame: *idf*
        left-joined with the per-thread summary.  Columns of the summary
        that clash with *idf* get the suffix ``_from_cmt``.
    """
    idf['closure_duration'] = idf.closed_at - idf.created_at
    icdf[icdfK] = icdf[icdfK].apply(int)

    group_keys = ['project', icdfK]

    # Insertion order here is the column order of the summary.
    spec = {
        'user_type': lambda values: values.unique(),
        'user_id': lambda values: values.nunique(),
        'author_association': lambda values: values.unique(),
    }
    spec.update(dict.fromkeys(
        ['react_+1', 'react_-1', 'react_laugh', 'react_hooray', 'react_confused',
         'react_heart', 'react_rocket', 'react_eyes', 'react_total_count',
         'is_dup_discussed'],
        'sum',
    ))
    spec.update({
        'readability': 'median',
        'cr_senti': 'median',
        'is_toxic': 'mean',
        'is_useful': 'mean',
    })

    if icdfK != 'issue_num':
        # Pull-request comments carry no duplicate-discussion flag.
        del spec['is_dup_discussed']

    selected = icdf[group_keys + list(spec)]
    summary = selected.groupby(group_keys).agg(spec).reset_index()
    summary = summary.rename(columns={
        'user_type': 'u_types',
        'user_id': 'n_uids',
        'author_association': 'u_associations',
    })
    summary['n_utypes'] = summary['u_types'].apply(len)
    summary['n_uassociations'] = summary['u_associations'].apply(len)

    merged = idf.merge(
        summary,
        how='left',
        left_on=['project', idfK],
        right_on=group_keys,
        suffixes=('', '_from_cmt'),
    )
    return merged, icdf

In [None]:
iss, iscs = get_agg_comment_features(iss,iscs)

In [None]:
iss

In [None]:
prs, prcs= get_agg_comment_features(prs,prcs,icdfK='pr_num')

 # New labels

In [None]:
prs.label_names = prs.label_names.apply(lambda x: set(sorted(x)))
iss.label_names = iss.label_names.apply(lambda x: set(sorted(x)))


In [None]:
# Labels seen so far, per project.  is_new_type() grows these sets on every
# call, so rebind the name to a fresh registry before starting a new pass.
lable_types = {f'P{i}': set() for i in range(1, 11)}


def is_new_type(p, type_set):
    """Return the labels of *type_set* not seen before for project *p*.

    The labels are then recorded in ``lable_types[p]``, so the result
    depends on the order of the calls.  A project missing from the registry
    is added on first use instead of raising ``KeyError``.

    NOTE(review): if "new" is meant chronologically, the frame has to be
    sorted by ``created_at`` before this is applied row by row.

    Args:
        p: Project identifier, used as key into ``lable_types``.
        type_set: Iterable of label names.

    Returns:
        The set of labels seen for the first time in this call.
    """
    seen = lable_types.setdefault(p, set())
    new = set(type_set) - seen
    seen |= new
    return new


In [None]:
# prs

In [None]:

prs['new_label'] = prs.apply(lambda r: is_new_type(r['project'],r['label_names']),axis=1)

In [None]:
# Start again from an empty registry, so that the labels recorded by any
# earlier is_new_type() calls do not affect the issue pass.
lable_types = dict((f'P{idx}', set()) for idx in range(1, 11))

# Walk the issues in frame order and keep, for every issue, the labels that
# appear for the first time within its project.
iss['new_label'] = iss.apply(
    lambda row: is_new_type(row['project'], row['label_names']),
    axis=1,
)

In [None]:
prs['new_label'].apply(len).describe()

In [None]:
iss['new_label'].apply(len).describe()

In [None]:
iss[iss.new_label!=set()].sort_values(by='created_at')

In [None]:
prs[prs.new_label!=set()].sort_values(by=['project','created_at'])

# Commits

In [None]:
# Parse the committer and author dates as timezone-aware UTC timestamps
# (utc=True already yields UTC, so no further conversion is needed).
for target, source in (('created_at', 'committer_date'), ('authored_at', 'author_date')):
    commits[target] = pd.to_datetime(commits[source], utc=True)




# More features
<!-- 1.1.1.2 -->

In [None]:
prs.loc[prs.milestone.notna(),'milestone_id'] = prs[prs.milestone.notna()].milestone.apply(lambda x: int(x['id']))


In [None]:
prs['merge_duration'] = prs.merged_at - prs.created_at

In [None]:
iss.loc[iss.milestone.notna(),'milestone_id'] = iss[iss.milestone.notna()].milestone.apply(lambda x: int(x['id']))


In [None]:
iss['is_deduplicated']=(iss.is_dup_discussed.apply(lambda n: n>0) | iss.is_dup_labeled)

In [None]:

# E-mail domain of the author ('' when the address is missing).
commits['author_affiliation'] =commits.author_email.apply(lambda s: s.split('@')[-1] if isinstance(s, str) else '')
# Number of whitespace-separated entries in `parents`.  An empty field is read
# from the CSV as NaN, so a commit without parents counts as 0 instead of
# raising AttributeError.
commits['num_parents']=commits.parents.apply(lambda x: len(x.split()) if isinstance(x, str) else 0)


## Profiles

In [None]:
profiles = pd.read_csv('../data/profiles_loc.csv')

In [None]:

def addGenderLocation(pdf):
    """Attach ``country`` and ``gender`` from the global ``profiles`` frame.

    Rows are matched on ``pdf['user_login']`` == ``profiles['login']`` with
    a left join.  The row count is printed before and after the join; the
    two numbers differ when a login occurs more than once in ``profiles``.

    Args:
        pdf: DataFrame with a ``user_login`` column.  It is not modified.

    Returns:
        A new DataFrame with the profile columns added and the helper
        ``login`` column removed.
    """
    print(len(pdf))
    lookup = profiles[['login','country', 'gender']]
    enriched = pdf.merge(
        lookup,
        how='left',
        left_on='user_login',
        right_on='login',
        suffixes=('_action', '_profile'),
    )
    print(len(enriched))
    return enriched.drop(columns=['login'])

In [None]:
iss = addGenderLocation( iss )
iscs = addGenderLocation( iscs )
prs = addGenderLocation( prs )
prcs = addGenderLocation( prcs )

In [None]:
import numpy as np
from collections import Counter

def shannon_diversity_index(data):
    """Compute the Shannon diversity index of the categories in *data*.

    With p_i the share of category i among all items, the index is
    ``-sum(p_i * ln(p_i))``.

    Args:
        data: Sized iterable of hashable category values.

    Returns:
        The index as a NumPy float; zero when only one category occurs.
    """
    frequencies = np.fromiter(Counter(data).values(), dtype=float)
    shares = frequencies / len(data)
    return -(shares * np.log(shares)).sum()

In [None]:
iss.to_csv('../data/interim/fts/iss.csv')
iscs.to_csv('../data/interim/fts/iscs.csv')
prs.to_csv('../data/interim/fts/prs.csv')
prcs.to_csv('../data/interim/fts/prcs.csv')
commits.to_csv('../data/interim/fts/commits.csv')

Thank you!