# Data Analysis Part Andrea

Linking number of edit events since last successful build to probability of build succeeding

In [1]:
import pandas as pd
import numpy as np

# Import cleaned Data


In [23]:
# Load the pre-cleaned event tables; both CSV files must already exist under data/.
editEvents = pd.read_csv("data/editEvents.csv")
buildEvents = pd.read_csv("data/buildEvents.csv")
# Otherwise, generate the CSV files first (TODO: name the cleaning script; the original note left it blank).
def parse_timestamp(time):
    """Convert a raw timestamp string into a ``pd.Timestamp`` with second precision.

    Everything from the first ``.`` onward (the fractional seconds) is cut off
    before parsing. Any value that is not exactly of type ``str`` (for example a
    missing value) yields ``None``.
    """
    if type(time) is not str:
        return None
    whole_seconds, _, _ = time.partition('.')
    return pd.Timestamp(whole_seconds)
# Replace the raw timestamp column of both tables with the values returned by parse_timestamp.
buildEvents['timestamp']=buildEvents['timestamp'].apply(parse_timestamp)
editEvents['timestamp']=editEvents['timestamp'].apply(parse_timestamp)

### Edit Events

In [24]:
# Report the number of rows, then preview the first rows of the edit-event table.
print(f'No of edit events: {len(editEvents)}')
editEvents.head()

No of edit events: 497459


Unnamed: 0,sessionID,timestamp
0,034dd66a-7d1b-43eb-a245-82f5d1cdf6ca,2016-07-08 16:23:50
1,034dd66a-7d1b-43eb-a245-82f5d1cdf6ca,2016-07-08 16:24:22
2,034dd66a-7d1b-43eb-a245-82f5d1cdf6ca,2016-07-08 16:24:24
3,8d22b7b4-58c5-4101-b754-d3fa842ba62d,2016-07-08 17:48:12
4,8d22b7b4-58c5-4101-b754-d3fa842ba62d,2016-07-08 17:52:09


In [25]:
# Summary statistics (count, unique, top, freq, first, last) for the edit events.
editEvents.describe()

Unnamed: 0,sessionID,timestamp
count,497459,497459
unique,2874,488049
top,8d0ea603-57cd-4b1f-b3cf-ce39ec9203c7,2016-09-04 23:28:12
freq,17006,4
first,,2016-03-20 12:32:52
last,,2017-02-23 22:03:41


### Build Events

In [26]:
# Report the number of rows, then preview the first rows of the build-event table.
print(f'No of build events: {len(buildEvents)}')
buildEvents.head()

No of build events: 14957


Unnamed: 0,sessionID,timestamp,buildSuccessful
0,8d22b7b4-58c5-4101-b754-d3fa842ba62d,2016-07-08 18:02:25,True
1,8d22b7b4-58c5-4101-b754-d3fa842ba62d,2016-07-08 18:02:47,True
2,ac705dcb-0848-4448-847d-70f0f8f0315a,2016-07-08 18:17:57,True
3,f23af4c6-66b5-4d9f-b6e0-83de319b70c5,2016-07-08 20:06:38,True
4,f23af4c6-66b5-4d9f-b6e0-83de319b70c5,2016-07-08 20:08:21,True


In [27]:
# Summary statistics for the build events; a count below the row total indicates missing values in that column.
buildEvents.describe()

Unnamed: 0,sessionID,timestamp,buildSuccessful
count,14957,14956,14956
unique,1529,14877,2
top,26f3dadd-c2e3-414b-8c92-78df1dc356c8,2016-04-21 17:41:37,True
freq,154,2,12889
first,,2016-03-20 12:33:59,
last,,2017-02-23 22:03:36,


# Process Data

1. Step: For each sessionID:
    - get the timestamp of the last successful build
    - get the timestamp of the next build after it
2. Step: For each sessionID:
    - count the number of edits between the two timestamps from Step 1
    - if no build event occurs after the last successful build, don't count it
3. Step: Create a new DataFrame with the following variables:
    - sessionID, timestampSuccessBuild, timestampNextBuild, #editsUntilNextBuild and nextBuildResult

In [84]:
def get_all_build(sessionID):
    """Return every build event of one session, ordered by timestamp.

    Reads the notebook-level ``buildEvents`` frame and returns the matching rows
    as a NumPy array (``.values``), one row per build event.
    """
    in_session = buildEvents["sessionID"] == sessionID
    session_builds = buildEvents.loc[in_session]
    ordered = session_builds.sort_values(by="timestamp")
    return ordered.values


In [85]:
def get_successful_build(sessionID):
    """Return the successful build events of one session, ordered by timestamp.

    A row qualifies when its ``sessionID`` matches and its ``buildSuccessful``
    value compares equal to ``True``. The rows come from the notebook-level
    ``buildEvents`` frame and are returned as a NumPy array (``.values``).
    """
    same_session = buildEvents["sessionID"].eq(sessionID)
    # Explicit comparison with True (rather than using the column as a mask)
    # so that non-boolean entries such as missing values count as "not successful".
    succeeded = buildEvents["buildSuccessful"].eq(True)
    return buildEvents[same_session & succeeded].sort_values(by="timestamp").values


In [86]:
def get_nr_edits_between_build(sessionID,timeLastSuccessfullBuild, timeNextBuild):
    """Count the edit events of one session that fall between two build timestamps.

    Parameters
    ----------
    sessionID : session whose rows are selected from the notebook-level ``editEvents`` frame.
    timeLastSuccessfullBuild : lower bound of the window (inclusive).
    timeNextBuild : upper bound of the window (inclusive).

    Returns
    -------
    int
        Number of edit events with ``timeLastSuccessfullBuild <= timestamp <= timeNextBuild``.
    """
    # Only a count is needed, so the edits are not sorted first.
    session_times = editEvents.loc[editEvents["sessionID"] == sessionID, "timestamp"]
    # NOTE(review): both bounds are inclusive, so an edit stamped exactly on a
    # build timestamp is counted in the window ending there and in the window
    # starting there - confirm this is intended.
    in_window = (session_times >= timeLastSuccessfullBuild) & (session_times <= timeNextBuild)
    return int(in_window.sum())


In [107]:
def edits_from_pass_to_next_build(sessionID):
    """Pair every successful build of a session with the next build that follows it.

    For each successful build that is followed by a later build in the same
    session, one observation is produced:
    ``[sessionID, timestampSuccessBuild, timestampNextBuild, nrOfEdits, nextBuildResult]``.
    Successful builds with no later build are skipped.

    Build rows are read positionally as ``[sessionID, timestamp, buildSuccessful]``,
    the column order shown by ``buildEvents.head()``.

    Parameters
    ----------
    sessionID : session to analyse.

    Returns
    -------
    list of list
        One observation per qualifying successful build, in timestamp order;
        empty when the session has none.
    """
    # Builds with a missing timestamp are dropped. sort_values places them
    # last, and any comparison against a missing timestamp is False, so
    # leaving them in would discard every observation of the session.
    builds = [row for row in get_all_build(sessionID) if pd.notna(row[1])]
    successes = [row for row in get_successful_build(sessionID) if pd.notna(row[1])]

    result = []
    for success in successes:
        timeLastSuccessfulBuild = success[1]
        # builds are ordered by timestamp, so the first strictly later one is the next build.
        nextBuild = next((row for row in builds if row[1] > timeLastSuccessfulBuild), None)
        if nextBuild is None:
            continue  # no build follows this successful build
        timeNextBuild = nextBuild[1]
        nrOfEdit = get_nr_edits_between_build(sessionID, timeLastSuccessfulBuild, timeNextBuild)
        result.append([nextBuild[0], timeLastSuccessfulBuild, timeNextBuild, nrOfEdit, nextBuild[2]])

    return result

In [108]:
# Spot check: take the value at row position 6, column position 0 of buildEvents as the session ID
# and show that session's observation at index 11.
edits_from_pass_to_next_build(buildEvents.iloc[6,0])[11]

['9729b737-8fb0-461e-8694-390548199f57',
 Timestamp('2016-07-08 20:26:11'),
 Timestamp('2016-07-08 20:26:13'),
 1,
 True]

### Iterate over all sessions and create DataFrame

In [115]:
final_result = []
allSessionID = buildEvents.sessionID.unique()
# NOTE(review): the [1:100] slice skips the first session ID and stops after
# 99 sessions - presumably a development subset; confirm before the final run.
for sessionID in allSessionID[1:100]:
    # Compute the observations once per session; extend() adds nothing when
    # the session yields no observation.
    final_result.extend(edits_from_pass_to_next_build(sessionID))
# create DataFrame
column_labels = ['sessionID','timestampSuccessBuild','timestampNextBuild', '#editsUntilNextBuild', 'nextBuildResult']
df = pd.DataFrame(final_result, columns=column_labels)
        

In [116]:
# Display the table of observations built in the previous cell.
df

Unnamed: 0,sessionID,timestampSuccessBuild,timestampNextBuild,#editsUntilNextBuild,nextBuildResult
0,f23af4c6-66b5-4d9f-b6e0-83de319b70c5,2016-07-08 20:06:38,2016-07-08 20:08:21,4,True
1,f23af4c6-66b5-4d9f-b6e0-83de319b70c5,2016-07-08 20:08:21,2016-07-08 20:12:21,3,True
2,9729b737-8fb0-461e-8694-390548199f57,2016-07-08 20:17:56,2016-07-08 20:18:46,6,True
3,9729b737-8fb0-461e-8694-390548199f57,2016-07-08 20:18:46,2016-07-08 20:19:49,2,True
4,9729b737-8fb0-461e-8694-390548199f57,2016-07-08 20:19:49,2016-07-08 20:20:23,3,True
...,...,...,...,...,...
329,99cfee45-afb6-4390-b3ba-80a478a181fd,2016-12-16 12:57:19,2016-12-16 12:57:31,6,True
330,99cfee45-afb6-4390-b3ba-80a478a181fd,2016-12-16 12:57:31,2016-12-16 13:27:05,156,True
331,6666e317-246e-430d-adcb-6af5ff8b57b9,2016-12-16 15:51:35,2016-12-16 15:51:52,7,False
332,d9fec444-23ca-43cf-94c8-d91e2669888a,2016-12-17 09:38:29,2016-12-17 09:39:28,8,True


## Build a Model

In [None]:
#TODO: probability of successful build given number of edits after last successful build