In [256]:
from os import listdir
from os.path import isfile, join 
import numpy as np
import pandas as pd 
import datetime as dt

# Directory that holds the raw and cleaned CSV files read by the cells below.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
path="C:/Users/Luis Moros/Documents/GitHub/MAN6915/FinalProject"

In [257]:
def obtainRawFileNames(path,ext):
    """
    List the regular files directly inside ``path`` that have extension ``ext``.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).
    ext : str
        Extension with or without the leading dot ("csv", ".csv", "tar.gz").

    Returns
    -------
    list of str
        One "<path>/<file name>" entry per matching file, in ``listdir`` order.
    """
    # Only a *leading* dot means the caller already supplied the separator.
    # Testing for a dot anywhere would leave "tar.gz" without its leading dot
    # and make it match names such as "notar.gz".
    extension = ext if ext.startswith(".") else ".{}".format(ext)
    return ["{}/{}".format(path,f)
            for f in listdir(path)
            if isfile(join(path, f))
            and f.endswith(extension)]

In [258]:
def normalizedFields(line, separator):
    """
    Re-shape one comma-separated line into exactly four fields.

    Every '|' inside a field is replaced by a space. The first field and the
    last two fields are kept as they are; everything in between is merged into
    a single space-joined field. The four resulting fields are joined with
    ``separator``. Lines with fewer than four comma-separated fields yield "".
    """
    cells = []
    for rawCell in line.split(','):
        cells.append(rawCell.replace('|', ' '))

    if len(cells) < 4:
        return ""

    first = cells[0]
    middle = " ".join(cells[1:-2])
    return separator.join([first, middle, cells[-2], cells[-1]])

In [259]:
def filterEmptyLines(lines):
    """Lazily yield the items of ``lines`` that are not the empty string."""
    def isNotEmpty(candidate):
        return candidate != ""

    return filter(isNotEmpty, lines)

In [260]:
def replaceComasByPipes(path):
    """
    Write a pipe-separated copy of the comma-separated file at ``path``.

    Each line goes through ``normalizedFields(line, '|')``; lines that come
    back empty are dropped. The result is written next to the input as
    "<name>_cleaned<extension>", using Latin-1 for both reading and writing.

    Parameters
    ----------
    path : str
        Path of the comma-separated input file.
    """
    # Local import: this is the only place in the notebook that needs it.
    from os.path import splitext

    with open(path, 'r', encoding="Latin-1") as file :
        filedata = file.read()

    filelines = filterEmptyLines(
                    (normalizedFields(line, '|')
                     for line in filedata.split('\n')))

    # Insert the suffix before the file extension only. A plain
    # str.replace('.', ...) also rewrites dots in directory names (or a
    # leading "./") and, for a path without any dot, would overwrite the input.
    root, extension = splitext(path)

    # Write the cleaned file next to the original
    with open("{}_cleaned{}".format(root, extension), 'w', encoding="Latin-1") as file:
        file.write("\n".join(filelines))

In [261]:
#print("\n".join(obtainRawFileNames(
#                path,
#                "csv"
#        )))

In [262]:
#for file in obtainRawFileNames(path, "csv"):
#    replaceComasByPipes(file)       

In [263]:
# Load the cleaned, pipe-separated resolution file.
durationDf = pd.read_csv(
    "{}/{}".format(path, "resolution_cleaned.csv"),
    sep='|',
)

In [264]:
def calculateDuration(x, censorTimestamp=None):
    """
    Return the time span covered by the timestamps in ``x``.

    Parameters
    ----------
    x : sequence of numbers
        Timestamps in epoch seconds (e.g. the "when" values of one group).
    censorTimestamp : number, optional
        Epoch-seconds cutoff used when ``x`` holds a single timestamp.
        Defaults to 2013-01-01 00:00:00, the value that used to be hard-coded.

    Returns
    -------
    number
        ``max(x) - min(x)``; for a single timestamp, the distance from that
        timestamp to ``censorTimestamp``.
    """
    if len(x) ==1:
        if censorTimestamp is None:
            # Arbitrary end of the observation window, as epoch seconds.
            censorTimestamp = (dt.datetime(2013,1,1) - dt.datetime(1970,1,1)).total_seconds()
        return censorTimestamp - min(x)
    return max(x) - min(x)

# Make sure the "when" values are numeric before aggregating them.
durationDf["when"] = durationDf["when"].apply(pd.to_numeric)

# One row per id, holding the duration computed by calculateDuration.
durationDfFinal = (
    durationDf.groupby("id")["when"]
    .apply(calculateDuration)
    .to_frame('duration')
)

In [265]:
# Load the cleaned cc file; na_filter=False disables pandas' missing-value
# detection, so empty fields are read as empty strings rather than NaN.
ccDf = pd.read_csv(
    "{}/{}".format(path, "cc_cleaned.csv"),
    sep='|',
    na_filter=False,
)

In [266]:
def calculateTotalCcs(x):
    """
    Count the ';'-separated entries across all values of ``x``.

    Parameters
    ----------
    x : pandas.Series of str
        Values that may each hold several entries separated by ';'.

    Returns
    -------
    int
        Total number of non-blank entries; missing values contribute nothing.
    """
    # Series.count() on the split result only counts rows, which made the
    # split pointless. Count the individual entries instead.
    entryLists = x.dropna().str.split(';')
    return int(sum(
        len([entry for entry in entries if entry.strip()])
        for entries in entryLists
    ))

# Ignore rows whose "what" value is blank, then aggregate per id.
ccDfFinal = (
    ccDf.loc[ccDf["what"].str.strip() != '']
    .groupby("id")["what"]
    .apply(calculateTotalCcs)
    .to_frame('totalCcs')
)

In [None]:
# Load the cleaned, pipe-separated reports file.
reportDf = pd.read_csv(
    "{}/{}".format(path, "reports_cleaned.csv"),
    sep='|',
)

In [None]:
def calculateOpenMonth(column):
    """
    Map epoch-second timestamps to their calendar month number.

    ``column`` is converted with ``pd.to_datetime(..., unit='s')`` and every
    resulting timestamp is replaced by its ``month`` attribute.
    """
    def monthOf(timestamp):
        return timestamp.month

    return pd.to_datetime(column, unit='s').map(monthOf)
    
reportDf["opening"] = reportDf["opening"].apply(pd.to_numeric)

# calculateOpenMonth returns one value per input row. Running it through
# groupby("id").apply(...) therefore produces a result indexed by the original
# row labels (or by an (id, row label) pair, depending on the pandas version),
# never by id alone - which breaks the index joins that build finalDF.
# Reduce to a single opening timestamp per id first, then derive the month.
# NOTE(review): assumes one report row per id; with several, the first
# non-null opening is used.
openMonthDfFinal = calculateOpenMonth(
    reportDf.groupby("id")["opening"].first()
).to_frame('openMonth')

In [None]:
# Load the cleaned, pipe-separated severity file.
severityDf = pd.read_csv(
    "{}/{}".format(path, "severity_cleaned.csv"),
    sep='|',
)

In [None]:
# Keep the first "what" value found for each id as its severity.
severityDfFinal = (
    severityDf.groupby('id')['what']
    .first()
    .to_frame('severity')
)

In [None]:
# Load the cleaned resolution file again, this time to derive the outcome.
eventDf = pd.read_csv(
    "{}/{}".format(path, "resolution_cleaned.csv"),
    sep='|',
)

In [None]:
# Binary outcome per id: 1 when the last "what" value is 'FIXED', otherwise 0.
# NOTE(review): "last" follows the row order of the file - confirm the rows
# are ordered by time.
eventDfFinal = (
    eventDf.groupby('id')['what']
    .last()
    .apply(lambda resolution: int(resolution == 'FIXED'))
    .to_frame('event')
)

In [None]:
# Load the cleaned, pipe-separated assigned_to file.
assignedDf = pd.read_csv(
    "{}/{}".format(path, "assigned_to_cleaned.csv"),
    sep='|',
)

In [None]:
def obtainAssignee(x):
    """Join all "what" values of the frame ``x`` into one ';'-separated string."""
    return x['what'].str.cat(sep=';')
# Drop the rows whose "what" value is the literal string 'None'.
assignedDf = assignedDf.loc[assignedDf['what'] != 'None']

# One row per id with every remaining assignee value joined by ';'.
assignedDfFinal = (
    assignedDf.groupby('id')
    .apply(obtainAssignee)
    .to_frame('assignee')
)

In [None]:
def obtainReporter(x):
    """Join all "reporter" values of the frame ``x`` into one ';'-separated string."""
    return x['reporter'].str.cat(sep=';')
# The .str accessor used by obtainReporter needs string values.
reportDf['reporter'] = reportDf['reporter'].astype('str')

# One row per id with its reporter value(s) joined by ';'.
reporterDfFinal = (
    reportDf.groupby("id")
    .apply(obtainReporter)
    .to_frame('reporter')
)

In [None]:
# Assemble the modelling table on the shared index. The cc counts are
# left-joined, so ids without any cc row are kept; every other frame is
# inner-joined, so an id must be present in all of them.
finalDF = (
    durationDfFinal
    .join(ccDfFinal, how='left')
    .join(openMonthDfFinal, how='inner')
    .join(severityDfFinal, how='inner')
    .join(assignedDfFinal, how='inner')
    .join(reporterDfFinal, how='inner')
    .join(eventDfFinal, how='inner')
)

In [None]:
# Replace every remaining missing value (in all columns) with 0, in place.
finalDF.fillna(0, inplace=True)

In [None]:
# Store openMonth and severity as categorical columns.
finalDF['openMonth'] = finalDF['openMonth'].astype('category')
finalDF['severity'] = finalDF['severity'].astype('category')

In [None]:
def getTrainPlusTestSets(df):
    """
    Split ``df`` into train and test feature/label frames.

    The features are the columns severity, reporter, totalCcs, duration,
    openMonth and assignee; the label is the single column event. 30% of the
    rows go to the test set, the split is seeded with random_state=1 and is
    stratified on openMonth.

    Returns whatever ``train_test_split`` returns for (features, labels).
    """
    from sklearn.model_selection import train_test_split

    featureColumns = ['severity','reporter','totalCcs','duration','openMonth','assignee']
    features = df.loc[:, featureColumns]
    labels = df.loc[:, ['event']]
    return train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=1,
        stratify=features['openMonth'],
    )

In [None]:
# Split finalDF into train/test feature frames and train/test label frames.
x_train, x_test, y_train_final, y_test_final = getTrainPlusTestSets(finalDF)

In [None]:
def getProportion(df, feature, trainFeatures=None, trainLabels=None):
    """
    Attach a "<feature>SuccRate" column, estimated on the training data, to ``df``.

    For every distinct value of ``feature`` in the training features, the rate
    is (#training rows with event == 1) / (#training rows + 1). The rates are
    then merged onto ``df`` by matching ``df[feature]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; must contain the column ``feature``.
    feature : str
        Name of the column whose values get a success rate.
    trainFeatures : pandas.DataFrame, optional
        Training features containing ``feature``. Defaults to the notebook
        global ``x_train``.
    trainLabels : pandas.DataFrame, optional
        Training labels with an 'event' column, sharing the index of
        ``trainFeatures``. Defaults to the notebook global ``y_train_final``.

    Returns
    -------
    pandas.DataFrame
        The rate column followed by the columns of ``df``. It is an inner
        merge: rows of ``df`` whose value was never seen in training are
        dropped, and rows come back grouped by feature value, not in the
        order of ``df``.
    """
    if trainFeatures is None:
        trainFeatures = x_train
    if trainLabels is None:
        # The original body referenced `y_train`, which this notebook never
        # defines; the label frame produced by the split is y_train_final.
        trainLabels = y_train_final

    labelled = trainFeatures.loc[:, [feature]].join(trainLabels, how='inner')
    labelled['event'] = labelled['event'].eq(1).astype('int64')
    perValue = labelled.groupby(feature)['event']

    # Every value present in the training features gets a rate, even if none
    # of its rows found a label (rate 0 in that case).
    valueKeys = trainFeatures.groupby(feature).size().index
    fixedCount = perValue.sum().reindex(valueKeys, fill_value=0)
    totalCount = perValue.size().reindex(valueKeys, fill_value=0)

    # The "+ 1" in the denominator is kept from the original formula.
    rates = (fixedCount / (totalCount + 1)).to_frame(feature + 'SuccRate')

    return rates.merge(df, left_index=True, right_on=feature, how='inner')

In [None]:
# Add the assignee and reporter success rates to the training features,
# keep only the model inputs and one-hot encode the categorical columns.
x_train_final = getProportion(getProportion(x_train, 'assignee'), 'reporter')
x_train_final = x_train_final[[
    'assigneeSuccRate',
    'reporterSuccRate',
    'severity',
    'totalCcs',
    'duration',
    'openMonth',
]]
x_train_final = pd.get_dummies(x_train_final, columns=['openMonth', 'severity'])

In [None]:
# Display the prepared training feature matrix.
x_train_final

In [None]:
# Same preparation for the test features; the success rates still come from
# the training data inside getProportion.
# NOTE(review): getProportion uses an inner merge, so test rows with an
# assignee or reporter that does not occur in the training data are dropped
# and x_test_final may end up with fewer rows than y_test_final.
x_test_final = getProportion(getProportion(x_test, 'assignee'), 'reporter')
x_test_final = x_test_final[[
    'assigneeSuccRate',
    'reporterSuccRate',
    'severity',
    'totalCcs',
    'duration',
    'openMonth',
]]
x_test_final = pd.get_dummies(x_test_final, columns=['openMonth', 'severity'])

In [None]:
# Display the prepared test feature matrix.
x_test_final

In [None]:
from sklearn import linear_model

# getProportion() merges on the feature value, so the rows of x_train_final
# are grouped by assignee/reporter and no longer follow the row order of
# y_train_final. Fitting on the raw label array would pair features with the
# wrong labels, so look the labels up by index label first.
y_train_aligned = y_train_final.reindex(x_train_final.index)

logreg = linear_model.LogisticRegression()
logreg.fit(x_train_final, y_train_aligned.iloc[:, 0].values)