In [None]:
#Imports
import pandas as pd
import os.path
import random
import time

#All Function definitions

def generateIDs(keyname, numberOfQuestions):
    '''Builds one ID per question from the text of the raw key file.

    For every question number n in 1..numberOfQuestions the file text is
    searched for "n)" at the start of a line. The ID is the number padded to
    two digits, an underscore, and the stripped text found between 3 and 25
    characters after the start of that match.

    keyname: path of the text file to read.
    numberOfQuestions: how many consecutive question numbers to look up.

    Returns a list of ID strings in question order.
    Raises ValueError when a question number has no usable occurrence.
    '''
    goodIDs = []
    with open(keyname) as file:
        data = file.read()
    for n in range(1, numberOfQuestions + 1):
        locs = []
        for ii in find_all(data, str(n) + ')'):
            # A match only counts at the start of a line. Position 0 is the
            # start of the file; data[ii-1] must not be used there because a
            # negative index would wrap around to the last character.
            at_line_start = ii == 0 or data[ii - 1] in '\r\n'
            # The "_" filter must apply to both \r and \n line starts; the
            # slice (instead of an index) is safe at the very end of the text.
            if at_line_start and data[ii + 3:ii + 4] != "_":
                locs.append(ii)
        if not locs:
            raise ValueError("question " + str(n) + " not found at the start of a line in " + str(keyname))
        # NOTE(review): the fixed offset of 3 matches "n) " for one-digit
        # numbers; for two-digit numbers it lands one character earlier
        # relative to the question text - confirm this is intended.
        theID = str(n).zfill(2) + "_" + data[locs[0] + 3:locs[0] + 25].strip()
        goodIDs.append(theID)
    return goodIDs

def unduplicate(dup_list):
    '''Makes repeated entries of a list unique.

    The list is loaded into a one-column dataframe; every entry that repeats an
    earlier one gets its own row position appended as text (and is reported on
    stdout). The first occurrence is left untouched. The result is converted
    back to a plain list.'''
    frame = pd.DataFrame(dup_list)
    repeated_rows = frame[frame.duplicated(keep='first')]
    for row in list(repeated_rows.index):
        frame.iat[row, 0] = frame.iat[row, 0] + str(row)
        print("Unduplicating: "+ frame.iat[row, 0])
    return list(frame.values.flatten())

def createQuestionIDs(numberOfQuestions):
    '''Generates question IDs for every key file (key0.txt .. key5.txt) that exists.

    IDs of the first version found are made unique with unduplicate(). The IDs
    generated for the other versions can differ slightly, so each one is
    replaced by the matching ID of the first version (matched on characters
    4..20 of that ID).

    numberOfQuestions: number of questions per version.

    Returns a list with one list of IDs per version found.
    Raises FileNotFoundError when no key file exists, and ValueError when an
    ID of the first version has no counterpart in another version.
    '''
    allIDs = []
    for n in range(0, 6):
        filename = "key" + str(n) + ".txt"
        if os.path.isfile(filename):
            allIDs.append(generateIDs(filename, numberOfQuestions))
    if not allIDs:
        raise FileNotFoundError("no key files (key0.txt .. key5.txt) found in the working directory")
    allIDs[0] = unduplicate(allIDs[0])
    # The number of versions is taken from the files actually read rather than
    # from a module-level variable that may not exist yet.
    for n in range(1, len(allIDs)):
        for m in range(0, numberOfQuestions):
            sub = allIDs[0][m][4:20]
            found_in = next((i for i, s in enumerate(allIDs[n]) if sub in s), None)
            if found_in is None:
                raise ValueError("no ID containing '" + sub + "' in version " + str(n))
            if allIDs[n][found_in] != allIDs[0][m]:
                allIDs[n][found_in] = allIDs[0][m]

    return allIDs

def find_all(a_str, sub):
    '''Finds all non-overlapping instances of a substring in a string.

    a_str: text to search.
    sub: substring to look for.

    Returns the list of start positions in ascending order. An empty `sub`
    matches at every position 0..len(a_str).
    '''
    # Always advance by at least one character: with an empty `sub` a step of
    # len(sub) == 0 would never move forward and loop forever.
    step = max(len(sub), 1)  # use step = 1 to find overlapping matches
    result = []
    start = a_str.find(sub, 0)
    while start != -1:
        result.append(start)
        start = a_str.find(sub, start + step)
    return result

def cleanKey(key_name):
    '''Extracts just the "key" part of an exported test and writes it to 'cleaned_' + key_name.

    If the text contains a run of five underscores, everything up to the last
    such run is dropped, and the text is then cut to start at the first "1)"
    that follows. If there is no such run the text is kept as it is (the
    questions have already been deleted). "ID:" at the start of a line is
    joined to the previous line with a tab, so the question ID ends up in a
    separate column.

    key_name: name of the key file to read.

    Raises FileNotFoundError when key_name is not an existing file.
    '''
    if not os.path.isfile(key_name):
        # Without this check the code below would fail on an unbound `data`.
        raise FileNotFoundError("key file not found: " + str(key_name))
    with open(key_name) as file:
        data = file.read()
    p = data.rfind('_____')
    if p != -1:
        data = data[p + 5:]  # delete all the junk before "_____"
        first_answer = data.find('1)')
        if first_answer != -1:
            # Only slice when "1)" exists; slicing from -1 would keep just the
            # last character of the text.
            data = data[first_answer:]
    for marker in ('\rID:', '\nID:'):
        data = data.replace(marker, '\t')
    with open('cleaned_' + key_name, 'w') as file:
        file.write(data)
        
def getAllKeys():
    '''Reads every existing file named key0.txt .. key5.txt and converts each to a key string.

    Each file is first passed through cleanKey() (always, to avoid using an
    outdated cleaned file), then the first column of the cleaned file is read
    and turned into a key string by makekey().

    Returns a dictionary with
        "keylist": list of key strings, one per key file found, and
        "numberOfQuestions": number of rows in the last key file read.
    Raises FileNotFoundError when none of the key files exists.
    '''
    keylist = []
    numberOfQuestions = None
    for n in range(0, 6):
        filename = "key" + str(n) + ".txt"
        if os.path.isfile(filename):
            cleanKey(filename)
            keys = pd.read_table("cleaned_" + filename, header=None, usecols=[0])
            keylist.append(makekey(keys))
            numberOfQuestions = keys.shape[0]
    if not keylist:
        # Previously this case failed later on an unbound `keys` variable.
        raise FileNotFoundError("no key files (key0.txt .. key5.txt) found in the working directory")
    return {"keylist": keylist, "numberOfQuestions": numberOfQuestions}


def makekey(key_from_test):
    '''Converts an answer-key dataframe into a string of digits.

    key_from_test: dataframe formed from reading the answer key file; column 0
        holds entries such as "1)C".

    Only the last character of each entry is used (the "C" from "1)C"), after
    stripping surrounding whitespace. Letters A-E become the digits 0-4.

    Returns the key as one text string, one digit per row.
    Raises KeyError when an entry does not end in one of A-E.
    '''
    # Kept local so the function does not depend on a module-level dictionary
    # that is only defined in a later notebook cell.
    letter_to_digit = {"A": "0", "B": "1", "C": "2", "D": "3", "E": "4"}
    digits = []
    for i in range(key_from_test.shape[0]):
        entry = str(key_from_test.iat[i, 0]).strip()
        digits.append(letter_to_digit[entry[-1:]])
    return "".join(digits)


def gradeWithKeylist(keylist, ans, numberOfQuestions, QIDs, points=[], analysis=False):
    '''Grades one answer string against the key of the version it names.

    The last character of `ans` selects the version: it is used as the index
    into `keylist` and `QIDs`. That last position itself is never compared and
    never earns points.

    keylist: list of key strings, one per version.
    ans: the answer string of one exam; must be as long as the selected key.
    numberOfQuestions: number of positions per exam (including the version one).
    QIDs: list of question-ID lists, one per version; used to bring the
        right/wrong flags into sorted-ID order before points are applied.
    points: optional list of points per question in sorted-ID order; every
        question counts 1 when empty. The caller's list is not modified.
    analysis: when True, one row is appended to each of the module-level
        dataframes `analysis_df` (right/wrong flags plus score) and
        `allAnswers_df` (the answers), both in sorted-ID order.

    Returns {'missed': text listing the missed question numbers or
    "ALL CORRECT", 'score': total points}.
    '''
    global analysis_df, allAnswers_df
    # Kept local so grading does not depend on a dictionary defined in a later cell.
    version_letters = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E"}

    assert(keylist != [])
    whichKey = int(ans[-1:])
    key = keylist[whichKey]
    assert len(key) == len(ans)

    rejalt = [1] * numberOfQuestions  # 1 = correct, 0 = missed
    missed_numbers = []
    for n in range(len(key) - 1):  # the last position is the version, not a question
        if key[n] != ans[n]:
            missed_numbers.append(str(n + 1))
            rejalt[n] = 0
    if sum(rejalt) == numberOfQuestions:
        missed = "ALL CORRECT"
    else:
        missed = "v" + version_letters[whichKey] + ": " + ", ".join(missed_numbers)

    # Re-order the flags by question ID so every version lines up the same way.
    flag_by_id = dict(zip(QIDs[whichKey], rejalt))
    sorted_rejalt = [flag_by_id[k] for k in sorted(flag_by_id.keys())]

    # Work on a copy: zeroing the last entry must not leak into the caller's list.
    points = list(points) if points else [1] * numberOfQuestions
    points[-1] = 0  # No points for the test version
    score = sum([i * j for i, j in zip(points, sorted_rejalt)])

    if analysis:
        sorted_rejalt.append(score)
        analysis_df.loc[len(analysis_df)] = sorted_rejalt

        answer_by_id = dict(zip(QIDs[whichKey], ans))
        sorted_ans = [answer_by_id[k] for k in sorted(answer_by_id.keys())]
        allAnswers_df.loc[len(allAnswers_df)] = sorted_ans

    return {'missed': missed, 'score': score}

def process_grades(keylist,data,numberOfQuestions, QIDs, points=[], analysis=False):
    '''Grades all exams using the correct keys and writes questions missed and scores.

    keylist, numberOfQuestions, QIDs, points, analysis: passed through to
        gradeWithKeylist() for every row.
    data: dataframe with the answer string in column 2; the missed-questions
        text is written to column 3 and the score to column 4 (by position).

    When `analysis` is True the second-to-last column of the module-level
    `analysis_df` is dropped after grading.

    Returns `data` (modified in place).
    '''
    global analysis_df
    for NN in range(0, data.shape[0]):
        # Read from the `data` argument, not from a module-level dataframe.
        result = gradeWithKeylist(keylist, data.iat[NN, 2], numberOfQuestions, QIDs, points, analysis)
        data.iat[NN, 3] = result['missed']
        data.iat[NN, 4] = result['score']
    # analysis_df is only filled when analysis is requested, so only trim it then.
    if analysis and QIDs:
        analysis_df.drop(analysis_df.columns[analysis_df.shape[1]-2], axis=1, inplace=True)

    return data


def count_how_many(col, value):
    '''Counts the number of entries of the pandas series "col" that equal "value".

    Works for any index (the comparison is element-wise, not label based).
    Not used since pandas has a built-in function.

    Returns the count as an int.
    '''
    return int((col == value).sum())


def addStarsToCorrectChoices(rdf, keylist, m, QIDS):
    '''Adds a star to the cell of the correct choice in every row of rdf.

    rdf: dataframe of text cells with one row per question and one column per
        answer choice (column labels are the key characters).
    keylist: list of key strings, one per version.
    m: version number; for versions other than 0 the key is first re-ordered
        from the order of QIDS[m] into the order of QIDS[0].
    QIDS: list of question-ID lists, one per version.

    Returns rdf (modified in place).
    Raises KeyError when the correct choice of a row has no column in rdf.
    '''
    if rdf.shape[0] == 0:
        return rdf
    # The key order does not depend on the row, so work it out once.
    if m != 0:
        answer_by_id = dict(zip(QIDS[m], keylist[m]))
        sortedkey = [answer_by_id[x] for x in QIDS[0]]
    else:
        sortedkey = keylist[m]
    for n in range(0, rdf.shape[0]):
        # Write through .iat: chained rdf.iloc[n][col] = ... may update a
        # temporary copy instead of rdf itself.
        column_position = rdf.columns.get_loc(sortedkey[n])
        rdf.iat[n, column_position] = rdf.iat[n, column_position] + "*"
    return rdf

def analyse_items(a_df, QIDs):
    '''Creates, for each version separately, a dataframe counting how often each choice was marked.

    a_df: dataframe of answers, one row per exam and one column per question;
        the last column holds the version digit ('0'..'4').
    QIDs: list of question-ID lists, one per version (passed on to
        addStarsToCorrectChoices).

    For every version that has at least one exam, the rows of that version are
    transposed, the values of each question are counted, the counts are turned
    into text, the correct choice is starred and the columns are renamed
    (' ' -> 'blank', '0'..'4' -> 'A'..'E'). The versions found are printed.

    Reads the module-level `keylist`.
    Returns the list of per-version dataframes, in version order.
    '''
    vers=['0','1','2','3','4']
    lvers=['A','B','C','D','E']
    new_names={' ': 'blank'}
    new_names.update(zip(vers, lvers))
    outvars=[]
    print("Versions found:")
    # Select on the last column of the dataframe that was passed in, not on a
    # module-level dataframe.
    version_column = a_df[list(a_df)[-1]]
    for n in range(0,len(vers)):
        ch=str(n)
        part_df = a_df.loc[version_column==ch]
        if not part_df.empty:
            print(lvers[n])
            part_df=part_df.T
            part_df=part_df.apply(pd.Series.value_counts, axis=1).fillna(0)
            # NOTE(review): newer pandas releases deprecate applymap in favour
            # of DataFrame.map - confirm against the installed version.
            part_df=part_df.applymap(str)
            part_df=addStarsToCorrectChoices(part_df,keylist,n, QIDs)
            # rename ignores labels that are not present
            part_df=part_df.rename(columns=new_names)
            outvars.append(part_df)
        else:
            print(lvers[n]+" not found")

    return outvars


 ####################### SUGGESTED USAGE#################################
# outs = getAllKeys()
# keylist=outs["keylist"]
# numberOfQuestions=outs["numberOfQuestions"] 
# numberOfVersions=len(keylist)

# QIDs=createQuestionIDs(numberOfQuestions)
# analysis_df  = pd.DataFrame(columns = QIDs[0])
#count_how_many(allAnswers_df.iloc[12,:] , ' ')

Manually convert the .dat file into an Excel file using Excel. Extract only the serial number,
name, and answers columns, and make sure the Answers column is stored as text.
Then do the following by hand:
- check the serial numbers for duplicates (using "Remove Duplicates")
- fill in any missing information

In [5]:
# COLLECTING DATA
# Reads Sheet1 of <rawdatafilename>.xlsx into df and adds the two result columns.
rawdatafilename = 'All'
# NOTE(review): passing `dtype` to pd.ExcelFile and `parse_cols` to parse()
# look like legacy pandas usage - confirm against the installed pandas version.
xls_file = pd.ExcelFile(rawdatafilename + '.xlsx', dtype=str)
df = xls_file.parse(
    'Sheet1',
    header=None,
    parse_cols=2,  # makes sure that only cols 0, 1 and 2 are extracted
    names=["Srl No", "Name", "Answers"],
)
# Check for blanks in the answers; print only if blanks are found.
rows_with_blanks = df[df['Answers'].str.contains(" ")]
if not rows_with_blanks.empty:
    print(rows_with_blanks)
df["Missed"] = ""
df["Score"] = 0

     Srl No               Name                     Answers
31     1034      PASCAL JOEZER  2041232433101420044 214422
192    2043  LAMBERT DOMINIC E  103020224314 1312213210120
253    2127  IKEDIONWU MUNACHI  103222121102013144032211 0


In [6]:
# Here is where we run the code: read the keys, build the question IDs,
# grade every row of df and build the per-version item analysis.
# Lookup tables: answer letter -> key digit, and version number -> version letter.
keydictionary={"A":"0","B":"1","C":"2","D":"3","E":"4"}
invkeydictionary={0:"A",1:"B",2:"C",3:"D",4:"E"}
outs = getAllKeys()
keylist=outs["keylist"]
numberOfQuestions=outs["numberOfQuestions"] 
numberOfVersions=len(keylist)
numberOfStudents=df.shape[0]
QIDs=createQuestionIDs(numberOfQuestions)
headings=list(QIDs[0])# copy on purpose: headings=QIDs[0] would alias the list, so appending to headings below would also change QIDs[0]
allAnswers_df = pd.DataFrame(columns = headings)# stores all student answers for each question, questions are the columns
#results_df = pd.DataFrame(index=headings, columns = ["A","B","C","D","E","Skipped","Diff","Disc"])
# "score" is appended only after allAnswers_df was created, so only analysis_df gets that column
headings.append("score")
analysis_df  = pd.DataFrame(columns = headings)# stores whether answer was correct, questions are the columns, last column is the score
# Time the grading and the item analysis together.
starttime = time.time()
df=process_grades(keylist,df,numberOfQuestions, QIDs, analysis=True)
number_correct_by_version=analyse_items(allAnswers_df, QIDs)
endtime = time.time()
print("That took: "+str(endtime-starttime)+ " sec")

# results_df=allAnswers1_df.apply(pd.Series.value_counts, axis=1).fillna(0)
# if ' ' in list(results_df):
#     results_df.rename(columns={' ': 'blank'}, inplace=True)
# results_df=results_df.applymap(str)
# results_df=addStarsToCorrectChoices(results_df,keylist)
# results_df["Diff"]=0
# results_df["Discr"]=0

Versions found:
A
B
C
D not found
E not found
That took: 1.4002532958984375 sec


In [None]:
# Grades and displays one randomly chosen entry (row N of df) using the keylist,
# as a spot check of the grading.
# randrange covers every row position 0 .. len-1, so the first row can be picked too.
N=random.randrange(df.shape[0])

check1=gradeWithKeylist(keylist, df.iat[N,2],numberOfQuestions, QIDs, analysis=False)
# The last position of an exam is the version, hence numberOfQuestions-1 as the maximum.
print("Row "+ str(N)+": "+df.iat[N,1]+", "+str(df.iat[N,0])+". Missed "+str(check1['missed'])+ ". Scored "+str(check1['score'])+"/"+str(numberOfQuestions-1))

In [None]:
# Write the graded rows and the analysis table to CSV files named after the raw data file.
df.to_csv(rawdatafilename+'_processed.csv')
analysis_df.to_csv(rawdatafilename+'_analysis.csv')
#### Don't open these files directly in Excel, since the "Answers" column gets messed up. Import them using the import wizard.

In [None]:
# Show analysis_df, the table of right/wrong flags that is filled in while grading.
analysis_df

In [None]:
# Show allAnswers_df, the table of answers that is filled in while grading.
allAnswers_df

In [None]:
# Number of per-version tables returned by analyse_items.
len(number_correct_by_version)

In [None]:
# Append one row holding the sum of every column.
# NOTE(review): running this cell again appends another sum row (which then
# includes the previous one) - run it once per grading run.
analysis_df.loc[len(analysis_df)]=analysis_df.sum(axis=0)
#analysis_df=analysis_df.drop(analysis_df.index[analysis_df.shape[0]-1])
#analysis_df=analysis_df.drop(analysis_df.index[[4,5]])

In [None]:
# Start an empty table whose columns are the entries of headings.
item_analysis_df=pd.DataFrame(columns = headings)

In [None]:
# Row 0 = the row of analysis_df labelled (number of rows - 1), divided by the number of students.
# NOTE(review): presumably that row is the column-sum row appended earlier - this depends on cell order, verify.
item_analysis_df.loc[0,:]=analysis_df.loc[analysis_df.shape[0]-1,:].div(numberOfStudents)

In [None]:
# Show analysis_df again to inspect its current rows.
analysis_df

In [None]:
# Position of the last row of analysis_df.
analysis_df.shape[0]-1

In [None]:
# Drop the row at position 4 of analysis_df.
# NOTE(review): the position 4 is hard-coded; which row that is depends on the data and on earlier cells - confirm before running.
analysis_df=analysis_df.drop(analysis_df.index[4])

In [None]:
# Show the number of students (rows of df at the time the grading cell ran).
numberOfStudents

In [None]:
# Rename the row label 'Difficulty' of analysis_df to 0 (no effect when no such label exists).
analysis_df.rename( index={'Difficulty':0}, inplace=True)

In [None]:
# Label row 0 of item_analysis_df as 'Difficulty'.
item_analysis_df.rename( index={0:'Difficulty'}, inplace=True)

In [None]:
# Show the item analysis table.
item_analysis_df