# Code to Clean Messy Exam Data

This Python code cleans the messy exam data file and returns a tidy data set to Excel for analysis.

In [8]:
# Import required packages
import pandas as pd

# Load the collated exam results workbook (first sheet) into a DataFrame
messy_data = pd.read_excel(io='Messy Data Example - Collated Exam Results.xlsx', sheet_name=0)

In [9]:
# Reshape from wide to long: every column other than 'Person' is unpivoted,
# with the old column header stored in 'Subject' and the cell contents in 'value'
messy_data = messy_data.melt(
    id_vars=['Person'],
    var_name='Subject',
    value_name='value',
)

# Preview the first few rows of the reshaped data
messy_data.head()

Unnamed: 0,Person,Subject,value
0,PERSON-001,English,82/100 (82.0%)
1,PERSON-002,English,96/100 (96.0%)
2,PERSON-003,English,95/100 (95.0%)
3,PERSON-004,English,36/100 (36.0%)
4,PERSON-005,English,59/100 (59.0%)


In [10]:
# Normalise each result string, e.g. '82/100 (82.0%)' -> '82/100/82.0':
# strip the brackets and percent sign, then turn the remaining space into the
# common '/' delimiter. The pattern is a raw-string character class, so it
# needs no backslash escapes (a non-raw '\(' is an invalid escape sequence and
# triggers a warning on recent Python versions).
messy_data['value'] = messy_data['value'].str.replace(r'[()%]', '', regex=True)
# regex=False makes this a literal replacement regardless of the pandas default
messy_data['value'] = messy_data['value'].str.replace(' ', '/', regex=False)

# Split on the delimiter and assign the pieces to new columns in messy_data
result_columns = ['Score', 'Total Marks Available', '% Performance']
messy_data[result_columns] = messy_data['value'].str.split('/', expand=True)

# str.split yields text; convert to numbers so the Excel export contains
# numeric cells rather than numbers stored as text. A piece that cannot be
# parsed as a number raises ValueError instead of being exported silently.
messy_data[result_columns] = messy_data[result_columns].apply(pd.to_numeric)

# Drop the now-redundant value column from the data frame
messy_data.drop(['value'], axis=1, inplace=True)

# Output head of data to screen
messy_data.head()

Unnamed: 0,Person,Subject,Score,Total Marks Available,% Performance
0,PERSON-001,English,82,100,82.0
1,PERSON-002,English,96,100,96.0
2,PERSON-003,English,95,100,95.0
3,PERSON-004,English,36,100,36.0
4,PERSON-005,English,59,100,59.0


In [11]:
# Write the tidied data to a new Excel workbook, leaving out the index column
messy_data.to_excel(excel_writer='Tidy Data Example - Collated Exam Results.xlsx', index=False)