In [1]:
# import libraries needed for cleaning
from pylab import *
import numpy as np
import pandas as pd
from scipy import stats
from datetime import datetime

# specify number of rows and columns to show
pd.set_option('display.max_rows', 20) 
pd.set_option('display.max_columns', 100)
# set default number format to 3 decimal places
pd.options.display.float_format = '{:40,.3f}'.format
# set ggplot style for plots
plt.style.use('ggplot') 
%matplotlib inline

In [2]:
# Read in the two raw visit exports
set1 = pd.read_csv('EveryoneActive_6.8.16_8.1.17.csv')
set2 = pd.read_csv('EveryoneActive_8.2.17_8.31.18.csv')

# Combine the two dataframes into a single dataset that runs from June 8, 2016 - August 31, 2018.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file keeps its own row labels, so labels repeat in the combined frame
# and any later label-aligned operation becomes ambiguous.
frames = [set1, set2]
fullset = pd.concat(frames, ignore_index=True)

In [3]:
# Check counts and datatypes.
# A notebook cell only echoes its final expression, so the non-null counts are
# printed explicitly; as a bare expression in the middle of the cell their
# result would be computed and then silently thrown away.
print(fullset.count())

# Column dtypes are the cell's displayed value
fullset.dtypes

MembershipID                  int64
SportsCentre                 object
VisitDate                    object
VisitTime                    object
Age                          object
Gender                       object
Ethnicity                    object
EthnicGroup                  object
Postcode                     object
ActivityType                 object
MembershipType               object
CitySaveActiveWestminster    object
FirstVisit                   object
dtype: object

In [4]:
# Build one TimeStamp column by joining the date and time strings and parsing the result.
# NOTE(review): no format/dayfirst is given, so pandas infers the day/month order — confirm against the raw files.
visit_stamp = fullset['VisitDate'] + ' ' + fullset['VisitTime']
fullset['TimeStamp'] = pd.to_datetime(visit_stamp)

# The separate date and time columns are no longer needed
fullset = fullset.drop(['VisitDate', 'VisitTime'], axis='columns')

# Put TimeStamp in the position the date/time columns used to occupy
column_order = [
    'MembershipID', 'SportsCentre', 'TimeStamp', 'Age', 'Gender',
    'Ethnicity', 'EthnicGroup', 'Postcode', 'ActivityType',
    'MembershipType', 'CitySaveActiveWestminster', 'FirstVisit',
]
fullset = fullset[column_order]

# Parse Age as a number; values that cannot be parsed become NaN
fullset['Age'] = pd.to_numeric(fullset['Age'], errors='coerce')

# Map the spelling variants of each sports centre onto one short name
centre_names = {
    'Moberley Sports Centre': 'Moberly SC',
    'Moberly Sports Centre': 'Moberly SC',
    'Queen Mother Sport Centre': 'Queen Mother SC',
    'Marshall Street L C': 'Marshall Street LC',
    'Seymour Leisure Centre': 'Seymour LC',
    'Jubilee S C London': 'Jubilee SC',
    'Jubilee Community Centre': 'Jubilee SC',
    'Little Venice Sports C': 'Little Venice SC',
}
fullset['SportsCentre'] = fullset['SportsCentre'].replace(centre_names)



In [5]:
# Create the new category of age bands.
# Bin edges are closed on the right: [0, 15] -> '0-15', (15, 64] -> '16-64', (64, inf] -> '65+'.
bins = [0, 15, 64, np.inf]
names = ['0-15', '16-64', '65+']

# include_lowest=True closes the first interval on the left so that an Age of
# exactly 0 falls in '0-15'; with pd.cut's default it would be left unbanded.
# NOTE(review): this assumes an Age of 0 is a real age, not a placeholder for "unknown" — confirm.
age_band = pd.cut(fullset['Age'], bins, labels=names, include_lowest=True)

# Fill missing AgeBands based on MembershipType category.
# Membership types that are not listed here map to NaN and leave the band missing.
d = {'Oxygen - Senior': '65+', 'Fitness - Senior':'65+', 'Centre - Senior':'65+', 'Oxygen - Adult':'16-64',
     'Fitness - Adult':'16-64', 'Centre - Adult':'16-64','Oxygen - Young Adult':'16-64', 'Fitness - Young Adult':'16-64',
     'Centre - Young Adult':'16-64','Adult':'16-64','Centre - Young Adult Conc':'16-64','Fitness - Junior':'0-15',
     'Oxygen - Junior':'0-15', 'Centre - Junior':'0-15', 'Centre - Jnr Concession':'0-15'}
s = fullset['MembershipType'].map(d)

# Keep the age-derived band where there is one, otherwise take the
# membership-derived band. Both series come from the same rows of fullset, so
# the fill is done by position on the underlying arrays: it does not depend on
# the index labels being unique, which a label-aligned combine would.
filled = age_band.astype(object).where(age_band.notna().values, s.values)

# Store the result as an ordered categorical over the three bands so the column
# dtype does not depend on how the two sources were combined.
fullset['AgeBand'] = pd.Categorical(filled, categories=names, ordered=True)


# Re-order the column list so AgeBand sits directly after Age
fullset = fullset[['MembershipID','SportsCentre','TimeStamp','Age', 'AgeBand','Gender','Ethnicity','EthnicGroup','Postcode','ActivityType','MembershipType','CitySaveActiveWestminster','FirstVisit']]

In [6]:
# Lookup tables, one per column, mapping each raw label to its consolidated label.
# Labels that do not appear as a key are left unchanged by replace().
# NOTE(review): Gender folds into 'Not Recorded' while the two ethnicity columns fold
# 'Not Recorded' into 'Unknown' — confirm the differing missing-value labels are intended.
label_fixes = {
    'Gender': {
        'Unspecified': 'Not Recorded',
    },
    'Ethnicity': {
        'Arabic': 'Arab',
        'Not willing to supply': 'Unknown',
        'White European Other': 'White Other',
        'Not Recorded': 'Unknown',
        'Other Mixed Background': 'Mixed Other',
        'Mixed Race': 'Mixed Other',
        'White & Asian': 'White and Asian',
        'Black Somali': 'Black Other',
        'White & Black Caribbean': 'White and Black Caribbean',
        'White& Black African': 'White and Black African',
        'White': 'White British',
        'Turkish': 'Other',
        'Brazilian': 'Other',
        'British': 'White British',
        'Whte British': 'White British',
        'ASIAN': 'Asian Other',
        'White Italian': 'White Other',
    },
    'EthnicGroup': {
        'Not Recorded': 'Unknown',
        'Prefer not to say': 'Unknown',
    },
}

# Apply each table to its own column
for column, fixes in label_fixes.items():
    fullset[column] = fullset[column].replace(fixes)




In [7]:
# Cut the cleaned frame into two pieces by row position for export:
# the first 700,000 rows, and everything after them
split_at = 700000
fullset1 = fullset.iloc[:split_at]
fullset2 = fullset.iloc[split_at:]

# Writing the two pieces to CSV is currently disabled; uncomment to export
#fullset1.to_csv('1EveryoneActive_CLEANED.csv', index=False)
#fullset2.to_csv('2EveryoneActive_CLEANED.csv', index=False)