# Cleaning the data

### Step 1: Importing in the necessary packages

In [None]:
import pandas as pd

### Step 2: Reading in the CSV we made in RStudio

In [None]:
# path to the CSV that was exported from RStudio
csv_path = "data/BSA2020/bsa2020.csv"
# low_memory=False makes pandas parse the whole file in one pass, so each column
# gets a single consistent dtype instead of chunk-by-chunk (possibly mixed) inference
BSA_2020 = pd.read_csv(csv_path, low_memory=False)

In [None]:
BSA_2020.head()  # preview the first few rows to check the data loaded as expected

In [None]:
BSA_2020.shape  # (rows, columns) of the full dataframe, as a quick sanity check

### Step 3: Subsetting
To create a synthetic population of York, we only need the survey entries from Yorkshire and Humberside, and only some of the available columns, so we will subset the data down to just what we are after.

In [None]:
# keep only the rows for Yorkshire and Humberside, which is indicated by GOR having the value 3
yorkshire_bsa = BSA_2020.loc[BSA_2020["GOR"] == 3]

In [None]:
yorkshire_bsa.shape  # (rows, columns) after keeping only the GOR == 3 rows

In [None]:
# the columns we want to keep - only a fraction of those available in the full dataset
keep_columns = ['RespAgeE', # age at last birthday, capped at 80+
                'RespSx2cat', # respondent's sex
                'REconSum20', # economic activity
                'SupParty', # do they support a party
                'PARTYFW', # which party do they follow
                'Politics', # do they have an interest in politics
                'welfgrp', # opinions on the welfare system
                'Redistrb', # pro wealth redistribution?
                'leftrigh', # left or right leaning (scaled)
                'leftrig2', # left or right leaning (grouped)
                'libauth', # liberal vs authoritarian (scaled)
                'libauth2', # liberal vs authoritarian (grouped)
                'ReligSum20', # do they follow a religion
                'BestNatU2', # what nationality best describes them
                'RaceOri4', # what race best describes them
                'DisActDV', # do they have a long-term condition or disability
                'Voted', # did they vote in the last general election
                'Vote', # who did they vote for in the last general election
               ]

In [None]:
# creating a function to subset columns
def subset_dataframe(dataframe, keep_columns):
    """Return a new dataframe holding only the requested columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The dataframe to take the columns from. It is not modified.
    keep_columns : iterable of str
        Names of the columns to keep, in the order they should appear in the
        result. A name that is listed more than once is only kept once.

    Returns
    -------
    pandas.DataFrame
        A copy containing just the requested columns, with the same row index
        as ``dataframe``.

    Raises
    ------
    KeyError
        If any of the requested columns is not present in ``dataframe``.
    """
    # dict.fromkeys removes repeated names while preserving their order, and
    # turns any iterable (e.g. a tuple) into a plain list for column selection
    columns = list(dict.fromkeys(keep_columns))
    # selecting with a list picks all columns in one step; .copy() makes the
    # result independent of the dataframe it was taken from
    return dataframe[columns].copy()

In [None]:
# keep only the columns named in keep_columns from the Yorkshire and Humberside rows
subset_bsa = subset_dataframe(yorkshire_bsa, keep_columns)

In [None]:
subset_bsa.shape  # we expect 18 columns here, one for each entry in keep_columns

Now that we have a nice little subset, we'll export it as a CSV so we don't have to run any of the code above again.

In [None]:
# NOTE: this export is live and overwrites the CSV every time the cell is run;
# comment the line out once the file exists if it should not be regenerated
subset_bsa.to_csv("data/BSA2020/subset4_regional_bsa_data.csv", index=False)

### Step 4: Cleaning the new CSV
We don't want any NaN values in our data, so we will clean this dataset by removing every row with a missing entry, leaving us with a complete dataset to work with.

In [None]:
# reading back in the CSV so we don't have to rerun all the steps above
csv_path_2 = "data/BSA2020/subset4_regional_bsa_data.csv"
# low_memory=False makes pandas parse the whole file in one pass, so each column
# gets a single consistent dtype
subset_bsa_cleaning = pd.read_csv(csv_path_2, low_memory=False)

In [None]:
subset_bsa_cleaning.head()  # even in the first few rows we can already see columns with NaN values

In [None]:
# drop every row that has a missing value in any column, giving a dataset with no NAs
bsa_cleaned = subset_bsa_cleaning.dropna()

In [None]:
# resetting the index of this data so the rows are numbered 0, 1, 2, ... again
# after rows were removed. reset_index returns a new dataframe rather than
# changing bsa_cleaned in place, so the result has to be assigned back, and
# drop=True stops the old index from being kept as an extra 'index' column
bsa_cleaned = bsa_cleaned.reset_index(drop=True)

Next, we'll rename some of the columns to more descriptive names to help with the analysis later on.

In [None]:
# list the current column names before renaming them
bsa_cleaned.columns

In [None]:
# mapping from the original survey column names to more descriptive ones
# ('libauth' and 'libauth2' map to themselves, so those two names do not change)
new_names = {
    'RespAgeE': 'age', # age of respondent at last birthday (capped at 80)
    'RespSx2cat': 'sex', # sex of respondent
    'REconSum20': 'economic', # economic activity of respondent
    'SupParty': 'partySup', # does respondent support a particular party
    'PARTYFW': 'partySupWho', # which party do they support (if they had to choose on day of survey)
    'Politics': 'polInterest', # do they have an interest in politics
    'welfgrp': 'welfare', # opinions on the welfare system
    'Redistrb': 'redistrb', # pro wealth redistribution?
    'leftrigh': 'leftright', # left or right leaning (scaled)
    'leftrig2': 'leftright2', # left or right leaning (grouped)
    'libauth': 'libauth', # liberal vs authoritarian (scaled)
    'libauth2': 'libauth2', # liberal vs authoritarian (grouped)
    'ReligSum20': 'religion', # what religion do they follow
    'BestNatU2': 'nationality', # what nationality best describes them
    'RaceOri4': 'raceOrigin', # which racial group do they best identify with
    'DisActDV': 'disability', # do they have a long-term condition or disability
    'Voted': 'voteAct', # did they vote in the last general election
    'Vote': 'voteParty', # who did they vote for in the last general election
}

# rename returns a new dataframe, so the result is assigned back to bsa_cleaned
bsa_cleaned = bsa_cleaned.rename(columns=new_names)

In [None]:
bsa_cleaned.head()  # checking the renamed columns - much easier to interpret!

In [None]:
# adding an id column as the first column, counting up from 1000 with one id per row.
# The check makes the cell safe to re-run: inserting a column that already exists
# would otherwise raise "ValueError: cannot insert id, already exists".
# NOTE(review): the ids are assigned before the age == 999 rows are dropped further
# down, so the final file can have gaps in the id sequence - confirm this is intended
if 'id' not in bsa_cleaned.columns:
    bsa_cleaned.insert(0, 'id', range(1000, 1000 + len(bsa_cleaned)))
bsa_cleaned.head()

In [None]:
# dropping individuals who refused to answer what age they are (recorded as age 999).
# Keeping every row whose age is not 999 is equivalent to dropping the matching rows;
# .copy() gives an independent dataframe so later edits do not trigger
# pandas' SettingWithCopyWarning
bsa_cleaned = bsa_cleaned.loc[bsa_cleaned['age'] != 999].copy()

In [None]:
# confirm that 999 no longer appears among the age values
bsa_cleaned['age'].unique()

In [None]:
# export the final cleaned dataset; index=False leaves the row index out of the file
bsa_cleaned.to_csv("data/BSA2020/final_bsa_cleaned.csv", index=False)