In [25]:
import pandas as pd

def clean_unemployment_data(file_path):
    """Load an unemployment-rate CSV and return one labelled column per state.

    For every state in the lookup below, the single column whose header
    contains ``'Unemployment rate ;  Persons ;  > <state> ;'`` is kept and
    renamed to ``'<ABBR> Unemployment Rate'``. The ``'Unnamed: 0'`` column is
    kept as ``'Date'``. Rows whose ``Date`` cell is one of the metadata labels
    (``'Unit'``, ``'Series ID'``, ...) are dropped, then exact duplicate rows
    are dropped. Cell values are returned as read from the CSV; no numeric
    conversion is applied and the original row index is preserved.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``Date`` followed by the NSW, VIC, QLD,
        SA, WA, TAS and NT unemployment-rate columns, in that order.

    Raises:
        KeyError: If the CSV has no ``'Unnamed: 0'`` column.
        ValueError: If a state does not resolve to exactly one source column.
    """
    df = pd.read_csv(file_path)

    # Full state name (as it appears in the CSV header) -> label abbreviation.
    state_abbreviations = {
        'New South Wales': 'NSW',
        'Victoria': 'VIC',
        'Queensland': 'QLD',
        'South Australia': 'SA',
        'Western Australia': 'WA',
        'Tasmania': 'TAS',
        'Northern Territory': 'NT',
    }

    # Source column -> output label. Renaming by name (instead of assigning a
    # positional list of labels) guarantees a state's data can never end up
    # under another state's label when the file layout is unexpected.
    column_labels = {'Unnamed: 0': 'Date'}  # Date column
    for state, abbreviation in state_abbreviations.items():
        pattern = f'Unemployment rate ;  Persons ;  > {state} ;'
        # pandas renames repeated header names with '.1', '.2', ... suffixes;
        # skipping those keeps only the first occurrence of each header.
        matches = [
            col for col in df.columns
            if pattern in col and '.1' not in col and '.2' not in col
        ]
        if len(matches) != 1:
            raise ValueError(
                f'Expected exactly one unemployment-rate column for {state!r}, '
                f'found {len(matches)}: {matches}'
            )
        column_labels[matches[0]] = f'{abbreviation} Unemployment Rate'

    # Dict order is insertion order: Date first, then states as listed above.
    df_selected = df[list(column_labels)].rename(columns=column_labels)

    # Remove the metadata rows that sit between the header and the data.
    rows_to_remove = ['Unit', 'Series Type', 'Data Type', 'Series Start',
                      'Series End', 'No. Obs', 'Series ID']
    df_cleaned = df_selected[~df_selected['Date'].isin(rows_to_remove)]

    # Remove any duplicate or redundant rows
    df_cleaned = df_cleaned.drop_duplicates()

    return df_cleaned

# Raw input file for the cleaning step.
file_path = 'data/unemploymentRateTimeSeries.csv'
cleaned_data = clean_unemployment_data(file_path=file_path)

# Persist the cleaned table; index=False keeps the row index out of the file.
cleaned_data.to_csv(path_or_buf='data/cleaned_unemployment_data.csv', index=False)

# Preview the first five rows.
print(cleaned_data.head(n=5))


        Date NSW Unemployment Rate VIC Unemployment Rate  \
7   Feb-1978                   6.6                   5.7   
8   Mar-1978                   6.5                   5.7   
9   Apr-1978                   6.4                   5.6   
10  May-1978                   6.3                   5.6   
11  Jun-1978                   6.3                   5.5   

   QLD Unemployment Rate SA Unemployment Rate WA Unemployment Rate  \
7                    7.1                  6.5                  6.2   
8                    7.1                  6.5                  6.2   
9                    7.1                  6.6                  6.3   
10                   7.1                  6.7                  6.4   
11                   7.1                  6.9                  6.5   

   TAS Unemployment Rate NT Unemployment Rate  
7                    6.3                  6.4  
8                    6.3                  6.1  
9                    6.4                  5.9  
10                   6.4  