In [1]:
import os
import pandas as pd

from processing.box_whisker_graphs import BoxWhiskerGraph, MeltDataframe
from processing.clean_data import CleanData
from processing.descriptive_stats import DescriptiveStats
from processing.dispersion_graphs import DispersionGraph
from processing.heatmap import Heatmap
from processing.line_graphs import LineGraphs
from processing.scatter_graphs import ScatterGraph
from processing.time_split import TimeSplit

In [2]:
# Directory the notebook is executed from.
CURRENT_PATH = os.getcwd()
# Add or remove `os.pardir` entries depending on where the data is stored.
# NOTE(review): despite its name, `three_levels_up` climbs two directory levels here;
# the name is kept because later cells build DATA_PATH and OUTPUT_PATH from it.
three_levels_up = os.path.abspath(
    os.path.join(CURRENT_PATH, os.pardir, os.pardir)
)


Variables to be adapted based on the date coverage, the time groups required, the city and the output path.

In [3]:
# Input files are expected to be named "data_<device id>_<date coverage>.csv",
# e.g. "data_004_020924_090924.csv": "004" is one of the device IDs listed in
# `prefixes` and "020924_090924" matches `date_coverage_for_data_path`.

date_coverage_for_data_path = '020924_090924'
prefixes = ['004', '008', '011', '015', '024', '029', '036', '041', '044', '071']

# Number of time groups each day is split into. For example, 4 groups result in the
# following timings: 00:00-06:00, 06:00-12:00, 12:00-18:00 and 18:00-24:00.
number_of_groups_per_day = 8

# Adapt these to the route to your own data and output directories.
DATA_PATH = f'{three_levels_up}/Data/Gurugram'
OUTPUT_PATH = f'{three_levels_up}/Coding/outputs/one_week_ten_sites_8_groups_Gurugram_test'
# Additional text identifying the location and the number of sites; it is passed to the
# plotting classes in later cells.
locations_text = f'{len(prefixes)}_gurugram'

# Alternative configuration for Patna (uncomment to use instead of the Gurugram values above):
# prefixes = ['012', '017', '018', '030', '032', '047', '061', '062', '073', '086']
# number_of_groups_per_day = 8
# DATA_PATH = f'{three_levels_up}/Data/Patna'
# OUTPUT_PATH = f'{three_levels_up}/Coding/outputs/one_week_ten_sites_8_groups_Patna_ppt'
# locations_text = f'{len(prefixes)}_patna'

# The current lists of static device IDs for the two cities are summarised below (as of 13/09/24).
# These can be used for `prefixes` if you have downloaded all the data.
# Gurugram = ['004', '007', '008', '011', '014', '015', '020', '024', '029', '033', '036', '037', '038', '041', '044', '046', '051', '054', '055', '059', '071', '075', '094', '096']
# Patna = ['012', '013', '016', '017', '018', '019', '021', '022', '025', '026', '027', '028', '030', '031', '032', '035', '043', '047', '048', '049', '052', '061', '062', '063', '064', '073', '077', '086', '099']

The cell below only needs to be adapted if you do not want to include all the variables, or if you want to add new variables or time groups.

In [4]:
# Name of the timestamp column; it is kept under two names because later cells use both.
column_containing_dates = 'data_created_time'
key_time_date_column = 'data_created_time'

# Read one CSV per device ID and order each by its timestamp column.
dataframes = [
    pd.read_csv(f'{DATA_PATH}/data_{prefix}_{date_coverage_for_data_path}.csv').sort_values(
        by='data_created_time')
    for prefix in prefixes
]

# Variables analysed throughout the notebook.
variables_list = ['pm_25', 'pm_10', 'no2', 'co', 'temp', 'rh']

# Column names per variable once each device's columns carry its ID as a prefix,
# e.g. '004_pm_25'. The order below must match `variables_list`.
locations_with_pm_25 = [device_id + '_pm_25' for device_id in prefixes]
locations_with_pm_10 = [device_id + '_pm_10' for device_id in prefixes]
locations_with_no2 = [device_id + '_no2' for device_id in prefixes]
locations_with_co = [device_id + '_co' for device_id in prefixes]
locations_with_temp = [device_id + '_temp' for device_id in prefixes]
locations_with_rh = [device_id + '_rh' for device_id in prefixes]

locations_with_pollutant = [
    locations_with_pm_25,
    locations_with_pm_10,
    locations_with_no2,
    locations_with_co,
    locations_with_temp,
    locations_with_rh,
]

# Time groupings used for the box plots and the statistics computed per group.
time_groups = ['day_of_week', 'group_time', 'hour']
descriptive_stats_groups = ['mean', 'max', 'min', 'range', 'iqr', 'std']

The cell below cleans each dataframe, gets the overall descriptive stats and splits the data according to the number of time groups per day you set earlier.

In [5]:
# One entry per device in each list, in the same order as `dataframes`.
clean_dataframes = []
overall_descriptive_stats = []
time_split_multiple = []

for raw_dataframe in dataframes:
    # Clean first: both the statistics and the time split work on the cleaned frame.
    cleaned = CleanData(dataframe=raw_dataframe).remove_anomalies()
    clean_dataframes.append(cleaned)

    device_stats = DescriptiveStats(
        [cleaned],
        variables_list,
        output_directory=OUTPUT_PATH,
    ).get_stats()
    overall_descriptive_stats.append(device_stats)

    split_dataframe = TimeSplit(
        cleaned,
        column_containing_dates,
        number_of_groups_per_day,
    ).split_dataframe()
    time_split_multiple.append(split_dataframe)

Get scatter plots involving all the locations you have included. The variables used here are the ones found to have the highest correlations; however, ['pm_25', 'pm_10', 'temp', 'rh'] could be replaced with variables_list.

In [6]:
# Stack every cleaned device dataframe into a single frame with a fresh index.
combined_dataframe = pd.concat(objs=clean_dataframes, ignore_index=True)

# Kept as the cell's last expression so that whatever scatter_graph() returns is displayed.
ScatterGraph(
    combined_dataframe,
    ['pm_25', 'pm_10', 'temp', 'rh'],
    output_directory=OUTPUT_PATH,
    combined=True,
).scatter_graph()

Unnamed: 0,index,x,y,r2
5,5,temp,rh,0.853351
0,0,pm_25,pm_10,0.749033
2,2,pm_25,rh,0.048221
1,1,pm_25,temp,0.003716
3,3,pm_10,temp,0.003483
4,4,pm_10,rh,0.00191


Get the descriptive stats dataframes for key timings

In [7]:
# One descriptive-stats result per time grouping, in the order listed here.
descriptive_stats_dataframes_timings = [
    DescriptiveStats(
        time_split_multiple,
        variables_list,
        time_group=timing,
        output_directory=OUTPUT_PATH,
    ).get_stats()
    for timing in ['group_time', 'hour', 'date']
]

Get the prefixed dataframes and scatter plots

In [8]:
# Build one device-prefixed copy of each clean dataframe and draw the per-device scatter plots.
prefixed_dataframes = []
for clean_dataframe, prefix in zip(clean_dataframes, prefixes):
    # Prefix every column with the device ID (e.g. 'pm_25' -> '004_pm_25').
    dataframe_prefixed = clean_dataframe.add_prefix(f'{prefix}_')

    # Give the timestamp column its original name back so that all devices share it.
    # The new name is passed as a plain variable: the previous single-quoted f-string
    # had a line break inside its braces, which is a SyntaxError before Python 3.12.
    prefixed_dataframes.append(
        dataframe_prefixed.rename(
            columns={f'{prefix}_{key_time_date_column}': key_time_date_column}
        )
    )

    ScatterGraph(clean_dataframe, variables_list,
                 output_directory=OUTPUT_PATH).scatter_graph()

Merge all the prefixed clean dataframes on the timestamp column, sort the result by time, and then apply the time split to it.

In [9]:
# Outer-join the device dataframes one at a time on the shared timestamp column,
# so a timestamp reported by any device is kept.
merged_dataframe = prefixed_dataframes[0]
for device_dataframe in prefixed_dataframes[1:]:
    merged_dataframe = pd.merge(
        merged_dataframe,
        device_dataframe,
        how='outer',
        on=key_time_date_column,
    )

merged_dataframe_sorted = merged_dataframe.sort_values(by=key_time_date_column)

# Apply the same time split to the merged, time-ordered frame.
updated_merged_dataframe = TimeSplit(
    merged_dataframe_sorted,
    column_containing_dates,
    number_of_groups_per_day,
).split_dataframe()


Get line and box plots for each location and pollutant overall and for the breakdown by time group

In [10]:
# Each entry of `locations_with_pollutant` holds the columns of ONE variable across all
# devices (e.g. ['004_pm_25', '008_pm_25', ...]), so the loop runs once per variable.
# It used to be zipped with `prefixes`, which labelled each variable's line plot with an
# unrelated device ID and silently dropped variables when there were fewer sites than
# variables.
for location_with_pollutant in locations_with_pollutant:
    # Recover the variable name by stripping '<device id>_' from the first column name.
    # The prefix length is taken from `prefixes` rather than hard-coded, so device IDs
    # of any length work.
    value = location_with_pollutant[0][len(prefixes[0]) + 1:]

    # Line plot of this variable for every device over time.
    LineGraphs(merged_dataframe_sorted,
               location_with_pollutant,
               'data_created_time',
               locations=f'{locations_text}_{value}',
               output_directory=OUTPUT_PATH).line_plot()

    # Overall box plot: reshape to long format with one row per (device, reading).
    focused_dataframe = updated_merged_dataframe[location_with_pollutant]
    df_melted = focused_dataframe.melt(var_name='Device', value_name=value)

    # Keep only the device ID in the label; regex=False makes this a literal replacement
    # on every pandas version.
    df_melted['Device'] = df_melted['Device'].str.replace(f'_{value}', '', regex=False)

    BoxWhiskerGraph(df_melted,
                    value,
                    'Device',
                    locations=f'{locations_text}_{value}',
                    output_directory=OUTPUT_PATH).box_whisker_graph()

    # Box plots of the same variable broken down by each time group, one box per device.
    for x_column in time_groups:
        melted_dataframe = MeltDataframe(updated_merged_dataframe,
                                         'locations',
                                         value,
                                         x_column,
                                         location_with_pollutant,
                                         location_more_than_one=True).get_melted_dataframe()

        melted_dataframe['locations'] = melted_dataframe['locations'].str.replace(
            f'_{value}', '', regex=False)

        BoxWhiskerGraph(melted_dataframe,
                        value,
                        x_column,
                        multiple='locations',
                        locations=locations_text,
                        output_directory=OUTPUT_PATH).box_whisker_graph()

In [11]:
# Per-device plots: one line plot of the three pollutants, then one box plot for every
# (variable, time group) combination.
# NOTE(review): the time-group columns are read from the frames in `clean_dataframes`;
# confirm that TimeSplit adds them in place, otherwise `time_split_multiple` may be the
# intended input here.
for device_dataframe in clean_dataframes:
    LineGraphs(
        device_dataframe,
        ['pm_25', 'pm_10', 'no2'],
        'data_created_time',
        output_directory=OUTPUT_PATH,
    ).line_plot()

    variable_group_pairs = [
        (variable, group)
        for variable in variables_list
        for group in time_groups
    ]
    for variable, group in variable_group_pairs:
        BoxWhiskerGraph(
            device_dataframe,
            variable,
            group,
            output_directory=OUTPUT_PATH,
        ).box_whisker_graph()


Get heatmaps and dispersion plots across sites and for different timings

In [12]:
# Time groupings, in the same order the entries of `descriptive_stats_dataframes_timings`
# were computed in.
timing_groups = ['group_time', 'hour', 'date']

# Every per-variable, per-timing slice that gets plotted, in plotting order.
updated_descriptive_stats_dataframes_timings = []

for variable in variables_list:
    # Pair each timing's stats dataframe directly with its own group name. The plotting
    # loop used to re-zip the ever-growing accumulator list on each append, which plotted
    # the first variable's slices repeatedly and, for every later variable, re-plotted the
    # first variable's data under the later variable's label.
    for group_dataframe, group in zip(descriptive_stats_dataframes_timings, timing_groups):
        variable_dataframe = group_dataframe[group_dataframe['variable'] == variable]
        updated_descriptive_stats_dataframes_timings.append(variable_dataframe)

        for descriptive_stat in descriptive_stats_groups:
            plot_label = f'{locations_text}_{descriptive_stat}_{variable}'

            DispersionGraph(variable_dataframe,
                            [descriptive_stat],
                            group,
                            locations_stat_variable=plot_label,
                            output_directory=OUTPUT_PATH).dispersion_graph()

            Heatmap(variable_dataframe,
                    descriptive_stat,
                    group,
                    'device_id',
                    locations=plot_label,
                    output_directory=OUTPUT_PATH).heatmap_plot()
