# Data preprocessing

In [77]:
import pandas as pd
from PAMI.extras.DF2DB import denseDF2DB as pro

# Load the raw air-pollution readings from CSV.
dataset = pd.read_csv('air_pollution_data.csv')

# Discard the timestamp column and substitute zero for every missing value (NaN).
dataset = dataset.drop(columns=['Timestamp']).fillna(0)
# Keep readings that are <= 100 and overwrite everything else with zero.
dataset = dataset.where(dataset <= 100, 0)

# Set up the dense-DataFrame converter: an entry becomes an item of a
# transaction when its value is greater than or equal to 15.
db = pro.denseDF2DB(inputDF=dataset, thresholdValue=15, condition='>=')
# Write the resulting transactional database to disk.
db.createTransactional(outputFile='PM24HeavyPollutionRecordingSensors.csv')

# Re-encode the tab-separated transactional database as a comma-separated file
# (the file is rewritten in place under the same path).
csv_file_path = 'PM24HeavyPollutionRecordingSensors.csv'
# header=None: every line of the file is read as data. Without it pandas takes
# the first line as column names and the first transaction is silently lost.
# NOTE(review): this assumes createTransactional writes no header row - confirm
# against the PAMI documentation.
df = pd.read_csv(csv_file_path, sep=',', header=None)
# Each row holds a single string (the whole line); collect the rows as lists.
df_list = df.values.tolist()
# Split every line on tabs to recover the individual items of the transaction.
new_list = [row[0].split('\t') for row in df_list]
# Rows of different lengths are padded with missing values by the DataFrame
# constructor, which to_csv writes out as empty fields.
new_df = pd.DataFrame(new_list)
# header=False: do not emit the integer column labels (0,1,2,...) as a first
# line, which downstream readers would mistake for a transaction.
new_df.to_csv(csv_file_path, index=False, header=False)

# Knowledge discovery

In [78]:
from PAMI.frequentPattern.basic import FPGrowth as alg

# Mining constraint: minimum support passed to the algorithm.
minSup = 1
# Transactional database produced by the preprocessing step.
inputFile = 'PM24HeavyPollutionRecordingSensors.csv'

# Instantiate the FP-Growth miner on that file.
# NOTE(review): no separator argument is passed here, while the preprocessing
# step rewrites this file as comma-separated - confirm that the delimiter
# FPGrowth assumes by default matches the file.
obj = alg.FPGrowth(inputFile, minSup)
# Run the mining process.
obj.startMine()
# Persist the discovered patterns to a text file.
obj.save('frequentPatterns.txt')

Frequent patterns were generated successfully using frequentPatternGrowth algorithm


# Visualization

In [80]:
import re
import plotly.express as px

# Load the mined patterns; each line of the file is treated as one pattern.
with open('frequentPatterns.txt') as pattern_file:
    patterns = list(pattern_file)

# For every pattern line, extract the text inside each "Point(...)" token, then
# keep the pattern holding the most points (the first one wins on ties; an
# empty list is used when there are no patterns at all).
point_lists = [re.findall(r'Point\((.*?)\)', line) for line in patterns]
max_pattern = max(point_lists, key=len, default=[])
max_length = len(max_pattern)

# Build a frame with one row per point of the longest pattern. Each coordinate
# string is split on a single space into longitude (first) and latitude (second).
df = pd.DataFrame(max_pattern, columns=['coordinate'])
lon_lat = df['coordinate'].str.split(' ', expand=True)
df[['Lon', 'Lat']] = lon_lat
df = df.astype({'Lon': float, 'Lat': float})

# Plot the points on an OpenStreetMap base layer with Plotly Express.
fig = px.scatter_mapbox(df, lat='Lat', lon='Lon', hover_name='coordinate')
fig.update_layout(mapbox_style='open-street-map')
fig.show()