# INF-2600-1 24V Artificial Intelligence: Assignment 3 Pre-code
This code implements a Bayesian Network model for Analyzing Sensor Data for Weather Prediction using the pgmpy library in Python.

In [544]:
# Including the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from pgmpy.factors.discrete import TabularCPD
from pgmpy.estimators import MaximumLikelihoodEstimator

# Factoring the dataset

In [545]:
# Import data, make a copy of the original

df0 = pd.read_csv('precode/seattle-weather.csv')
dfc1 = df0.copy()
dfc1.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [546]:
# Get characteristics of dataset including columns with missing data as well:
dfc1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [547]:
# Checking the unique values in the 'weather' column
unique_fields = dfc1['weather'].unique()
print(unique_fields)

['drizzle' 'rain' 'sun' 'snow' 'fog']


In [548]:
dfc1.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [549]:
# Put categorical varaibles in a list
categorical_lst = ['date','weather']
# Create a seperate & smaller dataframe for categorical variables
dfc2a = pd.DataFrame(dfc1, columns=categorical_lst, copy=True)
dfc2a.head()

Unnamed: 0,date,weather
0,2012-01-01,drizzle
1,2012-01-02,rain
2,2012-01-03,rain
3,2012-01-04,rain
4,2012-01-05,rain


In [550]:
# Put all continuous variables into a list
continuous_lst = ['precipitation', 'temp_max', 'temp_min', 'wind']
# Create a seperate & smaller dataframe for our chosen variables. Use 'copy=True' so changes wont affect original
dfc2b = pd.DataFrame(dfc1, columns=continuous_lst, copy=True)
dfc2b.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1


### Create new dataframe

In [551]:
# Create new df with variables we want to work with:
new_cols = ['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather']

df = df0[new_cols]
# df.head()

In [552]:
# Let's show all columns with missing data as well:
df[df.isnull().any(axis=1)] # any missing data in columns
df.isnull().any()

date             False
precipitation    False
temp_max         False
temp_min         False
wind             False
weather          False
dtype: bool

In [553]:
num_stdv = 1

# Define the labels dictionary
labels = {

}

# Create bounds for continuous labels


df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [554]:
def discretize_column(df, column_name, bins, labels, restore=False, original_data=None):
    """
    Discretize or restore a specified column in a pandas DataFrame.
    
    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column to discretize.
    column_name (str): The name of the column to discretize.
    bins (list of numbers): The bin edges for discretization.
    labels (list of str): The labels for the discrete bins.
    restore (bool): If True, restore the column to its original state.
    original_data (pandas.Series, optional): The original column data for restoration.
    
    Returns:
    pandas.DataFrame: The DataFrame with the discretized or restored column.
    pandas.Series (optional): The original column data if discretized for the first time.
    """
    
    if restore:
        if original_data is not None and column_name in df:
            df[column_name] = original_data
        else:
            raise ValueError("Original data is not provided for restoration.")
    else:
        if column_name in df:
            original_data = df[column_name].copy()
            df[column_name] = pd.cut(df[column_name], bins=bins, labels=labels, include_lowest=True)
        else:
            raise ValueError(f"Column {column_name} does not exist in the DataFrame.")
    
    return df, original_data

# Creating Bayes Nets

In [555]:
# Define the hierarchy
weather_model = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

estimator = MaximumLikelihoodEstimator(weather_model, df)
cpds = estimator.get_parameters()
weather_model.add_cpds(*cpds)

assert weather_model.check_model(), "This model is inconsistent"

#for cpd in weather_model.get_cpds():
#    print(cpd)
    
inference = VariableElimination(weather_model)


df, original_precipitation = discretize_column(
    df=df,
    column_name='wind',
    bins=[-float('inf'), 5, 15, float('inf')],
    labels=['low', 'medium', 'high']
)

try:
    result = inference.query(variables=['weather'],evidence={'precipitation': 'high'})
    print(result)
except KeyError as e:
    print(f'key error: {e}, no bueno key')
finally:
    df, _ = discretize_column(
     df=df,
     column_name='wind',
     bins=None,
     labels=None,
     restore=True,
     original_data=original_precipitation
    )
    

key error: 'high', no bueno key


In [556]:
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Assume df is already loaded and structured as mentioned previously

# Discretization details
bins = [-float('inf'), 3, 5, float('inf')]
labels = ['low', 'medium', 'high']

# Apply discretization
df['wind'] = pd.cut(df['wind'], bins=bins, labels=labels, include_lowest=True)
print("Discretized wind column categories:", df['wind'].cat.categories)
print("Counts per category:", df['wind'].value_counts())

# Define the Bayesian Network structure
weather_model = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

# Estimate parameters
estimator = MaximumLikelihoodEstimator(weather_model, df)
cpds = estimator.get_parameters()
weather_model.add_cpds(*cpds)

# Check model consistency
if not weather_model.check_model():
    raise ValueError("The model is inconsistent")

# Print CPDs to check for 'high' presence
for cpd in weather_model.get_cpds():
    print(cpd)

# Setup for inference
inference = VariableElimination(weather_model)

# Perform the query
try:
    # Query using the 'high' category if available
    result = inference.query(variables=['weather'], evidence={'wind': 'high'})
    print(result)
except KeyError as e:
    print(f"Key error: {e}. It appears 'high' is not a valid category in the network CPDs.")


Discretized wind column categories: Index(['low', 'medium', 'high'], dtype='object')
Counts per category: wind
low       788
medium    499
high      174
Name: count, dtype: int64
+------------------+-----------+
| weather(drizzle) | 0.0362765 |
+------------------+-----------+
| weather(fog)     | 0.0691307 |
+------------------+-----------+
| weather(rain)    | 0.438741  |
+------------------+-----------+
| weather(snow)    | 0.017796  |
+------------------+-----------+
| weather(sun)     | 0.438056  |
+------------------+-----------+
+---------------------+-----+--------------+
| weather             | ... | weather(sun) |
+---------------------+-----+--------------+
| precipitation(0.0)  | ... | 1.0          |
+---------------------+-----+--------------+
| precipitation(0.3)  | ... | 0.0          |
+---------------------+-----+--------------+
| precipitation(0.5)  | ... | 0.0          |
+---------------------+-----+--------------+
| precipitation(0.8)  | ... | 0.0          |
+-------

In [557]:
# Creating tabular conditional probability distribution



In [558]:
# Add CPDs and factors to the model


# Check if model is consistent


In [559]:
# Viewing nodes of the model
weather_model.nodes()

NodeView(('weather', 'precipitation', 'wind', 'temp_max', 'temp_min'))

In [560]:
# Viewing edges of the model
weather_model.edges()

OutEdgeView([('weather', 'precipitation'), ('weather', 'wind'), ('precipitation', 'temp_max'), ('wind', 'temp_max'), ('wind', 'temp_min')])

In [561]:
# Independcies in the model

# Checking independcies of a particular node


# Task 1.2

In [562]:
from pgmpy.inference import VariableElimination

In [563]:
# Question 1: (a) What is the probability of high wind when the weather is sunny? (b) What is the probability of sunny weather when the wind is high?
result_high_wind_sunny = inference.query(variables=['wind'], evidence={'weather': 'sun'})

df, original_wind = discretize_column(
    df=df,
    column_name='wind',
    bins=[-float('inf'), 5, 15, float('inf')],
    labels=['low', 'medium', 'high']
)

df.head()


result_sunny_high_wind = inference.query(variables=['weather'], evidence={'wind': 'high'})
#  
# df, _ = discretize_column(
#     df=df,
#     column_name='wind',
#     bins=None,
#     labels=None,
#     restore=True,
#     original_data=original_wind
# )
# 
# print(result_sunny_high_wind)


TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
# Question 2:
# (a) Calculate all the possible joint probability and determine the best probable condition. Explain your results?

# (b) What is the most probable condition for precipitation, wind and weather, combined?


In [564]:
# Question 3. Find the probability associated with each weather, given that the precipitation is medium? Explain your result.



In [565]:
# Question 4. What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?



# Task 1.3 - Approximate Inference

## Likelihood Weighted Sample

Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence.

In [566]:
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling

In [567]:
# Repeat Q.1. (a) of Task 1.2 - What is the probability of high wind when the weather is sunny?



In [568]:
# Repeat Q.1. (b) of Task 1.2 - What is the probability of sunny weather when the wind is high?



## Rejection Sampling

In [569]:
# Repeat Q.2 . (a) of Task 1.2 - Calculate all the possible joint probability and determine the best probable condition. Explain your results?



In [570]:
# Repeat Q.2 . (b) of Task 1.2 - What is the most probable condition for precipitation, wind and weather, combined?



## Approx Inference

In [571]:
from pgmpy.inference import ApproxInference

In [572]:
# Repeat Q.3 of Task 1.2 - Find the probability associated with each weather, given that the precipitation is medium? Explain your result.



# Normal Sampling

In [573]:
# Repeat Q.4 of Task 1.2 - What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?

