# INF-2600-1 24V Artificial Intelligence: Assignment 3 Pre-code
This code implements a Bayesian Network model for Analyzing Sensor Data for Weather Prediction using the pgmpy library in Python.

In [107]:
# Including the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pgmpy.estimators import ParameterEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from pgmpy.factors.discrete import TabularCPD
from pgmpy.estimators import MaximumLikelihoodEstimator

# Factoring the dataset

In [108]:
# Import data, make a copy of the original

df0 = pd.read_csv('precode/seattle-weather.csv')
dfc1 = df0.copy()
dfc1.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [109]:
# Get characteristics of dataset including columns with missing data as well:
dfc1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [110]:
# Checking the unique values in the 'weather' column
unique_fields = dfc1['weather'].unique()
print(unique_fields)

['drizzle' 'rain' 'sun' 'snow' 'fog']


In [111]:
dfc1.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [112]:
# Put categorical varaibles in a list
categorical_lst = ['date','weather']
# Create a seperate & smaller dataframe for categorical variables
dfc2a = pd.DataFrame(dfc1, columns=categorical_lst, copy=True)
dfc2a.head()

Unnamed: 0,date,weather
0,2012-01-01,drizzle
1,2012-01-02,rain
2,2012-01-03,rain
3,2012-01-04,rain
4,2012-01-05,rain


In [113]:
# Put all continuous variables into a list
continuous_lst = ['precipitation', 'temp_max', 'temp_min', 'wind']
# Create a seperate & smaller dataframe for our chosen variables. Use 'copy=True' so changes wont affect original
dfc2b = pd.DataFrame(dfc1, columns=continuous_lst, copy=True)
dfc2b.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1


### Create new dataframe

In [114]:
# Create new df with variables we want to work with:
new_cols = ['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather']

df = df0[new_cols]
# df.head()

In [115]:
# Let's show all columns with missing data as well:
df[df.isnull().any(axis=1)] # any missing data in columns
df.isnull().any()

date             False
precipitation    False
temp_max         False
temp_min         False
wind             False
weather          False
dtype: bool

# Creating Bayes Nets

In [116]:
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Discretization details
wind_bins = [-float('inf'), 3, 5, float('inf')]
labels = ['low', 'medium', 'high']

# Apply discretization
original_wind = df['wind'].copy()
df['wind'] = pd.cut(df['wind'], bins=wind_bins, labels=labels, include_lowest=True)

print(df.head())
# Define the Bayesian Network structure
weather_model = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

# Estimate parameters
estimator = MaximumLikelihoodEstimator(weather_model, df)
cpds = estimator.get_parameters()
weather_model.add_cpds(*cpds)

# Check model consistency
if not weather_model.check_model():
    raise ValueError("The model is inconsistent")

# Print CPDs to check for 'high' presence
#for cpd in weather_model.get_cpds():
#    print(cpd)

inference = VariableElimination(weather_model)

try:
    # Query using the 'high' category if available
    result = inference.query(variables=['weather'], evidence={'wind': 'high'})
    print(result)
except KeyError as e:
    print(f"Key error: {e}. It appears 'high' is not a valid category in the network CPDs.")
finally:
    df['wind'] = original_wind


         date  precipitation  temp_max  temp_min    wind  weather
0  2012-01-01            0.0      12.8       5.0  medium  drizzle
1  2012-01-02           10.9      10.6       2.8  medium     rain
2  2012-01-03            0.8      11.7       7.2     low     rain
3  2012-01-04           20.3      12.2       5.6  medium     rain
4  2012-01-05            1.3       8.9       2.8    high     rain
+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.0000 |
+------------------+----------------+
| weather(fog)     |         0.0287 |
+------------------+----------------+
| weather(rain)    |         0.6897 |
+------------------+----------------+
| weather(snow)    |         0.0632 |
+------------------+----------------+
| weather(sun)     |         0.2184 |
+------------------+----------------+


In [117]:

# Discretize columns
precipitation_bins = [0, 5, 11, float('inf')]
labels = ['low', 'medium', 'high']

# Apply discretization
original_precipitation = df['precipitation'].copy()
df['precipitation'] = pd.cut(df['precipitation'], bins=precipitation_bins, labels=labels, include_lowest=True)

weather_model = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

# Estimate parameters
estimator = MaximumLikelihoodEstimator(weather_model, df)
cpds = estimator.get_parameters()
weather_model.add_cpds(*cpds)

# Check model consistency
if not weather_model.check_model():
    raise ValueError("The model is inconsistent")

# Print CPDs to check for 'high' presence
#for cpd in weather_model.get_cpds():
#    print(cpd)

# Setup for inference
inference = VariableElimination(weather_model)

# Perform the query
try:
    # Query using the 'high' category if available
    result = inference.query(variables=['weather'], evidence={'wind': 'high'})
    print(result)
except KeyError as e:
    print(f"Key error: {e}. It appears 'high' is not a valid category in the network CPDs.")
finally:
    df['precipitation'] = original_precipitation
    
    print(df.head())

Key error: 'high'. It appears 'high' is not a valid category in the network CPDs.
         date  precipitation  temp_max  temp_min  wind  weather
0  2012-01-01            0.0      12.8       5.0   4.7  drizzle
1  2012-01-02           10.9      10.6       2.8   4.5     rain
2  2012-01-03            0.8      11.7       7.2   2.3     rain
3  2012-01-04           20.3      12.2       5.6   4.7     rain
4  2012-01-05            1.3       8.9       2.8   6.1     rain


In [118]:
# Creating tabular conditional probability distribution



In [119]:
# Add CPDs and factors to the model


# Check if model is consistent


In [120]:
# Viewing nodes of the model
weather_model.nodes()

NodeView(('weather', 'precipitation', 'wind', 'temp_max', 'temp_min'))

In [121]:
# Viewing edges of the model
weather_model.edges()

OutEdgeView([('weather', 'precipitation'), ('weather', 'wind'), ('precipitation', 'temp_max'), ('wind', 'temp_max'), ('wind', 'temp_min')])

In [122]:
# Independcies in the model

# Checking independcies of a particular node


# Task 1.2

In [123]:
from pgmpy.inference import VariableElimination

In [124]:
# Question 1: (a) What is the probability of high wind when the weather is sunny? (b) What is the probability of sunny weather when the wind is high?
# (a) Probability of high wind when the weather is sunny
prob_high_wind_given_sunny = inference.query(variables=['wind'], evidence={'weather': 'sun'})
print("Probability of high wind given sunny weather:")
print(prob_high_wind_given_sunny)

# (b) Probability of sunny weather when the wind is high
df['wind'] = pd.cut(df['wind'], bins=wind_bins, labels=labels, include_lowest=True)
print(df)

prob_sunny_given_high_wind = inference.query(variables=['weather'], evidence={'wind': 'high'})
print("Probability of sunny weather given high wind:")
print(prob_sunny_given_high_wind['weather'])
df['wind'] = original_wind



Probability of high wind given sunny weather:
+-----------+-------------+
| wind      |   phi(wind) |
| wind(0.4) |      0.0016 |
+-----------+-------------+
| wind(0.5) |      0.0031 |
+-----------+-------------+
| wind(0.6) |      0.0031 |
+-----------+-------------+
| wind(0.7) |      0.0016 |
+-----------+-------------+
| wind(0.8) |      0.0047 |
+-----------+-------------+
| wind(0.9) |      0.0031 |
+-----------+-------------+
| wind(1.0) |      0.0063 |
+-----------+-------------+
| wind(1.1) |      0.0047 |
+-----------+-------------+
| wind(1.2) |      0.0078 |
+-----------+-------------+
| wind(1.3) |      0.0203 |
+-----------+-------------+
| wind(1.4) |      0.0094 |
+-----------+-------------+
| wind(1.5) |      0.0156 |
+-----------+-------------+
| wind(1.6) |      0.0141 |
+-----------+-------------+
| wind(1.7) |      0.0234 |
+-----------+-------------+
| wind(1.8) |      0.0219 |
+-----------+-------------+
| wind(1.9) |      0.0281 |
+-----------+-------------+
| 

KeyError: 'high'

In [125]:
# Question 2:
# (a) Calculate all the possible joint probability and determine the best probable condition. Explain your results?
joint_prob = inference.query(variables=['weather', 'precipitation', 'wind', 'temp_max', 'temp_min'], joint=True)
# print(joint_prob) # wont work, output to big

# (b) What is the most probable condition for precipitation, wind and weather, combined?
print(joint_prob.maximize([]))

None


In [126]:
# Question 3. Find the probability associated with each weather, given that the precipitation is medium? Explain your result.
result_q3 = inference.query(variables=['weather'], evidence={'precipitation': 'medium'})
print(result_q3)



+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.0000 |
+------------------+----------------+
| weather(fog)     |         0.0000 |
+------------------+----------------+
| weather(rain)    |         0.9389 |
+------------------+----------------+
| weather(snow)    |         0.0611 |
+------------------+----------------+
| weather(sun)     |         0.0000 |
+------------------+----------------+


In [127]:
# Query the network for weather conditions with 'precipitation = medium' and 'wind = low'
df['precipitation'] = pd.cut(df['precipitation'], bins=precipitation_bins, labels=labels, include_lowest=True)
df['wind'] = pd.cut(df['wind'], bins=wind_bins, labels=labels, include_lowest=True)

try:
    # Query for 'precipitation' = 'medium'
    weather_given_precip_med = inference.query(
        variables=['weather'],
        evidence={'precipitation': 'medium'}
    )
    print(weather_given_precip_med)
finally:
    df['precipitation'] = original_precipitation
    
try:
    # Query for 'wind' = 'low'
    weather_given_wind_low = inference.query(
        variables=['weather'],
        evidence={'wind': 'low'}
    )
    print(weather_given_wind_low)
except Exception as e:
    print(f'failed: {e}')
finally:
    df['wind'] = original_wind

TypeError: '<' not supported between instances of 'float' and 'str'

# Task 1.3 - Approximate Inference

## Likelihood Weighted Sample

Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence.

In [None]:
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling


In [None]:
# Repeat Q.1. (a) of Task 1.2 - What is the probability of high wind when the weather is sunny?
# (a) Likelihood Weighted Sampling for high wind given sunny weather
sampler = BayesianModelSampling(weather_model)

#lw_samples = sampler.likelihood_weighted_sample(evidence=[State('weather', 'sun')], size=1000)
#high_wind_given_sunny = lw_samples[lw_samples['wind'] == 'high'].shape[0] / lw_samples.shape[0]
#print("P(High Wind | Sunny) using Likelihood Weighted Sampling:", high_wind_given_sunny)


In [128]:
# Repeat Q.1. (b) of Task 1.2 - What is the probability of sunny weather when the wind is high?
sampler = BayesianModelSampling(weather_model)
df['wind'] = pd.cut(df['wind'], bins=wind_bins, labels=labels, include_lowest=True)


# Probability of high wind when the weather is sunny
lw_samples = sampler.likelihood_weighted_sample(evidence=[State('weather', 'sun')], size=1000)
high_wind_given_sunny = lw_samples[lw_samples['wind'] == 'high'].shape[0] / lw_samples.shape[0]
print("P(High Wind | Sunny) using Likelihood Weighted Sampling:", high_wind_given_sunny)



TypeError: '<' not supported between instances of 'float' and 'str'

## Rejection Sampling

In [129]:
# Repeat Q.2 . (a) of Task 1.2 - Calculate all the possible joint probability and determine the best probable condition. Explain your results?



In [130]:
# Repeat Q.2 . (b) of Task 1.2 - What is the most probable condition for precipitation, wind and weather, combined?



## Approx Inference

In [131]:
from pgmpy.inference import ApproxInference

In [132]:
# Repeat Q.3 of Task 1.2 - Find the probability associated with each weather, given that the precipitation is medium? Explain your result.



# Normal Sampling

In [133]:
# Repeat Q.4 of Task 1.2 - What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?



# Other hierarchies


In [134]:
weather_model_h1 = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('precipitation', 'temp_min'),
    ('wind', 'temp_min'),
    ('wind', 'temp_max')
])

weather_model_h2 = BayesianNetwork([
    ('weather', 'wind'),
    ('wind', 'precipitation'),
    ('precipitation', 'temp_max'),
    ('precipitation', 'temp_min')
])

