# INF-2600-1 24V Artificial Intelligence: Assignment 3 Pre-code
This code implements a Bayesian Network model for Analyzing Sensor Data for Weather Prediction using the pgmpy library in Python.


### BUILDING THE STRUCTURE OF BAYESIAN NETWORK: Using PgmPy
#### Install Package: `!pip install pgmpy`

In [18]:
# Including the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
from pgmpy.inference import VariableElimination
import networkx as nx

# Factoring the dataset

In [19]:
# Import data, make a copy of the original

df0 = pd.read_csv('precode/seattle-weather.csv')
dfc1 = df0.copy()
dfc1.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [20]:
# Get characteristics of dataset including columns with missing data as well:
dfc1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [21]:
# Checking the unique values in the 'weather' column
unique_fields = dfc1['weather'].unique()
print(unique_fields)

['drizzle' 'rain' 'sun' 'snow' 'fog']


In [22]:
dfc1.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [23]:
# Put categorical varaibles in a list
categorical_lst = ['date','weather']
# Create a seperate & smaller dataframe for categorical variables
dfc2a = pd.DataFrame(dfc1, columns=categorical_lst, copy=True)
dfc2a.head()

Unnamed: 0,date,weather
0,2012-01-01,drizzle
1,2012-01-02,rain
2,2012-01-03,rain
3,2012-01-04,rain
4,2012-01-05,rain


In [24]:
# Put all continuous variables into a list
continuous_lst = ['precipitation', 'temp_max', 'temp_min', 'wind']
# Create a seperate & smaller dataframe for our chosen variables. Use 'copy=True' so changes wont affect original
dfc2b = pd.DataFrame(dfc1, columns=continuous_lst, copy=True)
dfc2b.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1


### Create new dataframe

In [25]:
# Create new df with variables we want to work with:
new_cols = ['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather']

df = df0[new_cols]
# df.head()

In [26]:
# Let's show all columns with missing data as well:
df[df.isnull().any(axis=1)] # any missing data in columns
df.isnull().any()

date             False
precipitation    False
temp_max         False
temp_min         False
wind             False
weather          False
dtype: bool

In [27]:
num_stdv = 1

# Define the labels dictionary
labels = {

}

# Create bounds for continuous labels


df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


# Creating Bayes Nets

In [28]:
# Define the hierarchy
weather_model_h1 = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

#weather_model.add_nodes_from(['a', 'b'])

weather_model_h1.fit(df, estimator=MaximumLikelihoodEstimator)

infer = VariableElimination(weather_model_h1)
#prob_weather_max_temp = infer.query(variables=['weather'], evidence={'temp_max': 10})
#prob_weather_min_temp = infer.query(variables=['weather'], evidence={'temp_min': -3.3})
#prob_high_wind_given_sunny = infer.query(variables=['wind'],evidence={'weather': 'sun'},joint=False)
prob_sunny_given_high_wind = infer.query(variables=['weather'], evidence={'wind': 5})

print(prob_sunny_given_high_wind)

#nx.draw(weather_model, with_labels=True)
#plt.show()


+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.0000 |
+------------------+----------------+
| weather(fog)     |         0.0556 |
+------------------+----------------+
| weather(rain)    |         0.7778 |
+------------------+----------------+
| weather(snow)    |         0.1111 |
+------------------+----------------+
| weather(sun)     |         0.0556 |
+------------------+----------------+


In [29]:
# Create clasifiers

#df['wind_category'] = pd.qcut(df['wind'], q=3, labels=['low', 'medium', 'high'])

print(df)



            date  precipitation  temp_max  temp_min  wind  weather
0     2012-01-01            0.0      12.8       5.0   4.7  drizzle
1     2012-01-02           10.9      10.6       2.8   4.5     rain
2     2012-01-03            0.8      11.7       7.2   2.3     rain
3     2012-01-04           20.3      12.2       5.6   4.7     rain
4     2012-01-05            1.3       8.9       2.8   6.1     rain
...          ...            ...       ...       ...   ...      ...
1456  2015-12-27            8.6       4.4       1.7   2.9     rain
1457  2015-12-28            1.5       5.0       1.7   1.3     rain
1458  2015-12-29            0.0       7.2       0.6   2.6      fog
1459  2015-12-30            0.0       5.6      -1.0   3.4      sun
1460  2015-12-31            0.0       5.6      -2.1   3.5      sun

[1461 rows x 6 columns]


In [30]:
weather_model_h1 = BayesianNetwork([
    ('weather', 'precipitation'),
    ('weather', 'wind'),
    ('precipitation', 'temp_max'),
    ('wind', 'temp_max'),
    ('wind', 'temp_min')
])

weather_model_h1.fit(df, estimator=MaximumLikelihoodEstimator)

infer = VariableElimination(weather_model_h1)
#prob_weather_max_temp = infer.query(variables=['weather'], evidence={'temp_max': 10})
#prob_weather_min_temp = infer.query(variables=['weather'], evidence={'temp_min': -3.3})
#prob_high_wind_given_sunny = infer.query(variables=['wind'],evidence={'weather': 'sun'},joint=False)
#prob_sunny_given_high_wind = infer.query(variables=['weather'], evidence={'wind': 5})

joint_probabilities = infer.query(
    variables=['weather', 'precipitation', 'wind'],
    joint=True
)



In [31]:
# Perform the query for the joint probability distribution
joint_probabilities = infer.query(variables=['weather', 'precipitation', 'wind'])

# Extract the assignment and probabilities from the joint distribution
probabilities = joint_probabilities.values
state_names = joint_probabilities.state_names

# Find the index of the maximum probability
max_prob_index = probabilities.argmax()

# Convert flat index to multi-dimensional index
# This step is necessary because the joint probability table is multi-dimensional
max_prob_multi_index = np.unravel_index(max_prob_index, probabilities.shape)

# Extract the states that correspond to this maximum probability using the multi-dimensional index
max_prob_states = {
    variable: state_names[variable][index]
    for variable, index in zip(joint_probabilities.variables, max_prob_multi_index)
}

# Extract the maximum probability value
max_probability = probabilities.flatten()[max_prob_index]

print("The most probable condition and its probability are:")
print(max_prob_states)
print(f"Probability: {max_probability}")


The most probable condition and its probability are:
{'weather': 'sun', 'precipitation': 0.0, 'wind': 2.6}
Probability: 0.02532511978097193


In [32]:
weather_probabilities_given_medium_precipitation = infer.query(
    variables=['weather'],
    evidence={'precipitation': 0.8}
)

print(weather_probabilities_given_medium_precipitation)


+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.0000 |
+------------------+----------------+
| weather(fog)     |         0.0000 |
+------------------+----------------+
| weather(rain)    |         0.9565 |
+------------------+----------------+
| weather(snow)    |         0.0435 |
+------------------+----------------+
| weather(sun)     |         0.0000 |
+------------------+----------------+


In [33]:
# Calculate Probabilities

# Weather does not have any parents so all we need are the marginal probabilities of observing each weather type

# Joint Propabilities
# Create dict where key=parent, value=child
var_dict = {

            }

# Create conditional distributions and store results in a list
cpd_lst = []
for key, value in var_dict.items():
    ### Define yourself
    print(key)

# Note that we get 3 Nan values in the above conditional distributions. This is because one of the type of precipitation (low) did not contain any relation with temp_max.
# Therefore, normalization, does not produce the intended result.
# To mitigate this, we replace Nan with the equal probability within the three values, i.e., 0.33
cpd_lst[2][:,0] = .33

cpd_lst

IndexError: list index out of range

In [34]:
# Creating tabular conditional probability distribution



In [35]:
# Add CPDs and factors to the model


# Check if model is consistent


In [36]:
# Viewing nodes of the model
weather_model_h1.nodes()

NodeView(('weather', 'precipitation', 'wind', 'temp_max', 'temp_min'))

In [37]:
# Viewing edges of the model
weather_model_h1.edges()

OutEdgeView([('weather', 'precipitation'), ('weather', 'wind'), ('precipitation', 'temp_max'), ('wind', 'temp_max'), ('wind', 'temp_min')])

In [38]:
# Print the probability table of the weather node
print(weather_cpd)

# Print the probability table of the wind node
print(wind_cpd)

NameError: name 'weather_cpd' is not defined

In [39]:
# Independcies in the model

# Checking independcies of a particular node


# Task 1.2

In [40]:
from pgmpy.inference import VariableElimination

In [41]:
# Question 1: (a) What is the probability of high wind when the weather is sunny? (b) What is the probability of sunny weather when the wind is high?


In [42]:
# Question 2:
# (a) Calculate all the possible joint probability and determine the best probable condition. Explain your results?

# (b) What is the most probable condition for precipitation, wind and weather, combined?


In [43]:
# Question 3. Find the probability associated with each weather, given that the precipitation is medium? Explain your result.



In [44]:
# Question 4. What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?



# Task 1.3 - Approximate Inference

## Likelihood Weighted Sample

Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence.

In [45]:
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling

In [46]:
# Repeat Q.1. (a) of Task 1.2 - What is the probability of high wind when the weather is sunny?



In [47]:
# Repeat Q.1. (b) of Task 1.2 - What is the probability of sunny weather when the wind is high?



## Rejection Sampling

In [48]:
# Repeat Q.2 . (a) of Task 1.2 - Calculate all the possible joint probability and determine the best probable condition. Explain your results?



In [49]:
# Repeat Q.2 . (b) of Task 1.2 - What is the most probable condition for precipitation, wind and weather, combined?



## Approx Inference

In [50]:
from pgmpy.inference import ApproxInference

In [51]:
# Repeat Q.3 of Task 1.2 - Find the probability associated with each weather, given that the precipitation is medium? Explain your result.



# Normal Sampling

In [52]:
# Repeat Q.4 of Task 1.2 - What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?

