In [1]:
import pandas as pd

# Load the dataset
file_path = 'MY.DENGUE.csv'
dengue_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
dengue_data.head()


Unnamed: 0,ConditionName,ConditionSNOMED,PathogenName,PathogenTaxonID,Fatalities,CountryName,CountryISO,Admin1Name,Weeks,PeriodStartDate,PeriodEndDate,CountValue
0,Dengue,38362002,Dengue virus,12637,0,MALAYSIA,MY,JOHOR,1,1/1/2010,7/1/2010,2
1,Dengue,38362002,Dengue virus,12637,0,MALAYSIA,MY,JOHOR,2,8/1/2010,14/1/2010,5
2,Dengue,38362002,Dengue virus,12637,0,MALAYSIA,MY,JOHOR,3,15/1/2010,21/1/2010,4
3,Dengue,38362002,Dengue virus,12637,0,MALAYSIA,MY,JOHOR,4,22/1/2010,28/1/2010,6
4,Dengue,38362002,Dengue virus,12637,0,MALAYSIA,MY,JOHOR,5,29/1/2010,4/2/2010,7


In [3]:
# Check for missing values in the dataset
missing_values = dengue_data.isnull().sum()

# Feature Engineering: Convert dates to a more usable format
# We'll convert 'PeriodStartDate' to a datetime object and extract the year and month
# This assumes that the year and month are more relevant than the exact date for the prediction

dengue_data['PeriodStartDate'] = pd.to_datetime(dengue_data['PeriodStartDate'], format='%d/%m/%Y')
dengue_data['Year'] = dengue_data['PeriodStartDate'].dt.year
dengue_data['Month'] = dengue_data['PeriodStartDate'].dt.month

# Drop columns that may not be relevant or are redundant
# 'ConditionName', 'ConditionSNOMED', 'PathogenName', 'PathogenTaxonID', 'CountryName', 'CountryISO', 'PeriodEndDate'
# are dropped as they do not vary or are not relevant for the prediction

dengue_data_cleaned = dengue_data.drop(['ConditionName', 'ConditionSNOMED', 'PathogenName', 'PathogenTaxonID', 
                                        'CountryName', 'CountryISO', 'PeriodEndDate'], axis=1)

# Display the missing values and the first few rows of the updated dataset
missing_values, dengue_data_cleaned.head()


(ConditionName      0
 ConditionSNOMED    0
 PathogenName       0
 PathogenTaxonID    0
 Fatalities         0
 CountryName        0
 CountryISO         0
 Admin1Name         0
 Weeks              0
 PeriodStartDate    0
 PeriodEndDate      0
 CountValue         0
 dtype: int64,
    Fatalities Admin1Name  Weeks PeriodStartDate  CountValue  Year  Month
 0           0      JOHOR      1      2010-01-01           2  2010      1
 1           0      JOHOR      2      2010-01-08           5  2010      1
 2           0      JOHOR      3      2010-01-15           4  2010      1
 3           0      JOHOR      4      2010-01-22           6  2010      1
 4           0      JOHOR      5      2010-01-29           7  2010      1)

In [4]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# One-Hot Encoding for 'Admin1Name'
one_hot_encoder = OneHotEncoder(sparse=False)
admin1_encoded = one_hot_encoder.fit_transform(dengue_data_cleaned[['Admin1Name']])
admin1_encoded_df = pd.DataFrame(admin1_encoded, columns=one_hot_encoder.get_feature_names_out(['Admin1Name']))

# Normalization for numerical features
scaler = MinMaxScaler()
numerical_features = ['Weeks', 'Year', 'Month', 'Fatalities']
dengue_data_cleaned[numerical_features] = scaler.fit_transform(dengue_data_cleaned[numerical_features])

# Combine the one-hot encoded and normalized features
processed_data = pd.concat([dengue_data_cleaned.drop(['Admin1Name'], axis=1), admin1_encoded_df], axis=1)

# Display the first few rows of the processed dataset
processed_data.head()




Unnamed: 0,Fatalities,Weeks,PeriodStartDate,CountValue,Year,Month,Admin1Name_JOHOR,Admin1Name_KEDAH,Admin1Name_KELANTAN,Admin1Name_MELAKA,...,Admin1Name_PAHANG,Admin1Name_PERAK,Admin1Name_PERLIS,Admin1Name_PULAU PINANG,Admin1Name_SABAH,Admin1Name_SARAWAK,Admin1Name_SELANGOR,Admin1Name_TERENGGANU,Admin1Name_WP KUALA LUMPUR,Admin1Name_WP LABUAN
0,0.0,0.0,2010-01-01,2,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.019608,2010-01-08,5,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.039216,2010-01-15,4,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.058824,2010-01-22,6,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.078431,2010-01-29,7,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.model_selection import train_test_split

# Separate the features and the target variable
X = processed_data.drop(['CountValue', 'PeriodStartDate'], axis=1)  # Dropping 'PeriodStartDate' as we have extracted Year and Month
y = processed_data['CountValue']

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the training and testing sets
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


((7488, 19), (1872, 19), (7488,), (1872,))

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the ANN model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dense(32, activation='relu'))  # Another hidden layer
model.add(Dense(1, activation='linear'))  # Output layer for regression

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# Evaluate the model
loss = model.evaluate(X_test, y_test, verbose=0)
print(f'Model Loss on Test Data: {loss}')

# Saving the model
model.save('my_model.h5')

NameError: name 'X_train' is not defined

In [3]:
import tkinter as tk
from tkinter import ttk
from keras.models import load_model
import numpy as np

# Load your trained model (replace 'your_model.h5' with the path to your model)
model = load_model('fyp4.ipynb')

def predict_dengue_cases():
    # Retrieve user inputs
    state = state_var.get()
    week = int(week_var.get())
    year = int(year_var.get())
    # ... (other inputs as necessary)

    # Process inputs (One-hot encoding for state, scaling for numerical inputs)
    # Note: Ensure the input processing here matches the preprocessing done during training
    # processed_input = ...

    # Make prediction
    prediction = model.predict(np.array([processed_input]))
    prediction_label.config(text=f'Predicted Dengue Cases: {prediction[0]}')

# Initialize main window
root = tk.Tk()
root.title("Dengue Outbreak Prediction")
root.geometry("400x300")

# User input widgets
state_var = tk.StringVar()
state_label = tk.Label(root, text="State")
state_label.pack()
state_entry = ttk.Combobox(root, textvariable=state_var)
state_entry['values'] = ('State1', 'State2', 'State3')  # Replace with actual state names
state_entry.pack()

week_var = tk.StringVar()
week_label = tk.Label(root, text="Week")
week_label.pack()
week_entry = tk.Entry(root, textvariable=week_var)
week_entry.pack()

year_var = tk.StringVar()
year_label = tk.Label(root, text="Year")
year_label.pack()
year_entry = tk.Entry(root, textvariable=year_var)
year_entry.pack()

# Prediction result
prediction_label = tk.Label(root, text="Predicted Dengue Cases: ")
prediction_label.pack()

# Predict button
predict_button = tk.Button(root, text="Predict", command=predict_dengue_cases)
predict_button.pack()

# Run the application
root.mainloop()


OSError: Unable to synchronously open file (file signature not found)