In [19]:
# Libraries for Data Structures
import pandas as pd
import numpy as np

In [20]:
# Libraries for Creating Pipelines
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

In [21]:
# Libraries for Data Pre-processing and Processing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit

In [22]:
# Libraries For Model Formation
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import linear_model
# import xgboost as xgb
# from xgboost import XGBClassifier
# from sklearn.cluster import KMeans
# from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [23]:
# Libraries for Model Evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import accuracy_score, classification_report


# Dataset Customisation

In [24]:
# Importing Dataset from csv file
raw_data = pd.read_csv('../Datasets/dengue.csv')

# Gives total no.of rows and columns
raw_data.shape

(980, 10)

In [25]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gender     980 non-null    object
 1   Age        980 non-null    int64 
 2   NS1        980 non-null    int64 
 3   IgG        980 non-null    int64 
 4   IgM        980 non-null    int64 
 5   Area       980 non-null    object
 6   AreaType   980 non-null    object
 7   HouseType  980 non-null    object
 8   District   980 non-null    object
 9   Outcome    980 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 76.7+ KB


## ***DataSet Columns Defination***

```
1. Gender: Gender of the survey respondent.
2. Age: Age of the survey respondent.
3. NS1: NS1 test's data positive as 1 negative as 0.
4. IgG: IgG data positive as 1 negative as 0.
5. IgM: IgM data positive as 1 negative as 0.
6. Area: The area within the Dhaka region where the respondent resides.
7. Area Type: Classification of the area as "Developed" or "Undeveloped."
8. House Type: The type of housing in the area, e.g., "Building," "Tin-Shed," or "Others."
9. District: The specific district within Dhaka.
10. Outcome: Outcome variable indicating the presence (1) or absence (0) of Dengue cases reported.
    
```



# Feature Engineering

## Dropping Non-Required Featues

In [26]:
raw_data['AreaType'].value_counts()

AreaType
Developed      492
Undeveloped    488
Name: count, dtype: int64

In [27]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gender     980 non-null    object
 1   Age        980 non-null    int64 
 2   NS1        980 non-null    int64 
 3   IgG        980 non-null    int64 
 4   IgM        980 non-null    int64 
 5   Area       980 non-null    object
 6   AreaType   980 non-null    object
 7   HouseType  980 non-null    object
 8   District   980 non-null    object
 9   Outcome    980 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 76.7+ KB


## Converting Category into Numerical Values

In [28]:
# Converting Categorical Values to String
raw_data['Gender'] = pd.factorize(raw_data.Gender)[0]
raw_data.drop(['Area', 'District', 'AreaType', 'HouseType'], axis=1, inplace=True)

# raw_data = raw_data[top_corr_features]

# Converting Target Variables to Binary Target Values
raw_data['Outcome'].value_counts()

Outcome
1    527
0    453
Name: count, dtype: int64

In [29]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Gender   980 non-null    int64
 1   Age      980 non-null    int64
 2   NS1      980 non-null    int64
 3   IgG      980 non-null    int64
 4   IgM      980 non-null    int64
 5   Outcome  980 non-null    int64
dtypes: int64(6)
memory usage: 46.1 KB


# Dataset Pre-Processing and Processing

In [30]:
# Separate labels and features
X = raw_data.drop(['Outcome'], axis=1)
y = raw_data['Outcome']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_scaled)

# Data Augmentation
smote = SMOTE(random_state=42)
X_imputed_smote, y_smote = smote.fit_resample(X_imputed, y)

# Now, use StratifiedShuffleSplit for splitting the data
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.4, random_state=42)

# Split the data
for train_index, test_index in sss.split(X_imputed_smote, y_smote):
    X_train, X_test = X_imputed_smote[train_index], X_imputed_smote[test_index]
    y_train, y_test = y_smote.iloc[train_index], y_smote.iloc[test_index]

In [31]:
X

Unnamed: 0,Gender,Age,NS1,IgG,IgM
0,0,10,0,0,1
1,0,31,1,1,0
2,1,10,0,0,0
3,0,13,1,1,0
4,0,43,1,1,0
...,...,...,...,...,...
975,0,16,1,1,0
976,1,41,1,1,0
977,1,45,0,0,1
978,0,19,1,1,1


# Model Formations

In [32]:
# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [25, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
rf_grid.fit(X_train, y_train)
best_rf_model = rf_grid.best_estimator_

cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
print(f"{best_rf_model} CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Fit the model on the entire dataset
best_rf_model.fit(X_train, y_train)
y_pred = best_rf_model.predict(X_test)

print("-------------------------------------------------------")
# Training accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"{best_rf_model} Training Accuracy: {accuracy * 100:.2f}%")
print("-------------------------------------------------------")

# Classification report
print(f"{best_rf_model} Classification Report:\n{classification_report(y_test, y_pred)}")
print("-------------------------------------------------------")

RandomForestClassifier(n_estimators=25, random_state=42) CV Accuracy: 100.00%
-------------------------------------------------------
RandomForestClassifier(n_estimators=25, random_state=42) Training Accuracy: 100.00%
-------------------------------------------------------
RandomForestClassifier(n_estimators=25, random_state=42) Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       211
           1       1.00      1.00      1.00       211

    accuracy                           1.00       422
   macro avg       1.00      1.00      1.00       422
weighted avg       1.00      1.00      1.00       422

-------------------------------------------------------


In [33]:
import pickle


filename = "Dengue.sav"
pickle.dump(best_rf_model, open(filename, "wb"))