# Dengue Disease Detection


In [None]:
import pandas as pd
import seaborn as sns

## Reading data

In [None]:
# Load the dengue dataset into a DataFrame.
# The DENGUE_DATASET environment variable, when set, overrides the original
# hard-coded location so the notebook can run on another machine; when it is
# unset the original path is used unchanged.
import os

data = pd.read_csv(
    os.environ.get(
        "DENGUE_DATASET",
        "C:/Users/sanab/Documents/clg_pro_dengu_detection/code/dataset.csv",
    )
)

## top 5 rows

In [None]:
# Show the first rows of the freshly loaded DataFrame (head() defaults to 5).
data.head()

In [None]:
# Remove the 'id' column from the DataFrame in place.
data.drop(columns=['id'], inplace=True)

In [None]:
# Lookup table that turns the categorical answers into integer codes.
get = {
    'no': 0,
    'low': 0,
    'yes': 1,
    'medium': 1,
    'high': 2,
}


In [None]:
# Display the column labels of the DataFrame.
data.columns

## data cleaning

In [None]:
# Replace the categorical answers of each listed column with the integer codes
# from the `get` lookup table. Values that are not keys of `get` become NaN
# (pandas Series.map semantics).
for column_name in (
    'vomiting',
    'nausea',
    'vomiting_blood',
    'body_pains',
    'pain_behind_eyes',
    'joint_pains',
    'chill',
    'headache',
    'swollen_glands',
    'rashes',
    'abdominal_pain',
    'ble_nose',
    'ble_mouth',
    'fatigue',
    'red_eyes',
    'dengue',
):
    data[column_name] = data[column_name].map(get)

In [None]:
# Keep only the first 225 rows (positional slice).
data = data.iloc[:225]

In [None]:
# Show the first rows again, now with the mapped integer codes.
data.head()

In [None]:

# Boolean mask of the rows whose 'dengue' value is 1.
is_positive = data['dengue'] == 1

# Negative and positive subsets, reused by the histogram cell.
data_n = data[data['dengue'] == 0]
data_y = data[is_positive]

# Share of rows labelled 1, reported as a percentage.
c = int(is_positive.sum())
score = c / data.shape[0]
print('the bechmark model accuarcy score  {}%'.format(score * 100))

## data visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:

# One figure holding a 4x4 grid of histograms, one subplot per column.
fig = plt.figure()

# Overlay the dengue-negative and dengue-positive distributions of each of the
# first 16 columns. Subplot positions are 1-based, hence start=1.
for position, column in enumerate(data.columns[0:16], start=1):
    ax = fig.add_subplot(4, 4, position)
    for subset, legend_label, colour in (
        (data_n, 'Negative', 'g'),
        (data_y, 'Positive', 'r'),
    ):
        ax.hist(subset[column], label=legend_label, stacked=True, alpha=0.5, color=colour)
    ax.set_title(column)

plt.tight_layout()
plt.show()

## heat map generation

In [None]:
# Open a 20x10 figure for the heatmap.
plt.figure(figsize=(20, 10))
# Draw the pairwise column correlations, printing each coefficient in its cell.
sns.heatmap(data=data.corr(), annot=True)


## Data Modeling

In [None]:
## Split the data into a 75% training set and a 25% test set

from sklearn.model_selection import train_test_split

# Features are every column except the 'dengue' target.
new_data = data.drop(columns=['dengue'])

# test_size is left at its default, which holds out 25% of the rows;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    new_data,
    data['dengue'],
    random_state=7,
)

In [None]:
# The data is preprocessed using sklearn.preprocessing.Normalizer
from sklearn.preprocessing import Normalizer

# Normalizer rescales every sample (row) to unit norm. It is stateless, so
# fit_transform and transform return the same values; fit_transform is used on
# the training set because parameter validation is only performed in fit.
# NOTE(review): row-wise scaling lets the largest-magnitude column dominate
# each row; a column-wise scaler may be what was intended - confirm.
norm = Normalizer()
X_train_normal = norm.fit_transform(X_train)
X_test_normal = norm.transform(X_test)

In [None]:
# Print the normalised test-set feature matrix.
print(X_test_normal)

### Hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


# Candidate hyperparameter grids, keyed by estimator constructor argument.
RF_params = dict(n_estimators=[10, 50, 100])
DTC_params = dict(criterion=['entropy'], max_depth=[10, 50, 100])
LR_params = dict(C=[0.001, 0.1, 1, 10, 100])

In [None]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
from sklearn.model_selection import GridSearchCV
# (name, estimator, parameter grid) triples consumed by the cross-validation loop.
models = [
    ('DTC', DecisionTreeClassifier(), DTC_params),
    ('LR', LogisticRegression(), LR_params),
]

In [None]:
from tqdm import tqdm
# Per-model arrays of fold accuracies and the matching model names; both are
# consumed by the box-plot cell.
results = []
names = []
scoring = 'accuracy'

# KFold's first argument is n_splits. Passing the number of samples turned
# this into leave-one-out: every fold score is then 0 or 1, and the nested grid
# search is refitted once per sample. Ten shuffled folds give a usable accuracy
# distribution per model.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

for name, model, params in tqdm(models):
    # Nested CV: the grid search tunes inside each outer training fold.
    model_grid = GridSearchCV(model, params)
    cv_results = cross_val_score(model_grid, X_train_normal, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "Cross Validation Accuracy %s: Accarcy: %f SD: %f" % (name, cv_results.mean(), cv_results.std())
    print(msg)

### White Grid Plot

In [None]:
# The seaborn style only affects axes created after it is set, so it must be
# applied before the box plot is drawn.
sns.set_style("whitegrid")

# One box per model, built from its cross-validation fold accuracies.
plt.boxplot(results, labels=names)
plt.title('Dengue Diagnosis Performance using Machine Learning ')
plt.ylabel('Model Accuracy %')
plt.show()

### Accuracy

In [None]:
#The accuracy score obtained without using GridSearchCV

from sklearn.metrics import make_scorer, accuracy_score, fbeta_score

# Random forest with fixed hyperparameters (no grid search).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_normal, y_train)
pred = clf.predict(X_test_normal)

# The score was previously computed and discarded (only the last expression of
# a cell is displayed), so print it. Arguments follow the (y_true, y_pred) order.
print(accuracy_score(y_test, pred))
print(pred)

### Data metric evaluation

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score

# Baseline decision tree with a fixed random_state.
clf = DecisionTreeClassifier(random_state=42)

# Grid explored by the search: entropy criterion at three depth limits.
parameters = dict(criterion=['entropy'], max_depth=[10, 50, 100])

# Grid search over `parameters`, scored by accuracy, fitted on the training data.
grid_obj = GridSearchCV(clf, parameters, scoring='accuracy')
grid_fit = grid_obj.fit(X_train_normal, y_train)

# Best estimator found by the search.
best_clf = grid_fit.best_estimator_

# Test-set predictions from the default-parameter tree and from the tuned tree.
clf.fit(X_train_normal, y_train)
predictions = clf.predict(X_test_normal)
best_predictions = best_clf.predict(X_test_normal)

# Accuracy and F0.5 for both models on the held-out test set.
baseline_accuracy = accuracy_score(y_test, predictions)
baseline_fscore = fbeta_score(y_test, predictions, beta=0.5)
tuned_accuracy = accuracy_score(y_test, best_predictions)
tuned_fscore = fbeta_score(y_test, best_predictions, beta=0.5)

print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(baseline_accuracy))
print("F-score on testing data: {:.4f}".format(baseline_fscore))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(tuned_accuracy))
print("Final F-score on the testing data: {:.4f}".format(tuned_fscore))

###  Confusion Matrix and Classification report

In [None]:
# Print the tuned decision tree's predictions for the test set.
print(best_predictions)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which is what the labels below claim. With the
# arguments swapped the table was transposed relative to its labels.
# `pred` holds the random forest's test-set predictions.
matrix = confusion_matrix(y_test, pred)
matrix = pd.DataFrame(
    matrix,
    columns=['Predicted Negative', 'Predicted Postive'],
    index=['Actual Negative', 'Actual Positive'],
)
print(matrix)

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are exchanged in the printed table.
print(classification_report(y_test, best_predictions))

In [None]:
# Display the (rows, columns) dimensions of the DataFrame.
data.shape

In [None]:

# best_clf was trained on Normalizer output, so a raw sample has to go through
# the same transform before prediction; feeding it unscaled puts every feature
# far outside the range the tree was fitted on.
A = best_clf.predict(
    norm.transform([[102.5, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 290000]])
)
print(A)

In [None]:
# Show the first rows of the DataFrame for reference.
data.head()

In [None]:
# The model was fitted on normalised rows, so normalise the raw sample before
# asking for a prediction.
best_clf.predict(
    norm.transform([[102.5, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 290000]])
)

In [None]:
# The model was fitted on normalised rows, so normalise the raw sample before
# asking for a prediction.
best_clf.predict(
    norm.transform([[103.4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6000]])
)

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    2.65640088e-03, 2.56409351e-05, 2.56409351e-05, 0.0,
    5.12818702e-05, 0.0, 0.0, 0.0,
    2.56409351e-05, 0.0, 0.0, 0.0,
    0.0, 0.0, 2.56409351e-05, 0.0,
    9.99996469e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    6.73999847e-04, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 6.66666515e-06,
    6.66666515e-06, 0.0, 0.0, 0.0,
    0.0, 0.0, 6.66666515e-06, 0.0,
    9.99999773e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    1.04094354e-02, 0.0, 9.99945760e-05, 9.99945760e-05,
    9.99945760e-05, 0.0, 9.99945760e-05, 9.99945760e-05,
    1.99989152e-04, 0.0, 0.0, 9.99945760e-05,
    0.0, 9.99945760e-05, 9.99945760e-05, 0.0,
    9.99945760e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    2.97645740e-03, 2.94116344e-05, 2.94116344e-05, 2.94116344e-05,
    0.0, 0.0, 0.0, 0.0,
    2.94116344e-05, 0.0, 0.0, 0.0,
    0.0, 0.0, 2.94116344e-05, 0.0,
    9.99995568e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    4.50909045e-04, 4.54545408e-06, 0.0, 0.0,
    0.0, 0.0, 0.0, 4.54545408e-06,
    4.54545408e-06, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 0.0,
    9.99999898e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    4.56249952e-04, 4.46428525e-06, 4.46428525e-06, 4.46428525e-06,
    0.0, 0.0, 0.0, 0.0,
    4.46428525e-06, 0.0, 4.46428525e-06, 0.0,
    0.0, 0.0, 4.46428525e-06, 0.0,
    9.99999896e-01,
]])

In [None]:
# Predict for one hard-coded 17-value row (appears to be Normalizer output - confirm).
best_clf.predict([[
    9.92332036e-03, 0.0, 9.52334008e-05, 0.0,
    1.90466802e-04, 0.0, 1.90466802e-04, 0.0,
    0.0, 0.0, 9.52334008e-05, 9.52334008e-05,
    0.0, 0.0, 0.0, 9.52334008e-05,
    9.99950708e-01,
]])

In [None]:
# Run one raw 17-value sample through the Normalizer and show the result.
tval = norm.transform([
    [102.5, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 290000],
])
print(tval)

In [None]:
# Take the single normalised row as a plain list of floats. The previous
# round-trip through str()/split() depended on numpy's print formatting,
# truncated every value to print precision and produced strings, not numbers.
element = tval[0].tolist()
print(element)

In [None]:
# Display the Python type of `element`.
type(element)

In [None]:
# `element` may hold numeric strings (it is built by splitting printed text),
# so convert each entry to float explicitly instead of relying on implicit
# string-to-number coercion inside predict.
best_clf.predict([[float(value) for value in element]])

In [None]:
# Show the first rows of the DataFrame for reference.
data.head()

In [None]:
# Normalise one raw 17-value sample the same way as the training data.
tval = norm.transform([[103.5, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 6800]])
print(tval)

# Keep the row as floats. Parsing the printed array text truncated each value
# to numpy's print precision and handed strings to the classifier.
element = tval[0].tolist()
print(element)
best_clf.predict([element])