In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn
from sklearn.preprocessing import LabelEncoder

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Importing the data from CSV file

In [None]:
# Read the dataset from the Kaggle input directory and preview the first rows.
DATASET_PATH = '../input/symptoms-and-covid-presence/Covid Dataset.csv'
data = pd.read_csv(DATASET_PATH)
data.head()

Analyzing the features data

In [None]:
# Display the column labels of the loaded DataFrame.
data.columns

In [None]:
# Count the null (missing) entries in each column.
data.isnull().sum()

There are no missing values in any of the features.

In [None]:
# Display the per-column summary produced by DataFrame.describe().
data.describe()

In [None]:
# Print the distinct values found in each column, followed by a blank line.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    print(f'{column_name} : {distinct_values}\n')

The features 'Wearing Masks' and 'Sanitization from Market' each contain only a single value, so they cannot have any effect on the predictand. We therefore remove those columns.

In [None]:
# Remove the two columns that hold a single constant value.
constant_columns = ['Wearing Masks', 'Sanitization from Market']
data = data.drop(columns=constant_columns)

Encoding all the feature values with the help of LabelEncoder

In [None]:
# Convert every categorical column into integer codes with LabelEncoder
# (one integer per distinct value; this is label encoding, not dummy/one-hot encoding).
columns = data.columns

# Names of the columns whose dtype is object or category.
cat_col = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(cat_col)
encoder = LabelEncoder()

# The encoder is re-fitted on each column in turn.
for col in cat_col:
    # LabelEncoder expects a 1-D array, so pass the Series rather than a
    # one-column DataFrame (which triggers a DataConversionWarning).
    # Assigning the resulting array directly writes the codes positionally,
    # instead of aligning a freshly indexed DataFrame against data's index.
    data[col] = encoder.fit_transform(data[col])

Checking the values after encoding

In [None]:
# Preview the first rows to inspect the values after encoding.
data.head()

Plotting a heatmap of the correlations to identify and deal with weakly correlated features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# Annotated correlation heatmap; np.triu builds the mask from the upper triangle
# (diagonal included) of the matrix, so only the entries below the diagonal are drawn.
corr_matrix = data.corr()
upper_triangle_mask = np.triu(corr_matrix)

plt.figure(figsize=(18, 18))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle_mask)

The Running Nose, Headache, Heart Disease, Fatigue and Gastrointestinal features have very little correlation with the predictand (COVID-19), so we remove them from the DataFrame before developing the model.

In [None]:
# Drop the features judged weakly correlated with the target, then inspect the result.
# Note: the 'Fatigue ' and 'Gastrointestinal ' labels are written with a trailing space.
weak_features = ['Running Nose', 'Headache', 'Heart Disease', 'Fatigue ', 'Gastrointestinal ']
data = data.drop(columns=weak_features)
print(data.shape)
data.head()

Checking the distribution of the predictand (COVID-19) values to see whether the data is biased or not

In [None]:
# Bar chart of how many rows fall into each COVID-19 class.
sb.countplot(data=data, x='COVID-19')
plt.show()

From the above countplot, the values 0 and 1 are not equally distributed and data is biased towards 1.

Splitting the data for training and testing

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target column from the remaining feature columns, both as NumPy arrays.
target_column = 'COVID-19'
y = data[target_column].values
X = data.drop(columns=[target_column]).values

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=30,
)

Using the SMOTE algorithm to overcome the class imbalance in the COVID-19 column

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Class counts of the training labels before resampling.
counter = Counter(y_train)
print('Before',counter)

# Seed SMOTE so the synthetic samples, and every score computed from the
# resampled data, are identical on each run. The value mirrors the
# random_state already used for the train/test split.
smt = SMOTE(random_state=30)

# Resample the training split only; the test split is left untouched.
X_train_sm, y_train_sm = smt.fit_resample(X_train,y_train)

# Class counts of the training labels after resampling.
counter = Counter(y_train_sm)
print('After',counter)

Developing the model with the training data after applying SMOTE, and predicting the target values for the test data

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the resampled training data,
# then predict labels for the held-out test features.
gnb = GaussianNB()
gnb.fit(X_train_sm, y_train_sm)

y_pred = gnb.predict(X_test)

Importing the f1_score metric and computing the score for y_pred

In [None]:
from sklearn.metrics import f1_score
# F1 score of the test-set predictions; zero_division=1 sets the value used
# if a division by zero occurs while computing the score.
f1_score(y_true=y_test, y_pred=y_pred, zero_division=1)

Checking the column order, so that the corresponding values can be taken from the user in the same order to predict their condition

In [None]:
# Display the remaining column labels in their current order.
data.columns

Taking a single record from the test data to inspect its feature values and its true label

In [None]:
# Show the feature vector and the true label of one test record.
sample_index = 180
print(X_test[sample_index])
print(y_test[sample_index])

#Use this commented logic if you want to take inputs from the user and give them the result. You can integrate this logic into a GUI and present it like a chatbot

# One prompt per model feature. The answers are passed to the classifier
# positionally, so this order is assumed to match the feature column order used
# for training -- confirm against data.columns before changing either.
questions = [
{'ques':'Do you have any Breathing problem? Y/N '},
{'ques':'Do you have Fever? Y/N '},
{'ques':'Do you have Dry Cough? Y/N '},
{'ques':'Do you have Sore throat? Y/N '},
{'ques':'Do you have Asthma? Y/N '},
{'ques':'Do you have Chronic Lung Disease? Y/N '},
{'ques':'Do you have Diabetes? Y/N '},
{'ques':'Do you have Hyper Tension? Y/N '},
{'ques':'Have you travelled Abroad recently? Y/N '},
{'ques':'Do you have any Contact with COVID Patient? Y/N '},
{'ques':'Have you Attended Large Gathering recently? Y/N '},
{'ques':'Have you Visited Public Exposed Places recently? Y/N '},
{'ques':'Do you have any Family member working in Public Exposed Places? Y/N '}
 ]

# Collected answers: 1 for yes, 0 for no, in question order.
symp = []

for question in questions:
    # Keep asking the same question until the answer is a recognisable yes/no.
    while True:
        # Ignore case and surrounding whitespace in the typed answer.
        res = input(question['ques']).strip().lower()
        if res in ('y', 'yes'):
            symp.append(1)
            break
        if res in ('n', 'no'):
            symp.append(0)
            break
        print('Enter valid value: (Y/N or Yes/No)')

# Single-row 2-D array: one sample holding one value per question.
x = np.array([symp])

y_pred = gnb.predict(x)

In [None]:
# Hard-coded single sample with thirteen 0/1 values; only the second and fourth
# entries are set to 1.
sample_answers = [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
x = np.array([sample_answers])

# Predict the class for this one-row input.
y_pred = gnb.predict(x)

Predicting the patient's condition and giving them instructions according to the predicted value

In [None]:
# Choose the advice to print from the first (only) predicted label: 0 selects
# the all-clear message, any other value selects the get-tested message.
message = (
    'You do not have any symptoms of COVID-19. Stay safe! Stay home!'
    if y_pred[0] == 0
    else 'You may be affected with COVID-19 virus! Please get RTPCR test ASAP and stay in Quarantine for 14days!'
)
print(message)