# Importing packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

# Reading and analyzing the dataset

In [3]:
# Load the dataset from a CSV file; the path is relative to the current
# working directory of the notebook kernel.
df = pd.read_csv('covid-dataset.csv')


In [4]:
# Preview the first five rows; at this point the answers are still the raw
# 'Yes'/'No' strings.
df.head()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
0,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,...,Yes,Yes,No,Yes,No,Yes,Yes,No,No,Yes
1,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No,No,...,Yes,No,No,No,Yes,Yes,No,No,No,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,...,Yes,Yes,Yes,No,No,No,No,No,No,Yes
3,Yes,Yes,Yes,No,No,Yes,No,No,Yes,Yes,...,No,No,Yes,No,Yes,Yes,No,No,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,...,No,Yes,No,Yes,No,Yes,No,No,No,Yes


In [6]:
# Encode the 'Yes'/'No' answers as 1/0 so that every column becomes numeric.
df = df.replace(to_replace={'No': 0, 'Yes': 1})

In [39]:
# Summary statistics of the encoded frame. In the stored output below,
# 'Wearing Masks' and 'Sanitization from Market' have mean 0 and std 0,
# i.e. they are constant in this dataset.
df.describe()

Unnamed: 0,Breathing Problem,Fever,Dry Cough,Sore throat,Running Nose,Asthma,Chronic Lung Disease,Headache,Heart Disease,Diabetes,...,Fatigue,Gastrointestinal,Abroad travel,Contact with COVID Patient,Attended Large Gathering,Visited Public Exposed Places,Family working in Public Exposed Places,Wearing Masks,Sanitization from Market,COVID-19
count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,...,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0
mean,0.666176,0.786345,0.792602,0.727457,0.543246,0.462643,0.472028,0.503497,0.464299,0.476261,...,0.519139,0.469452,0.451049,0.501656,0.461907,0.518955,0.416268,0.0,0.0,0.806588
std,0.471621,0.409924,0.40548,0.445309,0.498172,0.498648,0.499263,0.500034,0.49877,0.499482,...,0.49968,0.499112,0.497644,0.500043,0.498593,0.499687,0.492984,0.0,0.0,0.395009
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [40]:
# Check column dtypes and non-null counts after the encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 21 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   Breathing Problem                        5434 non-null   int64
 1   Fever                                    5434 non-null   int64
 2   Dry Cough                                5434 non-null   int64
 3   Sore throat                              5434 non-null   int64
 4   Running Nose                             5434 non-null   int64
 5   Asthma                                   5434 non-null   int64
 6   Chronic Lung Disease                     5434 non-null   int64
 7   Headache                                 5434 non-null   int64
 8   Heart Disease                            5434 non-null   int64
 9   Diabetes                                 5434 non-null   int64
 10  Hyper Tension                            5434 non-null   int64
 11  Fati

In [41]:
# Count how many rows have 'Breathing Problem' encoded as 1 versus 0.
df['Breathing Problem'].value_counts()

1    3620
0    1814
Name: Breathing Problem, dtype: int64

# Splitting into training and test set

In [8]:
def data_split(data, ratio, seed=35):
    """Randomly partition ``data`` into a training and a test subset.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected positionally via ``.iloc``.
    ratio : float
        Fraction of rows (between 0 and 1 inclusive) assigned to the test set.
        The test size is ``int(len(data) * ratio)``, i.e. rounded down.
    seed : int, optional
        Seed for the shuffle. The default (35) reproduces the split this
        notebook has always used.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Disjoint subsets that together contain every row of ``data``.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    # A private RandomState yields the same permutation as
    # np.random.seed(seed) + np.random.permutation(...) did, but without
    # reseeding NumPy's global generator for the rest of the notebook.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))

    test_set_size = int(len(data) * ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [9]:
# Hold out 30% of the rows as the test set; the remaining rows are used for
# training.
train_data, test_data = data_split(df, 0.3)

In [10]:
# Name of the target column, kept as a list so it can be used directly for
# column selection.
output_var = ['COVID-19']

In [20]:
# Features are every column except the target. Selecting by name instead of a
# hard-coded positional slice keeps the target out of the feature matrix even
# if columns are later added or reordered.
x_train = train_data.drop(columns=output_var).to_numpy()
y_train = train_data[output_var].to_numpy().ravel()
x_test = test_data.drop(columns=output_var).to_numpy()
y_test = test_data[output_var].to_numpy().ravel()

In [21]:
# Sanity-check the dimensions of the training feature matrix (rows, features).
x_train.shape

(3804, 20)

In [22]:
# Sanity-check that the training labels are a flat 1-D vector.
y_train.shape

(3804,)

# Model building

In [23]:
# Fit a logistic-regression classifier on the training split.
# NOTE(review): despite the variable name, this is a classifier, not a regressor.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases and
# the target here has only two values (0/1) — confirm against the installed
# version before upgrading; changing it may alter the fitted coefficients.
regressor = LogisticRegression(solver='lbfgs', multi_class='multinomial')
regressor.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Prediction and Accuracy checking

In [34]:
# Predict the class for one hand-written sample. The 20 values must follow the
# same column order as the training feature matrix.
feature = [[1,0,1,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0]]
regressor.predict(feature)

array([0], dtype=int64)

In [25]:
# Score the model on the held-out test set. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth goes first.
prediction = regressor.predict(x_test)
metrics.accuracy_score(y_test, prediction)

0.9619631901840491

In [35]:
# Probability in column index 1 of predict_proba for the single sample —
# presumably the positive label (1); check regressor.classes_ to confirm.
inf_prob = regressor.predict_proba(feature)[0][1]

In [36]:
# Display the probability stored in inf_prob.
inf_prob

0.08235985483876591

In [82]:
# Column index 1 of predict_proba for every row of the test set.
regressor.predict_proba(x_test)[:, 1]

array([0.99909055, 0.99999999, 0.99999122, ..., 0.99993317, 0.99999952,
       0.99044689])

## Saving and reloading the model

In [83]:


# Persist the fitted model to 'model.pkl'. The context manager guarantees the
# file handle is closed even if pickling raises.
with open('model.pkl', 'wb') as file:
    pickle.dump(regressor, file)

In [84]:
# Reload the pickled model from 'model.pkl'. The context manager guarantees
# the file handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('model.pkl', 'rb') as file:
    regressor = pickle.load(file)

In [85]:
# Check that the reloaded model still produces a prediction for `feature`.
# NOTE(review): the stored output below (~0.99996) differs from the earlier
# stored `inf_prob` output (~0.0824) for the same variable name, and the
# execution counts are non-sequential — `feature` was probably changed in a
# cell that is no longer present. Re-run from a fresh kernel to confirm.
regressor.predict_proba(feature)[0][1]

0.9999575409927919