In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv('Madedata1.csv')

In [3]:
df.shape

(2499, 12)

In [4]:
df.columns

Index(['Country', 'Age', 'Gender', 'fever', 'Bodypain', 'Runny_nose',
       'Difficulty_in_breathing', 'Nasal_congestion', 'Sore_throat',
       'Severity', 'Contact_with_covid_patient', 'Infected'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,Country,Age,Gender,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Severity,Contact_with_covid_patient,Infected
0,China,10,Male,102,1,0,0,0,1,Mild,No,0
1,Italy,20,Male,103,1,1,0,0,0,Moderate,Not known,1
2,Iran,55,Transgender,99,0,0,0,1,1,Severe,No,0
3,Republic of Korean,37,Female,100,0,1,1,0,0,Mild,Yes,1
4,France,45,Male,101,1,1,1,1,0,Moderate,Yes,1


In [6]:
pd.unique(df['Country'])

array(['China', 'Italy', 'Iran', 'Republic of Korean', 'France', 'Spain',
       'Germany', 'UAE', 'Russia', 'Canada', 'Netherlands', 'Mexico',
       'Pakistan', 'Chile', 'Sweden', 'Peru', 'Ireland', 'Singapore',
       'Turkey', 'India', 'Australia', 'Malaysia', 'Argentina', 'Kuwait',
       'Morocco', 'Afghanistan', 'Finland', 'Norway', 'Bangladesh',
       'South Korea', 'Indonesia', 'Denmark', 'Romania', 'Ukraine',
       'Serbia', 'USA', 'Egypt', 'Thailand', 'Iraq', 'New Zealand',
       'Japan', 'South Africa', 'Slovakia', 'Somalia', 'Sri Lanka',
       'Oman', 'Belgium', 'Kazakhstan', 'Bahrain', 'Nigeria', 'Hungary',
       'Philippines', 'Armenia', 'Bulgaria', 'Cuba', 'Sudan', 'Estoria',
       'CostaRica', 'Gabon', 'Maldives', 'Albania', 'Cyprus', 'Kenya',
       'Georgia', 'Jordan', 'SanMario', 'Malta', 'Taiwan', 'Mauritius',
       'Congo', 'Vietnam', 'Zambia', 'Haiti', 'Liberia', 'Nepal', 'Togo',
       'Myanmar', 'Cambodia', 'Uganda', 'Bermuda', 'Monaco', 'Barbados',
    

In [7]:
pd.value_counts(df['Country'])

France         115
Bulgaria        92
China           83
Singapore       71
Vietnam         69
              ... 
Austria          2
Switzerland      2
Czechia          2
Israel           2
Monaco           2
Name: Country, Length: 97, dtype: int64

In [8]:
pd.unique(df['Gender'])

array(['Male', 'Transgender', 'Female'], dtype=object)

In [9]:
pd.unique(df['Severity'])

array(['Mild', 'Moderate', 'Severe'], dtype=object)

In [10]:
pd.unique(df['Contact_with_covid_patient'])

array(['No', 'Not known', 'Yes', 'yes'], dtype=object)

In [11]:
df['Contact_with_covid_patient'] = df['Contact_with_covid_patient'].str.lower()

In [12]:
gender_lab = LabelEncoder()
gender = gender_lab.fit_transform(df['Gender'])

In [13]:
severity_lab = LabelEncoder()
severity = severity_lab.fit_transform(df['Severity'])

In [14]:
contact_lab = LabelEncoder()
contact = contact_lab.fit_transform(df['Contact_with_covid_patient'])

In [15]:
gender_lab

LabelEncoder()

In [16]:
gender_onehot = OneHotEncoder(sparse=False)
gender = gender_onehot.fit_transform(gender.reshape(-1,1))

In [17]:
severity_onehot = OneHotEncoder(sparse=False)
severity = severity_onehot.fit_transform(severity.reshape(-1,1))

In [18]:
contact_onehot = OneHotEncoder(sparse=False)
contact = contact_onehot.fit_transform(contact.reshape(-1,1))

In [19]:
df_updated = df.drop('Gender',axis=1)

In [20]:
df_updated = df_updated.drop('Severity',axis=1)
df_updated = df_updated.drop('Contact_with_covid_patient',axis=1)

In [21]:
df_updated.head()

Unnamed: 0,Country,Age,fever,Bodypain,Runny_nose,Difficulty_in_breathing,Nasal_congestion,Sore_throat,Infected
0,China,10,102,1,0,0,0,1,0
1,Italy,20,103,1,1,0,0,0,1
2,Iran,55,99,0,0,0,1,1,0
3,Republic of Korean,37,100,0,1,1,0,0,1
4,France,45,101,1,1,1,1,0,1


In [22]:
X = df_updated.iloc[:,1:-1].values
y = df['Infected'].values

In [23]:
X = np.c_[X,gender,severity,contact]

In [24]:
X.shape

(2499, 16)

In [25]:
X[0]

array([ 10., 102.,   1.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,   1.,
         0.,   0.,   1.,   0.,   0.])

In [26]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [27]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [28]:
reg = LogisticRegression()

In [29]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
y_pred = reg.predict(x_test)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
accuracy_score(y_test, y_pred)

0.9472

In [33]:
# 'Country', 'Age', 'Gender', 'fever', 'Bodypain', 'Runny_nose',
#        'Difficulty_in_breathing', 'Nasal_congestion', 'Sore_throat',
#        'Severity', 'Contact_with_covid_patient', 'Infected'

In [34]:
u_age = 23
u_gender = 'Male'
u_fever = 98
u_bodypain = 0
u_runny_nose = 0
u_breath = 0
u_nasal = 0
u_throat = 0
u_severity = 'Moderate'
u_contact = 'no'

In [35]:
gender_lab.transform([u_gender])

array([1], dtype=int64)

In [36]:
gen = gender_onehot.transform([gender_lab.transform([u_gender])])

In [37]:
sev = severity_onehot.transform([severity_lab.transform([u_severity])])

In [38]:
cont = contact_onehot.transform([contact_lab.transform([u_contact])])

In [39]:
test_data = np.array([[u_age,u_fever,u_bodypain,u_runny_nose,u_breath,u_nasal,u_throat]])

In [40]:
test_data

array([[23, 98,  0,  0,  0,  0,  0]])

In [41]:
test_x = np.c_[test_data,gen,sev,cont]

In [42]:
test_x.shape

(1, 16)

In [43]:
test_x = scaler.transform(test_x)

In [44]:
reg.predict_proba(test_x)

array([[9.99450613e-01, 5.49387022e-04]])

In [45]:
reg.predict(test_x)

array([0], dtype=int64)

In [46]:
from sklearn.metrics import classification_report

In [48]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       322
           1       0.94      0.96      0.95       303

    accuracy                           0.95       625
   macro avg       0.95      0.95      0.95       625
weighted avg       0.95      0.95      0.95       625

