In [1]:
import pandas as pd

## Reading Data

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,Fever,Bodypain,Age,RunnyNose,DiffBreathe,InfectionProb
0,99.75394,1,62,0,1,0
1,100.928936,0,90,1,0,0
2,100.150432,1,24,0,0,0
3,100.456116,1,53,1,-1,0
4,100.300234,0,10,0,-1,1


In [4]:
df.tail()

Unnamed: 0,Fever,Bodypain,Age,RunnyNose,DiffBreathe,InfectionProb
3494,99.883119,1,100,1,0,0
3495,98.085662,0,14,1,-1,0
3496,99.217976,0,51,0,1,1
3497,101.692228,1,16,1,0,0
3498,100.175195,1,40,1,1,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3499 entries, 0 to 3498
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Fever          3499 non-null   float64
 1   Bodypain       3499 non-null   int64  
 2   Age            3499 non-null   int64  
 3   RunnyNose      3499 non-null   int64  
 4   DiffBreathe    3499 non-null   int64  
 5   InfectionProb  3499 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 164.1 KB


In [6]:
df['Fever'].value_counts()

99.751317     1
101.332169    1
101.973708    1
100.032321    1
99.090564     1
             ..
98.761579     1
98.012368     1
101.817069    1
101.927834    1
99.915322     1
Name: Fever, Length: 3499, dtype: int64

In [7]:
df['DiffBreathe'].value_counts()

-1    1186
 1    1177
 0    1136
Name: DiffBreathe, dtype: int64

In [8]:
df.describe()

Unnamed: 0,Fever,Bodypain,Age,RunnyNose,DiffBreathe,InfectionProb
count,3499.0,3499.0,3499.0,3499.0,3499.0,3499.0
mean,100.022759,0.489854,49.847099,0.501286,-0.002572,0.494427
std,1.142203,0.499969,28.910991,0.50007,0.821902,0.50004
min,98.001875,0.0,1.0,0.0,-1.0,0.0
25%,99.025265,0.0,25.0,0.0,-1.0,0.0
50%,100.057115,0.0,49.0,1.0,0.0,0.0
75%,100.978437,1.0,75.0,1.0,1.0,1.0
max,101.999784,1.0,100.0,1.0,1.0,1.0


## Train Test Splitting

In [9]:
import numpy as np

In [10]:
def data_split(data, ratio, seed=42):
    """Shuffle ``data`` reproducibly and split it into train and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; anything supporting ``len()`` and positional ``.iloc``
        indexing works.
    ratio : float
        Fraction of rows, between 0 and 1 inclusive, placed in the test set.
        The test size is ``int(len(data) * ratio)``, i.e. rounded down.
    seed : int, optional
        Seed for the shuffle. The default (42) yields the same permutation
        as calling ``np.random.seed(42)`` followed by
        ``np.random.permutation(len(data))``.

    Returns
    -------
    tuple
        ``(train, test)`` -- two disjoint row selections of ``data`` that
        together contain every row exactly once.

    Raises
    ------
    ValueError
        If ``ratio`` is outside the interval [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    # Use a private generator instead of np.random.seed(): same permutation
    # for the same seed, but the global NumPy random state is left untouched,
    # so other random draws in the session are not silently reset.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [11]:
np.random.permutation(7)

array([0, 2, 5, 6, 1, 3, 4])

In [12]:
# Hold out 20% of the rows (rounded down) as the test set; the rest is training data.
train, test = data_split(df, ratio=0.2)

In [13]:
train

Unnamed: 0,Fever,Bodypain,Age,RunnyNose,DiffBreathe,InfectionProb
2213,98.623529,0,18,1,-1,1
162,100.702208,0,96,1,1,0
1001,101.429308,1,39,1,-1,1
1718,100.367729,0,71,1,0,1
1003,100.402332,0,83,0,1,0
...,...,...,...,...,...,...
1095,98.236457,0,83,0,1,1
1130,98.245584,0,39,0,-1,0
1294,101.838546,0,15,1,0,0
860,101.338719,0,45,0,-1,1


In [14]:
test

Unnamed: 0,Fever,Bodypain,Age,RunnyNose,DiffBreathe,InfectionProb
1650,98.793448,1,77,1,1,0
2456,99.944611,1,29,1,-1,0
2232,99.963846,1,10,0,0,0
1945,99.518249,0,42,1,1,1
309,101.295227,1,43,1,0,0
...,...,...,...,...,...,...
2250,100.226906,0,34,0,1,1
3127,98.059656,1,28,0,1,0
744,100.844843,0,30,1,0,1
631,98.057775,1,26,0,1,1


In [15]:
# Model inputs, in the column order the classifier will be trained on.
feature_columns = ['Fever', 'Bodypain', 'Age', 'RunnyNose', 'DiffBreathe']
x_train = train[feature_columns].to_numpy()
x_test = test[feature_columns].to_numpy()

In [16]:
x_train

array([[ 98.6235291 ,   0.        ,  18.        ,   1.        ,
         -1.        ],
       [100.702208  ,   0.        ,  96.        ,   1.        ,
          1.        ],
       [101.4293078 ,   1.        ,  39.        ,   1.        ,
         -1.        ],
       ...,
       [101.8385464 ,   0.        ,  15.        ,   1.        ,
          0.        ],
       [101.3387191 ,   0.        ,  45.        ,   0.        ,
         -1.        ],
       [ 98.48722764,   1.        ,  39.        ,   1.        ,
          0.        ]])

In [17]:
x_test

array([[ 98.7934479 ,   1.        ,  77.        ,   1.        ,
          1.        ],
       [ 99.94461134,   1.        ,  29.        ,   1.        ,
         -1.        ],
       [ 99.96384614,   1.        ,  10.        ,   0.        ,
          0.        ],
       ...,
       [100.8448431 ,   0.        ,  30.        ,   1.        ,
          0.        ],
       [ 98.05777486,   1.        ,  26.        ,   0.        ,
          1.        ],
       [101.6755768 ,   0.        ,  10.        ,   0.        ,
          1.        ]])

In [18]:
# Select the label column as a Series so to_numpy() already returns a 1-D
# array; this avoids reshaping to hard-coded row counts, which would raise
# as soon as the dataset size or the split ratio changes.
y_train = train['InfectionProb'].to_numpy()
y_test = test['InfectionProb'].to_numpy()

In [19]:
y_train

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [20]:
y_test

array([0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# on the training features and labels.
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression()

In [23]:
# One hand-written sample; values must follow the training column order:
# Fever, Bodypain, Age, RunnyNose, DiffBreathe.
# NOTE(review): the 4th value (RunnyNose) is -1, but the summary statistics
# for that column show only values between 0 and 1 -- confirm this is intended.
inputFeatures = [100, 1, 22, -1, 1]

# predict_proba returns one row per sample with one column per class;
# row 0 / column 1 is the probability of the second class (label 1 here).
sample_probabilities = clf.predict_proba([inputFeatures])
infProb = sample_probabilities[0, 1]

In [24]:
infProb

0.4258531643384824