In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("data.csv")

In [3]:
df

Unnamed: 0,fever,bodyPain,age,runnyNose,diffbreath,infectionProb
0,98.455335,1,10,0,0,1
1,102.425956,1,83,0,0,0
2,100.752712,1,92,1,0,0
3,101.708475,0,41,1,1,0
4,98.827673,1,33,1,1,1
5,101.435114,0,66,1,0,0
6,98.514040,1,30,0,0,1
7,102.859176,0,45,0,0,1
8,98.568122,1,43,1,-1,0
9,101.929031,0,36,0,1,0


In [4]:
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffbreath,infectionProb
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
mean,100.987447,0.50405,50.643064,0.493549,0.0009,0.49995
std,1.712371,0.500009,28.736591,0.499983,0.814022,0.500025
min,98.000288,0.0,1.0,0.0,-1.0,0.0
25%,99.498738,0.0,26.0,0.0,-1.0,0.0
50%,101.012088,1.0,51.0,0.0,0.0,0.0
75%,102.472299,1.0,76.0,1.0,1.0,1.0
max,103.99909,1.0,100.0,1.0,1.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 6 columns):
fever            9999 non-null float64
bodyPain         9999 non-null int64
age              9999 non-null int64
runnyNose        9999 non-null int64
diffbreath       9999 non-null int64
infectionProb    9999 non-null int64
dtypes: float64(1), int64(5)
memory usage: 468.8 KB


In [6]:
df['diffbreath'].value_counts()

 0    3374
 1    3317
-1    3308
Name: diffbreath, dtype: int64

In [7]:
def data_split(data, ratio, seed=42):
    """Randomly partition ``data`` into a train part and a test part.

    Row positions are shuffled with a private random generator seeded with
    ``seed``. The first ``int(len(data) * ratio)`` shuffled positions form the
    test part and the remaining positions form the train part. With the
    default seed the shuffle order is the same one that
    ``np.random.seed(42)`` followed by ``np.random.permutation`` yields, but
    NumPy's global random state is no longer modified by calling this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are picked by position via ``.iloc``.
    ratio : float
        Fraction of rows that go to the test part, from 0 to 1 inclusive.
    seed : int, optional
        Seed of the shuffle. Defaults to 42.

    Returns
    -------
    tuple
        ``(train, test)`` in that order.

    Raises
    ------
    ValueError
        If ``ratio`` lies outside ``[0, 1]``.
    """
    # A negative ratio would silently slice from the wrong end of the shuffled
    # positions and a ratio above 1 would leave the train part empty.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    # A dedicated RandomState keeps the split reproducible without reseeding
    # the global generator that other code may rely on.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Hold out 20% of the rows as the test set; the rest is used for training.
train, test = data_split(df, ratio=0.2)
    

In [8]:
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffbreath,infectionProb
8871,102.877607,1,92,0,1,0
9253,101.297000,0,15,1,-1,1
1561,100.218177,1,34,0,0,0
1670,99.719041,0,52,1,0,1
6086,101.004494,1,77,1,1,1
7332,98.935107,1,59,0,1,1
7461,100.868667,0,74,0,0,1
8828,102.425361,1,46,1,-1,0
7944,103.234584,1,65,0,-1,0
3508,103.570988,0,71,1,-1,1


In [9]:
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffbreath,infectionProb
5344,99.622645,1,59,1,-1,1
7444,98.699378,0,21,1,0,1
1731,101.817660,0,3,0,0,0
8719,102.125438,0,11,0,-1,1
4521,102.890043,1,10,1,1,1
7453,98.029316,0,3,1,0,0
576,102.664674,0,92,1,-1,1
7428,102.068266,1,71,0,-1,0
5577,102.589297,0,67,0,-1,0
439,100.632457,0,29,0,1,0


In [10]:
# Single source of truth for the model inputs and their column order, so the
# train and test matrices cannot drift apart.
feature_columns = ['fever', 'bodyPain', 'age', 'runnyNose', 'diffbreath']
X_train = train[feature_columns].to_numpy()
X_test = test[feature_columns].to_numpy()



In [11]:
# Select the label column as a Series: to_numpy() then already returns a 1-D
# array, so no reshape with hard-coded row counts (8000 / 1999) is needed and
# the cell keeps working if the dataset size or the split ratio changes.
Y_train = train['infectionProb'].to_numpy()
Y_test = test['infectionProb'].to_numpy()

In [12]:
X_train

array([[102.8776074 ,   1.        ,  92.        ,   0.        ,
          1.        ],
       [101.2970003 ,   0.        ,  15.        ,   1.        ,
         -1.        ],
       [100.2181768 ,   1.        ,  34.        ,   0.        ,
          0.        ],
       ...,
       [102.5981515 ,   0.        ,  49.        ,   1.        ,
          1.        ],
       [100.2879928 ,   1.        ,   2.        ,   0.        ,
          0.        ],
       [ 99.00359973,   0.        ,  67.        ,   1.        ,
          0.        ]])

In [13]:
X_test

array([[ 99.62264531,   1.        ,  59.        ,   1.        ,
         -1.        ],
       [ 98.6993782 ,   0.        ,  21.        ,   1.        ,
          0.        ],
       [101.8176603 ,   0.        ,   3.        ,   0.        ,
          0.        ],
       ...,
       [100.7853877 ,   1.        ,  63.        ,   1.        ,
         -1.        ],
       [ 98.10687683,   1.        ,  62.        ,   1.        ,
          1.        ],
       [102.1176115 ,   0.        ,  97.        ,   1.        ,
          1.        ]])

In [14]:
Y_train

array([0, 1, 0, ..., 1, 1, 0])

In [15]:
Y_test

array([1, 1, 0, ..., 1, 0, 0])

In [16]:
# Logistic regression classifier with the library's default hyper-parameters.
logreg = LogisticRegression()

In [17]:
# Fit on the training features/labels; fit() returns the estimator, so its
# repr is shown as the cell output.
logreg.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# One hand-written sample, given in the column order used for training:
# fever, bodyPain, age, runnyNose, diffbreath.
input_features = [104, 1, 22, 1, 0]
# predict_proba returns one row per sample; take row 0, column 1
# (presumably the probability of label 1 for this 0/1 target — confirm
# against logreg.classes_).
infProb = logreg.predict_proba([input_features])[0, 1]

In [21]:
infProb

0.5182730144520166