In [36]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv('C:\\Users\\User\\Downloads\\Data\\covid_toy.csv')

In [None]:
data.head()

In [None]:
data.isnull().sum()

# Now we will create the train and test sets by train_test_split

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, :5], data.has_covid, test_size=0.3, random_state=20)

# We found four different categorical column here. We will Encode them by,

# > 'gender' and 'city' : this two colums are Nominal, so we will use OneHotEncoder.
# > 'cough' : this column is Ordinal, so we will use OrdinalEncoder.
# > 'fever' : this column has 10 NaN values, so we will use SimpleImputer to fill them by mean value.
# > 'has_covid' : this colum is the output column, so we will use LabelEncoder

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [23]:
transformer = ColumnTransformer([
    ('encoder1', SimpleImputer(), ['fever']),
    ('encoder2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('encoder3', OneHotEncoder(drop='first', sparse_output=True), ['gender', 'city'])
], remainder='passthrough')

In [24]:
x_train_new = transformer.fit_transform(x_train)
x_test_new = transformer.transform(x_test)

# The fit and fit_transform method in the LabelEncoder only accepts one argument: fit(y) and fit_transform(y). Therefore, LabelEncoder couldn't be used inside a Pipeline or a ColumnTransform.Dec 4, 2018

# Thats why we are going to use LabelEncoder in the output/y or 'has_covid' column seperately

In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
le = LabelEncoder()

In [27]:
y_train_new = le.fit_transform(y_train)
y_test_new = le.transform(y_test)

# Now we will create a Linear Regression model and fit the model with our data

In [28]:
from sklearn.tree import DecisionTreeClassifier

In [29]:
my_model = DecisionTreeClassifier()

In [30]:
my_model.fit(x_train_new, y_train_new)

# Now we have to predict the output by our created model

In [31]:
y_pred = my_model.predict(x_test_new)

In [32]:
y_pred

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1])

# Now it's time to findout the accuracy of our prediction

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(y_pred,y_test_new)

0.4

In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
cross_val_score(LogisticRegression(), x_train_new, y_train_new, cv=10, scoring='accuracy').mean()

0.2857142857142857