<h1> Example 1 </h1>

In [91]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [92]:
df = pd.read_csv('covid_toy.csv')
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [93]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [94]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['has_covid']), df['has_covid'], test_size=0.2, random_state=10)
X_train.head(20)

Unnamed: 0,age,gender,fever,cough,city
90,59,Female,99.0,Strong,Delhi
70,68,Female,101.0,Strong,Delhi
50,19,Male,101.0,Mild,Delhi
44,20,Male,102.0,Strong,Delhi
76,80,Male,100.0,Mild,Bangalore
55,81,Female,101.0,Mild,Mumbai
21,73,Male,98.0,Mild,Bangalore
61,81,Female,98.0,Strong,Mumbai
6,14,Male,101.0,Strong,Bangalore
63,10,Male,100.0,Mild,Bangalore


In [95]:
df.shape, X_train.shape, X_test.shape

((100, 6), (80, 5), (20, 5))

In [96]:
X_train.nunique()

age       49
gender     2
fever      7
cough      2
city       4
dtype: int64

In [97]:
df['cough'].unique()

array(['Mild', 'Strong'], dtype=object)

In [98]:
df['city'].unique()

array(['Kolkata', 'Delhi', 'Mumbai', 'Bangalore'], dtype=object)

In [99]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[
    ('fever-imputation', SimpleImputer(), ['fever']),
    ('cough-OrdinalTransformation', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('gender-city-OHE', OneHotEncoder(drop='first'), ['gender', 'city'])    
], remainder='passthrough')

X_transformed = ct.fit_transform(X_train)
X_transformed_df = pd.DataFrame(X_transformed)
X_transformed_df.head(20)

Unnamed: 0,0,1,2,3,4,5,6
0,99.0,1.0,0.0,1.0,0.0,0.0,59.0
1,101.0,1.0,0.0,1.0,0.0,0.0,68.0
2,101.0,0.0,1.0,1.0,0.0,0.0,19.0
3,102.0,1.0,1.0,1.0,0.0,0.0,20.0
4,100.0,0.0,1.0,0.0,0.0,0.0,80.0
5,101.0,0.0,0.0,0.0,0.0,1.0,81.0
6,98.0,0.0,1.0,0.0,0.0,0.0,73.0
7,98.0,1.0,0.0,0.0,0.0,1.0,81.0
8,101.0,1.0,1.0,0.0,0.0,0.0,14.0
9,100.0,0.0,1.0,0.0,0.0,0.0,10.0


<h1> Example 2 </h1>

In [100]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [101]:
X = df.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket'])
y = df['Survived']

In [102]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0,,S
2,2,male,62.0,0,0,9.6875,,Q
3,3,male,27.0,0,0,8.6625,,S
4,3,female,22.0,1,1,12.2875,,S


In [103]:
X.nunique()


Pclass        3
Sex           2
Age          79
SibSp         7
Parch         8
Fare        169
Cabin        76
Embarked      3
dtype: int64

In [104]:
X.isnull().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [106]:
numerical_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[ 
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


categorical_features = ["Embarked", "Sex", "Pclass"]
categoric_transformer = OneHotEncoder(handle_unknown='ignore')

ct = ColumnTransformer(transformers=[ 
    ("num", numeric_transformer, numerical_features),
    ("cat", categoric_transformer, categorical_features)
])

clf = Pipeline(steps=[ 
    ("Preprocessimg data", ct),
    ("classifier", LogisticRegression())
]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [108]:
from sklearn import set_config
set_config(display='diagram')
clf