In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression

  import pandas.util.testing as tm


In [2]:
# Record the scikit-learn release this notebook was executed with.
sklearn.__version__

'0.23.1'

In [3]:
# Load the 'diamonds' example dataset through seaborn and summarize its
# columns, non-null counts and dtypes.
df_diamonds = sns.load_dataset('diamonds')
df_diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [0]:
# Regression setup: use the `carat` and `depth` columns as features and
# `price` as the target.
X = df_diamonds.loc[:, ['carat', 'depth']]
y = df_diamonds.loc[:, 'price']

In [0]:
# Split the diamonds data into train and test partitions; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=132)

In [0]:
# Ordinary linear regression with default settings.
lr = LinearRegression()

In [7]:
# Fit on the training partition only; the fitted estimator is echoed as the
# cell output.
lr.fit(X_train, y_train)

LinearRegression()

In [0]:
# Predicted prices for the held-out diamonds.
y_pred = lr.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix

In [12]:
# confusion_matrix only accepts discrete class labels, while `lr` is a
# regressor that predicts continuous prices, so this call is expected to fail.
# Catch the error so that "Restart & Run All" can continue past this cell,
# and pass the arguments in scikit-learn's documented (y_true, y_pred) order.
try:
    confusion_matrix(y_test, y_pred)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

In [13]:
# Inspect the predictions: they are continuous floats, not class labels.
y_pred

array([ 5844.22846379,  1571.06392779,   294.35740727, ...,
        5670.82581921,  6977.06867137, 14522.03484142])

In [14]:
# Inspect the true targets: integer prices, one per held-out row.
y_test

13303     5495
47247     1842
26720      645
51263     2357
18296     7437
         ...  
50132     2211
4215      3568
8579      4436
9225      4558
26275    15760
Name: price, Length: 13485, dtype: int64

## Classification example: Titanic

In [20]:
# Load the 'titanic' example dataset through seaborn and preview two rows.
df_titanic = sns.load_dataset('titanic')
df_titanic.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [0]:
# Classification setup: predict `survived` from sex, fare and passenger class.
X = df_titanic.loc[:, ['sex', 'fare', 'class']]
y = df_titanic['survived']
# Encode sex as 0 (male) / 1 (female). Mapping only the `sex` column, rather
# than running a frame-wide `replace`, guarantees that no other column can be
# affected by the substitution.
X = X.assign(sex=X['sex'].map({'male': 0, 'female': 1}))


In [22]:
# Check the encoded feature frame: `sex` is now numeric, `class` is still a label.
X.head()

Unnamed: 0,sex,fare,class
0,0,7.25,Third
1,1,71.2833,First
2,1,7.925,Third
3,1,53.1,First
4,0,8.05,Third


In [23]:
# Hold out part of the passengers for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)
# Preview the first two training rows (note the shuffled row labels).
X_train.head(2)

Unnamed: 0,sex,fare,class
199,1,13.0,Second
129,0,6.975,Third


In [26]:
# Confirm dtypes and that none of the three training columns has missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 199 to 168
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   sex     668 non-null    int64   
 1   fare    668 non-null    float64 
 2   class   668 non-null    category
dtypes: category(1), float64(1), int64(1)
memory usage: 16.4 KB


In [31]:
# Distribution of the categorical `class` feature in the training partition.
X_train['class'].value_counts()

Third     365
First     162
Second    141
Name: class, dtype: int64

In [0]:
# Drop the shuffled row labels left over from the split so both feature frames
# are indexed 0..n-1; the later column-wise concat with the one-hot frames
# (which carry a default index) relies on the indexes lining up.
X_train, X_test = (
    X_train.reset_index(drop=True),
    X_test.reset_index(drop=True),
)

In [33]:
# Re-index the targets 0..n-1 as well, so they stay aligned with the
# re-indexed feature frames.
y_train, y_test = (
    y_train.reset_index(drop=True),
    y_test.reset_index(drop=True),
)
# The last labels should now end at position n-1.
y_train.tail()

663    0
664    0
665    0
666    0
667    0
Name: survived, dtype: int64

In [0]:
# Dense one-hot encoder for the passenger class; categories not seen during
# fitting are ignored rather than raising an error.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
# The encoder takes 2-D input, so the single column is lifted to shape (n, 1).
# Categories are learned from the training partition only and then re-used
# unchanged for the test partition.
X_train_class = ohe.fit_transform(X_train['class'].to_numpy()[:, np.newaxis])
X_test_class = ohe.transform(X_test['class'].to_numpy()[:, np.newaxis])

In [35]:
# One row per training passenger, one column per class category.
X_train_class.shape

(668, 3)

In [37]:
# Wrap the encoded training array in a DataFrame, labelling the columns with
# the encoder's generated feature names.
X_train_class_df = pd.DataFrame(data=X_train_class, columns=ohe.get_feature_names())
X_train_class_df.head()

Unnamed: 0,x0_First,x0_Second,x0_Third
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [0]:
# Final training matrix: the numeric columns side by side with the one-hot
# class columns (rows align because both frames are indexed 0..n-1).
X_train_transformed = pd.concat(
    [X_train[['sex', 'fare']], X_train_class_df],
    axis='columns',
)

In [39]:
# Display the combined training features (pandas truncates the middle rows).
X_train_transformed

Unnamed: 0,sex,fare,x0_First,x0_Second,x0_Third
0,1,13.0000,0.0,1.0,0.0
1,0,6.9750,0.0,0.0,1.0
2,0,8.0500,0.0,0.0,1.0
3,1,83.4750,1.0,0.0,0.0
4,0,7.7500,0.0,0.0,1.0
...,...,...,...,...,...
663,1,8.6625,0.0,0.0,1.0
664,0,8.7125,0.0,0.0,1.0
665,0,49.5042,1.0,0.0,0.0
666,0,221.7792,1.0,0.0,0.0


In [40]:
# Same wrapping for the test partition, re-using the feature names of the
# encoder that was fitted on the training data.
X_test_class_df = pd.DataFrame(data=X_test_class, columns=ohe.get_feature_names())

X_test_class_df.head()

Unnamed: 0,x0_First,x0_Second,x0_Third
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [0]:
# Final test matrix, assembled exactly like the training matrix so both have
# the same columns in the same order.
X_test_transformed = pd.concat(
    [X_test[['sex', 'fare']], X_test_class_df],
    axis='columns',
)

In [42]:
# Sanity-check that the test features have the same layout as the training features.
X_test_transformed.head()

Unnamed: 0,sex,fare,x0_First,x0_Second,x0_Third
0,0,26.2875,1.0,0.0,0.0
1,0,8.05,0.0,0.0,1.0
2,1,65.0,0.0,1.0,0.0
3,0,56.4958,0.0,0.0,1.0
4,1,7.925,0.0,0.0,1.0


In [43]:
# Logistic-regression classifier with default hyperparameters, fitted on the
# manually encoded training features.
logreg = LogisticRegression()
logreg.fit(X_train_transformed, y_train)

LogisticRegression()

In [44]:
# Inspect the hyperparameters the classifier is using.
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [45]:
# Mean accuracy on the manually encoded held-out features.
logreg.score(X_test_transformed, y_test)

0.7713004484304933

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [0]:
# Column-wise preprocessing for the pipeline: standardize `sex`, one-hot
# encode `class`, and pass every remaining column (here `fare`) through as is.
# NOTE(review): `sex` is already a 0/1 indicator while the continuous `fare`
# is left unscaled — confirm the scaler targets the intended column.
# NOTE(review): this encoder keeps the default handle_unknown, unlike the
# manually fitted `ohe` (handle_unknown='ignore') — confirm this is intended.
ct = make_column_transformer(
    (StandardScaler(), ['sex']),
    (OneHotEncoder(), ['class']),
    remainder = 'passthrough'
)

In [48]:
# No call parentheses here: this displays the bound method object itself,
# not a list of feature names.
ct.get_feature_names

<bound method ColumnTransformer.get_feature_names of ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(), ['sex']),
                                ('onehotencoder', OneHotEncoder(), ['class'])])>

In [49]:
# Chain preprocessing and classifier into one estimator, then fit it directly
# on the raw (un-encoded) training frame.
pipe = make_pipeline(ct, LogisticRegression())
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(), ['sex']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['class'])])),
                ('logisticregression', LogisticRegression())])

In [50]:
# Number of input columns the pipeline was fitted on (sex, fare, class).
pipe.n_features_in_

3

In [51]:
# Show the pipeline steps keyed by their auto-generated names.
pipe.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'logisticregression': LogisticRegression()}

In [52]:
# Look the fitted encoder up by its transformer name rather than by position
# in `transformers_`, so the lookup keeps working if the transformer order in
# the ColumnTransformer ever changes.
pipe.named_steps['columntransformer'].named_transformers_['onehotencoder'].get_feature_names()

array(['x0_First', 'x0_Second', 'x0_Third'], dtype=object)

# The new interactive DAG

In [0]:
# Switch estimator display to the 'diagram' mode for the next cell.
sklearn.set_config(display='diagram')

In [54]:
# Rebuild and refit the same pipeline; the estimator returned by `fit` is the
# cell output and is displayed using the 'diagram' mode set above.
pipe = make_pipeline(ct, LogisticRegression())
pipe.fit(X_train, y_train)

In [0]:
# Restore the plain-text estimator display for the remaining cells.
sklearn.set_config(display='text')


In [56]:
# Steps of the refitted pipeline, keyed by their auto-generated names.
pipe.named_steps


{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'logisticregression': LogisticRegression()}

In [57]:
# List every parameter of the pipeline, including nested ones addressed with
# the `step__param` double-underscore naming.
pipe.get_params()


{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'columntransformer__n_jobs': None,
 'columntransformer__onehotencoder': OneHotEncoder(),
 'columntransformer__onehotencoder__categories': 'auto',
 'columntransformer__onehotencoder__drop': None,
 'columntransformer__onehotencoder__dtype': numpy.float64,
 'columntransformer__onehotencoder__handle_unknown': 'error',
 'columntransformer__onehotencoder__sparse': True,
 'columntransformer__remainder': 'passthrough',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__standardscaler': StandardScaler(),
 'columntransformer__standardscaler__copy': True,
 'columntransformer__standardscaler__with_mean': True,
 'columntransformer__standardscaler__with_std': True,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standards

In [58]:
# Mean accuracy of the whole pipeline on the raw held-out frame; the pipeline
# applies its own preprocessing before predicting.
pipe.score(X_test, y_test)


0.7713004484304933

In [59]:
# Retrieve the classifier step from the pipeline by its name.
pipe.named_steps['logisticregression']


LogisticRegression()