In [449]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [450]:
df = pd.read_csv("https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [451]:
X = df[["Sex", "Name"]]
y = df["Survived"]

In [452]:
vect = CountVectorizer()
ohe = OneHotEncoder()
ct = make_column_transformer(
    (ohe,["Sex"]),
    (vect, "Name")
)

In [453]:
knn_clf = KNeighborsClassifier(n_neighbors=2)
log_clf = LogisticRegression(solver= "liblinear", random_state=1)

In [454]:
log_pipe = make_pipeline(ct, log_clf)

knn_pipe = make_pipeline(ct, knn_clf)

## cross validate whole pipeline

In [455]:
# for log reg model
cross_val_score(log_pipe, X, y, cv=5, scoring='accuracy').mean()

0.8024543343167408

In [456]:
##for knn model
cross_val_score(knn_pipe, X, y, cv=5, scoring="accuracy").mean()

0.7564936287740882

# grid search 

In [457]:
##seperate with __
params = {}
params["columntransformer__countvectorizer__min_df"] = [1, 2]
params["logisticregression__penalty"] = ['l1', 'l2']
params["logisticregression__C"] = [0.1, 1, 10]
params

{'columntransformer__countvectorizer__min_df': [1, 2],
 'logisticregression__penalty': ['l1', 'l2'],
 'logisticregression__C': [0.1, 1, 10]}

In [458]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=log_pipe,
                   param_grid=params,
                   cv= 5,
                   scoring="accuracy")
grid.fit(X, y)
grid_cv = grid.cv_results_
grid_cv

{'mean_fit_time': array([0.01581168, 0.01700282, 0.01758618, 0.01678562, 0.02543292,
        0.01763239, 0.01598163, 0.01439419, 0.01503286, 0.01659145,
        0.03939462, 0.01639948]),
 'std_fit_time': array([0.00043266, 0.00089112, 0.00047953, 0.00071605, 0.00267416,
        0.00049617, 0.00214289, 0.00045398, 0.00065162, 0.00135108,
        0.00458951, 0.00149797]),
 'mean_score_time': array([0.00520234, 0.00498629, 0.00521789, 0.00541563, 0.00536981,
        0.00517859, 0.00501776, 0.00459595, 0.00517755, 0.00498848,
        0.00520716, 0.00641108]),
 'std_score_time': array([0.00097927, 0.00062832, 0.00040765, 0.00079266, 0.00049174,
        0.00074088, 0.0006136 , 0.00046689, 0.00038506, 0.0006501 ,
        0.0003978 , 0.00195174]),
 'param_columntransformer__countvectorizer__min_df': masked_array(data=[1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_val

In [459]:
# we had 0.80 accuracy for cross_val with default parameters
grid.best_score_

0.8215177954930638

In [460]:
grid.best_index_

2

In [461]:
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 1,
 'logisticregression__penalty': 'l1'}

In [462]:
pd.DataFrame(grid_cv)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__countvectorizer__min_df,param_logisticregression__C,param_logisticregression__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015812,0.000433,0.005202,0.000979,1,0.1,l1,{'columntransformer__countvectorizer__min_df':...,0.804469,0.803371,0.786517,0.752809,0.786517,0.786737,0.018667,9
1,0.017003,0.000891,0.004986,0.000628,1,0.1,l2,{'columntransformer__countvectorizer__min_df':...,0.804469,0.803371,0.786517,0.752809,0.786517,0.786737,0.018667,9
2,0.017586,0.00048,0.005218,0.000408,1,1.0,l1,{'columntransformer__countvectorizer__min_df':...,0.849162,0.814607,0.803371,0.786517,0.853933,0.821518,0.026142,1
3,0.016786,0.000716,0.005416,0.000793,1,1.0,l2,{'columntransformer__countvectorizer__min_df':...,0.815642,0.814607,0.803371,0.752809,0.825843,0.802454,0.025823,7
4,0.025433,0.002674,0.00537,0.000492,1,10.0,l1,{'columntransformer__countvectorizer__min_df':...,0.854749,0.808989,0.820225,0.780899,0.842697,0.821512,0.025928,3
5,0.017632,0.000496,0.005179,0.000741,1,10.0,l2,{'columntransformer__countvectorizer__min_df':...,0.832402,0.797753,0.814607,0.780899,0.848315,0.814795,0.023971,4
6,0.015982,0.002143,0.005018,0.000614,2,0.1,l1,{'columntransformer__countvectorizer__min_df':...,0.804469,0.803371,0.786517,0.752809,0.786517,0.786737,0.018667,9
7,0.014394,0.000454,0.004596,0.000467,2,0.1,l2,{'columntransformer__countvectorizer__min_df':...,0.804469,0.803371,0.786517,0.752809,0.786517,0.786737,0.018667,9
8,0.015033,0.000652,0.005178,0.000385,2,1.0,l1,{'columntransformer__countvectorizer__min_df':...,0.849162,0.814607,0.803371,0.786517,0.853933,0.821518,0.026142,1
9,0.016591,0.001351,0.004988,0.00065,2,1.0,l2,{'columntransformer__countvectorizer__min_df':...,0.821229,0.825843,0.803371,0.764045,0.837079,0.810313,0.025556,6


In [463]:
log_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                   ('countvectorizer', CountVectorizer(), 'Name')])),
  ('logisticregression',
   LogisticRegression(random_state=1, solver='liblinear'))],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), ['Sex']),
                                 ('countvectorizer', CountVectorizer(), 'Name')]),
 'logisticregression': LogisticRegression(random_state=1, solver='liblinear'),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('onehotencoder',
   OneHotEncoder(),
   ['Sex']),
  ('countvectorizer', CountVectorizer(), 'Name')],
 'columntransformer__verbose': False,
 'columntransformer__verbose_feature_names_out': True,
 'columntransfo

# random grid search

In [464]:
X = df["Name"]
y = df["Survived"]

In [465]:
vectorizer = CountVectorizer()
# vectorizer.fit_transform(X).toarray()

In [466]:
# ct = make_column_transformer(
#     (vectorizer,"Name"),
#     (vectorizer, "Sex")
# )
# ct.fit_transform(X).toarray()

In [467]:
from sklearn.naive_bayes import MultinomialNB

pipe = make_pipeline(vectorizer, MultinomialNB())

cross_val_score(pipe, X, y, cv=5, scoring="accuracy").mean()

0.8001820350260498

In [468]:
pipe.get_params()

{'memory': None,
 'steps': [('countvectorizer', CountVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'verbose': False,
 'countvectorizer': CountVectorizer(),
 'multinomialnb': MultinomialNB(),
 'countvectorizer__analyzer': 'word',
 'countvectorizer__binary': False,
 'countvectorizer__decode_error': 'strict',
 'countvectorizer__dtype': numpy.int64,
 'countvectorizer__encoding': 'utf-8',
 'countvectorizer__input': 'content',
 'countvectorizer__lowercase': True,
 'countvectorizer__max_df': 1.0,
 'countvectorizer__max_features': None,
 'countvectorizer__min_df': 1,
 'countvectorizer__ngram_range': (1, 1),
 'countvectorizer__preprocessor': None,
 'countvectorizer__stop_words': None,
 'countvectorizer__strip_accents': None,
 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'countvectorizer__tokenizer': None,
 'countvectorizer__vocabulary': None,
 'multinomialnb__alpha': 1.0,
 'multinomialnb__class_prior': None,
 'multinomialnb__fit_prior': True}

In [469]:
import scipy as sp

params = {}
params["countvectorizer__min_df"] = [1, 2, 3, 4]
params["countvectorizer__lowercase"] = [True, False]
params["multinomialnb__alpha"] = sp.stats.uniform(scale=1)  # can use np.linspace or np.logspace
params

{'countvectorizer__min_df': [1, 2, 3, 4],
 'countvectorizer__lowercase': [True, False],
 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen at 0x26ed9b28640>}

In [470]:
##randomly choose and eval parameters for n_iter times
from sklearn.model_selection import RandomizedSearchCV

rand = RandomizedSearchCV(estimator=pipe,
                  param_distributions=params,
                  n_iter=10,
                  cv=5,
                  scoring='accuracy',
#                   random_state=1
                         )

rand.fit(X, y)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('countvectorizer',
                                              CountVectorizer()),
                                             ('multinomialnb',
                                              MultinomialNB())]),
                   param_distributions={'countvectorizer__lowercase': [True,
                                                                       False],
                                        'countvectorizer__min_df': [1, 2, 3, 4],
                                        'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026ED9B28640>},
                   scoring='accuracy')

In [471]:
rand.best_score_

0.8035653756826313

In [472]:
rand.best_params_

{'countvectorizer__lowercase': True,
 'countvectorizer__min_df': 2,
 'multinomialnb__alpha': 0.6040524199825402}

In [473]:
results = pd.DataFrame(rand.cv_results_)
results.sort_values('rank_test_score')[["params","mean_test_score","rank_test_score"]]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'countvectorizer__lowercase': True, 'countvec...",0.803565,1
6,"{'countvectorizer__lowercase': False, 'countve...",0.803565,1
4,"{'countvectorizer__lowercase': False, 'countve...",0.800201,3
8,"{'countvectorizer__lowercase': False, 'countve...",0.799071,4
7,"{'countvectorizer__lowercase': False, 'countve...",0.797966,5
3,"{'countvectorizer__lowercase': False, 'countve...",0.797947,6
9,"{'countvectorizer__lowercase': True, 'countvec...",0.796836,7
2,"{'countvectorizer__lowercase': False, 'countve...",0.796824,8
5,"{'countvectorizer__lowercase': True, 'countvec...",0.796818,9
1,"{'countvectorizer__lowercase': False, 'countve...",0.790095,10


In [474]:
results["params"].values

array([{'countvectorizer__lowercase': True, 'countvectorizer__min_df': 2, 'multinomialnb__alpha': 0.6040524199825402},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 4, 'multinomialnb__alpha': 0.7781056099172469},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 4, 'multinomialnb__alpha': 0.01770245913252977},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 4, 'multinomialnb__alpha': 0.16307274249774706},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 3, 'multinomialnb__alpha': 0.5166357457715272},
       {'countvectorizer__lowercase': True, 'countvectorizer__min_df': 2, 'multinomialnb__alpha': 0.3061893597776443},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 3, 'multinomialnb__alpha': 0.4179740395675551},
       {'countvectorizer__lowercase': False, 'countvectorizer__min_df': 3, 'multinomialnb__alpha': 0.7722755421602978},
       {'countvectorizer__lowercase': Fa

In [475]:
X = df[["Name","Sex"]]



ct = make_column_transformer(
    (vectorizer, "Name"),
    (vectorizer, "Sex")
)
ct.fit_transform(X, y)
ct.get_feature_names_out()

array(['countvectorizer-1__aaron', 'countvectorizer-1__abbing',
       'countvectorizer-1__abbott', ..., 'countvectorizer-1__zimmerman',
       'countvectorizer-2__female', 'countvectorizer-2__male'],
      dtype=object)