Citation: https://www.kaggle.com/code/baghern/a-deep-dive-into-sklearn-pipelines \
But with a lot of modifications, e.g., throwing out the performance

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the training data, drop incomplete rows and index by the 'id' column.
# DataFrame.dropna returns a new frame, so its result has to be kept; a bare
# `df.dropna(axis=0)` statement leaves df unchanged.
df = (
    pd.read_csv('./data/train.csv')
    .dropna(axis=0)
    .set_index('id')
)

df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [2]:
import re
from nltk.corpus import stopwords

# English stop words as a set; processing() below tests each token for membership in it
stopWords = set(stopwords.words('english'))

#creating a function to encapsulate preprocessing, to make it easy to replicate on submission data
def processing(df, stop_words=None):
    """Add a cleaned text column and simple numeric text features to ``df``.

    The frame is modified in place and also returned, so both
    ``processing(df)`` and ``df = processing(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column in which every value is a string
        (a missing value would fail on ``.lower()``).
    stop_words : container of str, optional
        Words ignored by the ``words_not_stopword`` and ``avg_word_length``
        features. Defaults to the module-level ``stopWords`` set.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added:
        ``processed`` (lower-cased text without punctuation), ``length``
        (characters in ``processed``), ``words`` (number of tokens),
        ``words_not_stopword`` (tokens not in ``stop_words``),
        ``avg_word_length`` (mean length of the non-stop-word tokens, 0.0 when
        there are none) and ``commas`` (commas in the original text).
    """
    if stop_words is None:
        stop_words = stopWords

    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))

    # Tokenise once and reuse. split() without an argument splits on any run of
    # whitespace, so the double spaces left behind by removed punctuation
    # ("a - b" -> "a  b") no longer produce empty-string "words".
    tokens = df['processed'].apply(str.split)
    content_tokens = tokens.apply(lambda ts: [t for t in ts if t not in stop_words])

    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(len)
    #get number of words
    df['words'] = tokens.apply(len)
    df['words_not_stopword'] = content_tokens.apply(len)
    #get the average word length (stop words excluded); 0.0 when no such word exists
    df['avg_word_length'] = content_tokens.apply(
        lambda ts: float(np.mean([len(t) for t in ts])) if ts else 0.0
    )
    #get the number of commas in the original (unprocessed) text
    df['commas'] = df['text'].apply(lambda x: x.count(','))

    return(df)

# processing() adds the feature columns to df and returns that same frame
df = processing(df)

df.head()

Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2


In [3]:
from sklearn.model_selection import train_test_split

# Model inputs: every column except the raw text and the label
# ('id' is filtered by name as well, exactly as before).
features = [col for col in df.columns.values if col not in ('id', 'text', 'author')]
# Same selection without the cleaned text column.
numeric_features = [col for col in features if col != 'processed']
target = 'author'

# Hold out 33% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.33,
    random_state=42,
)
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that selects a single column from the data frame so that
    additional transformations can be performed on it.
    Use on text columns in the data.
    """
    def __init__(self, key):
        # key: the label used to index X in transform()
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from X or y; returns self unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        return X[self.key]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation

import time

# Text branch: select the cleaned text column and vectorise it with TF-IDF.
text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer( stop_words='english'))

            ])

# Stage 1: TF-IDF only, to inspect the shape of the document-term matrix.
pipeline = Pipeline([
    ('text',text),
])

X_transformed = pipeline.fit_transform(X_train)
print("**1", X_transformed.shape, X_train.shape)


# Stage 2: reduce the TF-IDF matrix to 8 LDA topic weights per document.
# NOTE(review): LDA is fit on TF-IDF weights here; scikit-learn's examples feed
# it raw term counts -- confirm this is intended.
pipeline = Pipeline([
    ('text',text),
    ('lda', LatentDirichletAllocation(n_components=8, learning_method='online',
                                          random_state=0, verbose=0, n_jobs = -1) )
])

X_transformed = pipeline.fit_transform(X_train)
print("**2", X_transformed.shape, X_train.shape)

# Stage 3: full model -- TF-IDF -> LDA topics -> random forest classifier.
pipeline = Pipeline([
    ('text',text),
    ('lda', LatentDirichletAllocation(n_components=8, learning_method='online',
                                          random_state=0, verbose=0, n_jobs = -1) ),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

start_time = time.time()
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# Keep and report the hold-out accuracy: as a bare expression in the middle of
# the cell its value was computed and then silently discarded.
accuracy = np.mean(preds == y_test)
print("Total Time:", time.time()-start_time)
print("Accuracy:", accuracy)

**1 (13117, 21516) (13117, 6)
**2 (13117, 8) (13117, 6)
Total Time: 32.98590350151062


### GridSearchCV 

In [6]:
# List the pipeline's parameter names (nested as <step>__<param>) that a grid search can tune
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'text', 'lda', 'classifier', 'text__memory', 'text__steps', 'text__verbose', 'text__selector', 'text__tfidf', 'text__selector__key', 'text__tfidf__analyzer', 'text__tfidf__binary', 'text__tfidf__decode_error', 'text__tfidf__dtype', 'text__tfidf__encoding', 'text__tfidf__input', 'text__tfidf__lowercase', 'text__tfidf__max_df', 'text__tfidf__max_features', 'text__tfidf__min_df', 'text__tfidf__ngram_range', 'text__tfidf__norm', 'text__tfidf__preprocessor', 'text__tfidf__smooth_idf', 'text__tfidf__stop_words', 'text__tfidf__strip_accents', 'text__tfidf__sublinear_tf', 'text__tfidf__token_pattern', 'text__tfidf__tokenizer', 'text__tfidf__use_idf', 'text__tfidf__vocabulary', 'lda__batch_size', 'lda__doc_topic_prior', 'lda__evaluate_every', 'lda__learning_decay', 'lda__learning_method', 'lda__learning_offset', 'lda__max_doc_update_iter', 'lda__max_iter', 'lda__mean_change_tol', 'lda__n_components', 'lda__n_jobs', 'lda__perp_tol', 'lda__random_state', '

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def my_scorer(y_true, y_predicted):
    """Return the fraction of predictions that equal the true labels (accuracy).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predicted : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality; higher is better.
    """
    # https://stackoverflow.com/questions/46208221/scikit-learn-gridsearch-custom-scoring-function
    # Coerce to arrays so plain Python lists are compared element-wise;
    # `list == list` yields a single bool, which would make the score 0.0 or 1.0.
    accuracy = np.mean(np.asarray(y_true) == np.asarray(y_predicted))
    return accuracy

# Wrap my_scorer as a scikit-learn scorer; larger values are treated as better.
scoring_func = make_scorer(my_scorer, greater_is_better=True)

# Search grid, keyed by the pipeline's <step>__<param> names.
hyperparameters = dict(
    text__tfidf__max_df=[0.9, 0.95],
    text__tfidf__ngram_range=[(1, 1), (1, 2)],
    lda__n_components=[5],
    classifier__min_samples_leaf=[1, 2],
)
# Currently left out of the grid: 'classifier__max_depth': [50, 70]

start_time = time.time()
# 5-fold cross-validated search over every combination in the grid.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    scoring=scoring_func,
    cv=5,
)
# Fit and tune model
clf.fit(X_train, y_train)

In [11]:
# Elapsed seconds since start_time was set in the grid-search cell, measured at the moment this cell runs
print("Total Time:", time.time()-start_time)

Total Time: 1934.9938127994537


In [12]:
# Parameter combination that achieved the best cross-validated score in the search
clf.best_params_

{'classifier__min_samples_leaf': 2,
 'lda__n_components': 5,
 'text__tfidf__max_df': 0.9,
 'text__tfidf__ngram_range': (1, 1)}

In [13]:
# No explicit refit step is needed: GridSearchCV was created without overriding
# `refit` (default True), so clf.fit() already refit the best parameter
# combination on all of X_train. A bare `clf.refit` only reads that flag and
# does nothing, so it has been removed.
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

# Hold-out accuracy of the refit best estimator
np.mean(preds == y_test)

0.4025069637883008