## Grid Search coupling parameters from a text documents feature extractor (n-gram count vectorizer and TF-IDF transformer) 

Source: 
https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk import sent_tokenize
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
from nltk.tokenize import word_tokenize

import string
import joblib
import re
# set seed
np.random.seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
train_df = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Train.csv', header=0)
val_df = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Val.csv', header=0)

In [4]:
stopwords = nltk.corpus.stopwords.words('english')

def cleantext(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    tx = text.replace('&amp',' ')
    text = re.sub(r"[^a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stopwords]
    text = " ".join(text)
    return text

In [5]:
train_df['tweet'] = train_df['tweet'].map(lambda x: cleantext(x))
val_df['tweet'] = val_df['tweet'].map(lambda x: cleantext(x))

In [50]:
from pprint import pprint
from time import time
import logging

#from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# Display progress logs on stdout
#logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier()),
    ]
)

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    #"vect__max_df": (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False), 
    #'tfidf__norm': ('l1', 'l2'),
    #"clf__max_iter": (20,),
    #"clf__alpha": (0.00001, 0.000001),
    #"clf__penalty": ("l2", "elasticnet"),
    #'clf__max_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring="f1_micro" )

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(train_df['tweet'],train_df['label'])
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    grid_search.cv_results_

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False), 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
done in 6.237s

Best score: 0.934
Best parameters set:
	tfidf__use_idf: True
	vect__ngram_range: (1, 2)


### Results

The best parameters set is:

*   tfidf__use_idf: True (Which is the default)
*   Lvect__ngram_range: (1, 2) (means unigrams and bigrams)