<a href="https://colab.research.google.com/github/shiwangi27/googlecolab/blob/main/Text_Classifier_BOW_Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd

In [None]:
texts = [
  "The cat chased the mouse.", 
  "If jerry was a mouse, tom was a cat", 
  "Are you a cat lady or a plant lady", 
  "Are you a dog person?", 
  "I had an ugly pup named quasi-moto", 
  "And daisy liked pigeons!"
]

labels = [0, 1, 0, 1, 1, 0]

In [None]:
def preprocess(text):
  """Lowercase ``text`` and delete a fixed set of punctuation characters.

  The characters removed are ``? ! . ' " _``. Everything else, including
  commas and hyphens, is left untouched, so a token such as ``mouse,`` keeps
  its trailing comma.

  Args:
    text: The raw input string.

  Returns:
    The lowercased string with the listed characters removed.
  """
  text = text.lower()
  # Raw string: none of these characters needs escaping inside a character
  # class, and backslash escapes such as "\?" in a plain string literal are
  # invalid escape sequences that newer Python versions warn about.
  return re.sub(r"[?!.'\"_]", "", text)

def create_ngrams(text, n):
  """Return every run of ``n`` consecutive words in ``text`` as a tuple.

  Words are obtained by splitting ``text`` on single spaces. For example,
  with ``n=2`` the text ``"i am an animal"`` yields
  ``[("i", "am"), ("am", "an"), ("an", "animal")]``. For positive ``n``, a
  text with fewer than ``n`` words yields an empty list.
  """
  tokens = text.split(" ")
  window_starts = range(len(tokens) - n + 1)
  return [
    tuple(tokens[start + offset] for offset in range(n))
    for start in window_starts
  ]

def get_ngram_counts(texts, n):
  """Count how often each ``n``-gram occurs across all of ``texts``.

  Every text is expanded with ``create_ngrams`` and the resulting tuples are
  tallied in a single ``Counter`` keyed by n-gram tuple.
  """
  return Counter(
    gram
    for document in texts
    for gram in create_ngrams(document, n)
  )

def get_vocab(texts, ngram=None):
  """Build word counts and a word-to-index vocabulary from ``texts``.

  The texts are joined with single spaces and split on single spaces. Words
  are numbered from 1 in order of first appearance; index 0 is assigned to
  the ``"<UNK>"`` entry, which is added last. The ``ngram`` argument is
  accepted but not used.

  Returns:
    A ``(word_counts, vocab)`` pair: a ``Counter`` of word frequencies and a
    dict mapping each word to its integer index.
  """
  word_counts = Counter(" ".join(texts).split(" "))
  vocab = {}
  for index, word in enumerate(word_counts, start=1):
    vocab[word] = index
  vocab["<UNK>"] = 0
  return word_counts, vocab
  

In [None]:
cleaned_texts = list(map(preprocess, texts))
cleaned_texts

['the cat chased the mouse',
 'if jerry was a mouse, tom was a cat',
 'are you a cat lady or a plant lady',
 'are you a dog person',
 'i had an ugly pup named quasi-moto',
 'and daisy liked pigeons']

In [None]:
word_counts, vocab = get_vocab(cleaned_texts)

In [None]:
vocab

{'<UNK>': 0,
 'a': 8,
 'an': 20,
 'and': 25,
 'are': 11,
 'cat': 2,
 'chased': 3,
 'daisy': 26,
 'dog': 16,
 'had': 19,
 'i': 18,
 'if': 5,
 'jerry': 6,
 'lady': 13,
 'liked': 27,
 'mouse': 4,
 'mouse,': 9,
 'named': 23,
 'or': 14,
 'person': 17,
 'pigeons': 28,
 'plant': 15,
 'pup': 22,
 'quasi-moto': 24,
 'the': 1,
 'tom': 10,
 'ugly': 21,
 'was': 7,
 'you': 12}

In [None]:
bigrams = get_ngram_counts(cleaned_texts, 2)

In [None]:
bigrams

Counter({('a', 'cat'): 2,
         ('a', 'dog'): 1,
         ('a', 'mouse,'): 1,
         ('a', 'plant'): 1,
         ('an', 'ugly'): 1,
         ('and', 'daisy'): 1,
         ('are', 'you'): 2,
         ('cat', 'chased'): 1,
         ('cat', 'lady'): 1,
         ('chased', 'the'): 1,
         ('daisy', 'liked'): 1,
         ('dog', 'person'): 1,
         ('had', 'an'): 1,
         ('i', 'had'): 1,
         ('if', 'jerry'): 1,
         ('jerry', 'was'): 1,
         ('lady', 'or'): 1,
         ('liked', 'pigeons'): 1,
         ('mouse,', 'tom'): 1,
         ('named', 'quasi-moto'): 1,
         ('or', 'a'): 1,
         ('plant', 'lady'): 1,
         ('pup', 'named'): 1,
         ('the', 'cat'): 1,
         ('the', 'mouse'): 1,
         ('tom', 'was'): 1,
         ('ugly', 'pup'): 1,
         ('was', 'a'): 2,
         ('you', 'a'): 2})

In [None]:
X = ["I want to adopt a cat or a dog."]

In [None]:
X_clean = preprocess(X[0])

In [None]:
X_words = X_clean.split(" ")
X_words

['i', 'want', 'to', 'adopt', 'a', 'cat', 'or', 'a', 'dog']

In [None]:
X_bigrams = create_ngrams(X_clean, 2)
X_bigrams

[('i', 'want'),
 ('want', 'to'),
 ('to', 'adopt'),
 ('adopt', 'a'),
 ('a', 'cat'),
 ('cat', 'or'),
 ('or', 'a'),
 ('a', 'dog')]

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
def tokenizer(text):
  """Split ``text`` into a list of tokens on runs of whitespace."""
  tokens = text.split()
  return tokens

In [None]:
# Bag-of-words features over unigrams and bigrams, using this notebook's own
# preprocess() and whitespace tokenizer() instead of scikit-learn's defaults.
count_vectorizer = CountVectorizer(
    analyzer="word",
    max_df=0.5,          # drop terms found in more than half of the documents
    min_df=0.01,         # drop terms found in fewer than 1% of the documents
    preprocessor=preprocess,
    tokenizer=tokenizer,
    # token_pattern is ignored once a custom tokenizer is supplied; passing
    # None makes that explicit and avoids the "will not be used" warning.
    token_pattern=None,
    max_features=20,     # keep only the 20 most frequent terms
    ngram_range=(1, 2),
)

In [None]:
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=20, min_df=0.01,
                ngram_range=(1, 2),
                preprocessor=<function preprocess at 0x7f14a7b60ef0>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenizer at 0x7f1498b50440>,
                vocabulary=None)

In [None]:
count_vectors = count_vectorizer.fit_transform(texts) 

In [None]:
tf = count_vectors.toarray()
tf

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0],
       [2, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1],
       [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
count_df = pd.DataFrame(tf)

In [None]:
# Boolean mask: True where a document (row) contains the feature (column).
col_vals = count_df > 0

# Document frequency of each feature = number of documents in which its count
# is non-zero. Keys are the column labels of count_df.
document_freqs = {col: int(freq) for col, freq in col_vals.sum(axis=0).items()}

In [None]:
document_freqs

{0: 3,
 1: 2,
 2: 2,
 3: 2,
 4: 3,
 5: 1,
 6: 1,
 7: 1,
 8: 1,
 9: 1,
 10: 1,
 11: 1,
 12: 1,
 13: 1,
 14: 1,
 15: 1,
 16: 1,
 17: 1,
 18: 2,
 19: 2}

In [None]:
def calculate_tfidf(tf, i, document_freqs, n_docs=None):
  """Scale the term counts of one feature by its smoothed inverse document frequency.

  The weight is ``idf = ln((1 + n_docs) / (1 + df)) + 1``, where ``df`` is
  the number of documents that contain the feature.

  Args:
    tf: Term counts of feature ``i``, one entry per document (for example a
      column of the count matrix).
    i: Key of the feature in ``document_freqs``.
    document_freqs: Mapping from feature key to its document frequency.
    n_docs: Total number of documents. Defaults to ``len(tf)``; pass it
      explicitly when ``tf`` has no length (e.g. a scalar).

  Returns:
    ``tf`` multiplied by the idf weight of feature ``i``.
  """
  if n_docs is None:
    # The corpus size is the number of documents, i.e. the length of the
    # per-document count vector. len(document_freqs) would be the number of
    # features instead and gives the wrong idf.
    n_docs = len(tf)
  idf = np.log((1 + n_docs) / (1 + document_freqs[i])) + 1
  return tf * idf

def l2_normalizer(row):
  """Scale a vector to unit Euclidean (L2) length.

  Args:
    row: A pandas Series or any sequence of numbers.

  Returns:
    A pandas Series holding the scaled values. When ``row`` is a Series its
    index is kept; otherwise a default integer index is used. An all-zero
    vector is returned as zeros rather than NaN.
  """
  values = np.asarray(row, dtype=float)
  l2_norm = np.sqrt(np.sum(values * values))
  if l2_norm == 0:
    # Nothing to scale; dividing by zero would turn every entry into NaN.
    l2_norm = 1.0
  index = row.index if isinstance(row, pd.Series) else None
  return pd.Series(values / l2_norm, index=index)

In [None]:
tfidf_df = count_df.apply(lambda w: calculate_tfidf(w, w.name, document_freqs)) 
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,0.0,2.658228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.702751,0.0,0.0,0.0,0.0
1,5.316456,2.94591,0.0,0.0,2.658228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.702751,6.702751,0.0,0.0
2,5.316456,2.94591,2.94591,2.94591,2.658228,6.702751,0.0,0.0,3.351375,3.351375,0.0,0.0,3.351375,3.351375,0.0,0.0,0.0,0.0,2.94591,2.94591
3,2.658228,0.0,2.94591,2.94591,0.0,0.0,0.0,0.0,0.0,0.0,3.351375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.94591,2.94591
4,0.0,0.0,0.0,0.0,0.0,0.0,3.351375,3.351375,0.0,0.0,0.0,0.0,0.0,0.0,3.351375,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.351375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# axis=1 normalizes each row (one document) to unit length; the default
# axis=0 would instead scale every column (one feature) across documents.
tfidf_df.apply(l2_normalizer, axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.666667,0.707107,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.666667,0.707107,0.707107,0.707107,0.57735,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.707107,0.707107
3,0.333333,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
count_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0
2,2,1,1,1,1,2,0,0,1,1,0,0,1,1,0,0,0,0,1,1
3,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [None]:
count_vectorizer.vocabulary_

{'a': 0,
 'a cat': 1,
 'are': 2,
 'are you': 3,
 'cat': 4,
 'lady': 5,
 'named': 6,
 'named quasi-moto': 7,
 'or': 8,
 'or a': 9,
 'person': 10,
 'pigeons': 11,
 'plant': 12,
 'plant lady': 13,
 'pup': 14,
 'the': 15,
 'was': 16,
 'was a': 17,
 'you': 18,
 'you a': 19}

In [None]:
tfidf = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=True)

In [None]:
tfidf_vectors = tfidf.fit_transform(count_vectors)

In [None]:
tfidf_vectors.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.37847423,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.92561183, 0.        , 0.        , 0.        , 0.        ],
       [0.40787481, 0.28533265, 0.        , 0.        , 0.24089743,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.58914847, 0.58914847, 0.        , 0.        ],
       [0.33722839, 0.23591129, 0.23591129, 0.23591129, 0.19917252,
        0.48710435, 0.        , 0.        , 0.28769167, 0.28769167,
        0.        , 0.        , 0.28769167, 0.28769167, 0.        ,
        0.        , 0.        , 0.        , 0.23591129, 0.23591129],
       [0.3390679 , 0.        , 0.40161136, 0.40161136, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4897614 , 0.        , 0.        , 0

In [None]:
import numpy as np

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
feature_extraction_pipeline = Pipeline(
    [
     ("count_vectorizer", count_vectorizer), 
     ("tfidf_transformer", tfidf)
    ]
)

In [None]:
# L2-regularized logistic regression. LogisticRegression takes no `scoring`
# argument; the evaluation metric is chosen on GridSearchCV(scoring=...).
clf_logistic = LogisticRegression(penalty="l2",
                                  C=1.0,
                                  solver="liblinear",
                                  max_iter=1000)

# Alternative RBF-kernel SVM classifier (the pipeline below is built with
# clf_logistic).
clf_svm = SVC(C=1.0,
              kernel="rbf",
              gamma="auto")

In [None]:
clf_pipeline = Pipeline(
    [
     ("features", feature_extraction_pipeline), 
     ("classifier", clf_logistic)
    ]
)

In [None]:
estimater = clf_pipeline.fit(texts, labels)

In [None]:
X_test = ["my neighbors cat", "do you want a dog"]
Y_test = [0, 1]

In [None]:
preds = estimater.predict_proba(X_test)

In [None]:
preds

array([[0.53621213, 0.46378787],
       [0.46168848, 0.53831152]])

In [None]:
preds.argmax(axis=1)

array([0, 1])

In [None]:
# Hyper-parameter grid for the grid search. Keys follow the pipeline nesting,
# "<outer step>__<inner step>__<parameter>": "features" and "classifier" are
# the steps of clf_pipeline, "count_vectorizer" is a step of the nested
# feature-extraction pipeline.
param_grid = [
          {
              'features__count_vectorizer__ngram_range': [(1, 1), (1, 2)],
              'classifier__penalty': ["l2", "l1"],
              'classifier__C': np.logspace(-2, 2, 5)            
          }
          # Optional second grid (disabled). To enable it, add a comma after the
          # dictionary above and uncomment the block below. The vectorizer
          # options live on the nested "count_vectorizer" step.
          # {
          #     'features__count_vectorizer__min_df': [0.01, 0.05],
          #     'features__count_vectorizer__max_df': [0.7, 0.5], 
          #     'features__count_vectorizer__max_features': [20, 25], 
          #     'classifier__penalty': ["l2"],
          #     'classifier__C': np.logspace(-2, 2, 5)            
          # }
]

In [None]:
grid_search = GridSearchCV(estimator=clf_pipeline,
                           param_grid=param_grid,
                           scoring="f1_weighted",
                           cv=2, 
                           n_jobs=-1)

In [None]:
estimators = grid_search.fit(texts, labels)

In [None]:
estimators.best_estimator_

Pipeline(memory=None,
         steps=[('features',
                 Pipeline(memory=None,
                          steps=[('count_vectorizer',
                                  CountVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.int64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=0.5,
                                                  max_features=20, min_df=0.01,
                                                  ngram_range=(1, 1),
                                                  preprocessor=<function preprocess at 0x7f14a7b60ef0>,
                                                  stop_words=No...
                                  TfidfTransformer(norm='l2', smooth_idf=True,
          

In [None]:
estimators.best_score_

0.35

In [None]:
best_estimator = estimators.best_estimator_

In [None]:
preds = best_estimator.predict(X_test)

array([0, 0])

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round exchanges precision and recall in the report.
print(classification_report(Y_test, preds))