## Tokenization

In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/santiago/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using simple split

In [9]:
sentence = "hi, how are you?"
sentence.split()

['hi,', 'how', 'are', 'you?']

Using `word_tokenize()` function from nltk

In [10]:
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

## CountVectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# create a corpus of sentences
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!"
]

# initialize CountVectorizer with word_tokenize form nltk
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

# fit the vectorizer on corpus
ctv.fit(corpus)

corpus_transformed = ctv.transform(corpus)

In [20]:
print(corpus_transformed)

  (0, 2)	1
  (0, 4)	1
  (0, 7)	1
  (0, 14)	1
  (0, 16)	1
  (0, 27)	1
  (1, 3)	1
  (1, 4)	2
  (1, 6)	1
  (1, 8)	1
  (1, 9)	1
  (1, 12)	1
  (1, 13)	1
  (1, 15)	1
  (1, 18)	1
  (1, 22)	1
  (1, 24)	1
  (1, 27)	2
  (2, 5)	1
  (2, 10)	1
  (2, 11)	1
  (2, 19)	1
  (2, 27)	1
  (3, 0)	1
  (3, 1)	1
  (3, 17)	1
  (3, 20)	1
  (3, 21)	1
  (3, 23)	1
  (3, 25)	1
  (4, 0)	4
  (4, 26)	1


In [21]:
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


## Logistic Regression with CountVectorizer

Let's get a baseline model to try to predict if a review is negative or positive

In [27]:
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer


if __name__ == "__main__":
    df = pd.read_csv("../input/imbd.csv")
    
    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    
    # create a new column called kfold fill with -1
    df["kfold"] = -1
    
    # randomnize the row
    df = df.sample(frac=1).reset_index(drop=True)
    
    # fetch the labels
    y = df.sentiment.values
    
    # initialize the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f
        
    # go over the created folds
    for fold_ in range(5):
        train_df = df[df["kfold"] != fold_].reset_index(drop=True)
        test_df = df[df["kfold"] == fold_].reset_index(drop=True)
        
        # initialize CountVectorizer with word_tokenize form nltk
        count_vec = CountVectorizer(
            tokenizer=word_tokenize,
            token_pattern=None
        )
        
        # fit count_vec on training data reviews
        count_vec.fit(train_df.review)
        
        # transform training and validation data reviews
        xtrain = count_vec.transform(train_df.review)
        xtest = count_vec.transform(test_df.review)
        
        # initialize logistic regression model
        model = linear_model.LogisticRegression()
        
        # fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)
        
        # make predictions on test data
        preds = model.predict(xtest)
        
        # calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)
        
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold: 0
Accuracy = 0.8935



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold: 1
Accuracy = 0.8887



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold: 2
Accuracy = 0.8948



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold: 3
Accuracy = 0.8931

Fold: 4
Accuracy = 0.8931



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

# create a corpus of sentences
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!"
]

# initialize TfidfVectorizer with word_tokenize form nltk
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

# fit the vectorizer on corpus
tfv.fit(corpus)

corpus_transformed = tfv.transform(corpus)

In [5]:
print(corpus_transformed)

  (0, 27)	0.2965698850220162
  (0, 16)	0.4428321995085722
  (0, 14)	0.4428321995085722
  (0, 7)	0.4428321995085722
  (0, 4)	0.35727423026525224
  (0, 2)	0.4428321995085722
  (1, 27)	0.35299699146792735
  (1, 24)	0.2635440111190765
  (1, 22)	0.2635440111190765
  (1, 18)	0.2635440111190765
  (1, 15)	0.2635440111190765
  (1, 13)	0.2635440111190765
  (1, 12)	0.2635440111190765
  (1, 9)	0.2635440111190765
  (1, 8)	0.2635440111190765
  (1, 6)	0.2635440111190765
  (1, 4)	0.42525129752567803
  (1, 3)	0.2635440111190765
  (2, 27)	0.31752680284846835
  (2, 19)	0.4741246485558491
  (2, 11)	0.4741246485558491
  (2, 10)	0.4741246485558491
  (2, 5)	0.4741246485558491
  (3, 25)	0.38775666010579296
  (3, 23)	0.38775666010579296
  (3, 21)	0.38775666010579296
  (3, 20)	0.38775666010579296
  (3, 17)	0.38775666010579296
  (3, 1)	0.38775666010579296
  (3, 0)	0.3128396318588854
  (4, 26)	0.2959842226518677
  (4, 0)	0.9551928286692534


## Logistic Regression with TF-IDF

In [6]:
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer


if __name__ == "__main__":
    df = pd.read_csv("../input/imbd.csv")
    
    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    
    # create a new column called kfold fill with -1
    df["kfold"] = -1
    
    # randomnize the row
    df = df.sample(frac=1).reset_index(drop=True)
    
    # fetch the labels
    y = df.sentiment.values
    
    # initialize the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, "kfold"] = f
        
    # go over the created folds
    for fold_ in range(5):
        train_df = df[df["kfold"] != fold_].reset_index(drop=True)
        test_df = df[df["kfold"] == fold_].reset_index(drop=True)
        
        # initialize CountVectorizer with word_tokenize form nltk
        tfidf_vec = TfidfVectorizer(
            tokenizer=word_tokenize,
            token_pattern=None
        )
        
        # fit count_vec on training data reviews
        tfidf_vec.fit(train_df.review)
        
        # transform training and validation data reviews
        xtrain = tfidf_vec.transform(train_df.review)
        xtest = tfidf_vec.transform(test_df.review)
        
        # initialize logistic regression model
        model = linear_model.LogisticRegression()
        
        # fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)
        
        # make predictions on test data
        preds = model.predict(xtest)
        
        # calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)
        
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")

Fold: 0
Accuracy = 0.8933

Fold: 1
Accuracy = 0.9016

Fold: 2
Accuracy = 0.8967



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold: 3
Accuracy = 0.8935

Fold: 4
Accuracy = 0.9001

