In [2]:
# import necessary packages
import pandas as pd
from sklearn import model_selection

###  Creating 5 folds in the input csv

In [None]:
def create_folds(path):
    
    # loading the csv file
    data = pd.read_csv(path)
    
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # map positive to 1 and negative to 0
    data.sentiment = data.sentiment.apply(
                lambda x : 1 if x =="positive" else 0
    )
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)
    
    y = data.sentiment.values
    
    skf = model_selection.StratifiedKFold(n_splits=5)
    
    for idx, (t_, v_) in enumerate(skf.split(X=data, y=y)):
        data.loc[v_, "kfold"] = idx
        
    print("Folds csv created!!")
    return data.to_csv("../input/IMDB_Dataset-experimented_folds.csv", index=False)

if __name__ == "__main__":
    path = "../input/IMDB_Dataset.csv"
    create_folds(path)

### Training the logistic Regression model on IMDb Dataset with Bag of words Approach

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

In [4]:
def run(fold):
    
    df = pd.read_csv("../input/IMDB_Dataset-experimented_folds.csv")
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    
    # CountVectorizer() is fitted on training data as fitting on the whole dataset will take a lot of time
    # and vocabulary (words used in sentences) is more-over same in test & train. 
    # So by fitting on whole dataset is just repeation of the words & time consuming.
    corpus_trsfmed = ctv.fit(df_train.review)
    
    x_train = corpus_trsfmed.transform(df_train.review)
    y_train = df_train.sentiment.values
    
    x_valid = corpus_trsfmed.transform(df_valid.review)   
    y_valid = df_valid.sentiment.values
    
    model = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
    
    model.fit(x_train, y_train)
    
    preds = model.predict(x_valid)
    
    accuracy = metrics.accuracy_score(preds, y_valid)
    
    print(f"\t Fold: {fold}, Accuracy score: {accuracy}")
    
if __name__ == "__main__":
    for fold in range(5):
        run(fold)

KeyboardInterrupt: 

****

**CONCLUSION: Wow, we are already at 89% accuracy, and all we did was use bag of words with
logistic regression! This is super amazing! However, this model took a lot of time
to train, let’s see if we can improve the time by using naïve bayes classifier. Naïve
bayes classifier is quite popular in NLP tasks as the sparse matrices are huge and
naïve bayes is a simple model. To use this model, we need to change one import
and the line with the model. Let’s see how this model performs. We will use
MultinomialNB from scikit-learn.**

****

### Training the Naive Bayes model on IMDb Dataset with Bag of words Approach

In [16]:
import pandas as pd
from sklearn import naive_bayes
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
def run(fold):
    df = pd.read_csv("../input/IMDB_Dataset-experimented_folds.csv")
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold != fold].reset_index(drop=True)
    
    ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    
    count_vec = ctv.fit(df_train.review)
    
    x_train = count_vec.transform(df_train.review)
    y_train = df_train.sentiment.values
    
    x_valid = count_vec.transform(df_valid.review)
    y_valid = df_valid.sentiment.values
    
    model = naive_bayes.MultinomialNB()
    
    model.fit(x_train, y_train)
    
    preds = model.predict(x_valid)
    
    acc = metrics.accuracy_score(preds, y_valid)
    
    print(f" Fold: {fold}, Accuracy: {acc}")
    
if __name__ == "__main__":
    for fold in range(5):
        run(fold)

 Fold: 0, Accuracy: 0.896075
 Fold: 1, Accuracy: 0.89605
 Fold: 2, Accuracy: 0.8952
 Fold: 3, Accuracy: 0.898925
 Fold: 4, Accuracy: 0.89855


****

**Conclusion: We see that this score is low. But the naïve bayes model is superfast.**

****

## Note:

**here we are getting float for each word, whereas in CountVectorizer getting count of each word in a sentence.
The drawback of CountVectorizer is:
    different words have same count, may have different index but have same count.
    word having greater count have more influence thus the approach is not that good**

TF-IDF represent count by float so better than CountVectorizer, still this appproach there is some influence of the larger number.


State of Art approach is embeddig.


## Let's try Tfdif Approach

In [9]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import linear_model

In [11]:
def run(fold):
    
    df = pd.read_csv("../input/IMDB_Dataset-folds.csv")
    
    df_train = df[df["kfold"]!=fold].reset_index(drop=True)
    df_test = df[df["kfold"]==fold].reset_index(drop=True)
    
    tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    tfv.fit(df_train.review)
    
    x_train = tfv.transform(df_train.review)
    x_test = tfv.transform(df_test.review)
    
    logres = linear_model.LogisticRegression(max_iter=1000)
    
    # fit the model on training data reviews and sentiment
    logres.fit(x_train, df_train.sentiment.values)
    
    # make predictions on test data threshold for predictions is 0.5
    ypreds = logres.predict(x_test)
    
    acc = metrics.accuracy_score(ypreds, df_test.sentiment.values)
    
    print(f"Fold: {fold}, Accuracy: {acc}")
    
if __name__ == "__main__":
    for fold in range(5):
        run(fold)

Fold: 0, Accuracy: 0.8981
Fold: 1, Accuracy: 0.9016
Fold: 2, Accuracy: 0.8964
Fold: 3, Accuracy: 0.893
Fold: 4, Accuracy: 0.8971


****

**Conclusion: We see that these scores are a bit higher than CountVectorizer, and thus, it becomes
the new benchmark that we would want to beat.**

****

## Another interesting concept in NLP is n-grams. 

**Both CountVectorizer and TfidfVectorizer implementations of scikit-learn offers ngrams
by ngram_range parameter, which has a minimum and maximum limit. By
default, this is (1, 1). When we change it to (1, 3), we are looking at unigrams,
bigrams and trigrams.**

### Let's Try this approach

**TfidfVectorizer with trigrams ngram_range=(1, 3) whereas for bigrams ngram_range=(1, 2)**

In [6]:
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from sklearn import linear_model

In [7]:

def sklearn(fold):
    
    df = pd.read_csv("../input/IMDB_Dataset-folds.csv")
    
    df_train = df[df["kfold"]!=fold].reset_index(drop=True)
    df_test = df[df["kfold"]==fold].reset_index(drop=True)
    
    tfv = TfidfVectorizer(
                tokenizer=word_tokenize, 
                token_pattern=None,
                ngram_range=(1, 2)   # this parameter offers bi-gram
    )
    tfv.fit(df_train.review)
    
    x_train = tfv.transform(df_train.review)
    x_test = tfv.transform(df_test.review)
    
    logres = linear_model.LogisticRegression(max_iter=1000)
    
    # fit the model on training data reviews and sentiment
    logres.fit(x_train, df_train.sentiment.values)
    
    # make predictions on test data threshold for predictions is 0.5
    ypreds = logres.predict(x_test)
    
    acc = metrics.accuracy_score(ypreds, df_test.sentiment.values)
    
    print(f"Fold: {fold}, Accuracy: {acc}")
    
if __name__ == "__main__":
    for fold in range(5):
        run(fold)

Fold: 0, Accuracy: 0.8962
Fold: 1, Accuracy: 0.9037
Fold: 2, Accuracy: 0.8975
Fold: 3, Accuracy: 0.896
Fold: 4, Accuracy: 0.9041


### TfidfVectorizer with trigrams ngram_range=(1, 3) wreas for bigrams ngram_range=(1, 2)

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model


def run(fold):

    df = pd.read_csv("../inputs/IMDB_Dataset-folds.csv")

    df_train = df[df.kfold != fold].reset_index(drop=True)

    df_valid = df[df.kfold == fold].reset_index(drop=True)

    tfv_trigram = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None,
        ngram_range=(1, 3)
    )

    tfv_trigram.fit(df_train.review)

    x_train = tfv_trigram.transform(df_train.review)

    x_valid = tfv_trigram.transform(df_valid.review)

    model = linear_model.LogisticRegression()

    model.fit(x_train, df_train.sentiment.values)

    yhat = model.predict(x_valid)

    acc = metrics.accuracy_score(yhat, df_valid.sentiment.values)

    print(f"Fold: {fold}, Accuracy: {acc}")


if __name__ == "__main__":
    for fold in range(5):
        run(fold)

## There is one more topic i.e Topic Extraction which can be performed using SVD. 

**>> Refer understanding_svd_tf for this topic && Next is word_embedding for ML Model and
then for LSTM & Transformer**

## Understanding sentense

In [16]:
embedding_dict = {
    "the": [1, 2, 3],
    "is":[7, 9, 1],
    "map":[0, 2, 1],
    "boy":[2, 3, 4]
    
}

In [38]:

def sentence_vec(text, embedding_dict, stopwords, tokenizer):
    """
    Given a sentence and other parameters this function returns embeddings for whole sentence.
    1. Takes the sent and performed pre-processing
    2. For each word from the sent pull_out vectors associated with the word and store inthe list.
    3. List would be like : [[x1, x2, x3, x4....... x300---->> for word1],
                             [x1, x2, x3, x4....... x300---->> for word2],
                             [x1, x2, x3, x4....... x300---->> for word3]]
                            
                            
    : param text : any input sentence
    : param embedding_dict : dict {word:vector}
    : param stopwords : list of stopswords
    : param tokenizer : a tokenization func
    """
    
    # converting the text to the lower case
    words = str(text).lower()
    
    # tokenization of the sentence
    words = tokenizer(words)
    
    # removing the stopwords from the words 
    words = [word for word in words if word not in stopwords]
    
    # keeping only alpha-numeric tokens
    words = [word for word in words if word.isalpha()]
      
    M = []
    
    for word in words:
        # if word as key in embedding_dict then store its value in list.
        if word in embedding_dict:
            M.append(embedding_dict[word])
            
    if len(M) ==0:
        return np.zeros(300)
    
    print(M)
    M = np.array(M)
    
    print("Array storing the embeddings\n", M)
    
    # calculate sum along row, i.e for each sentences
    v = M.sum(axis=0)
    print(v)
    # Normalizing the vector
    return v/np.sqrt((v**2).sum())

In [39]:
from nltk.tokenize import word_tokenize
sentence_vec(text="the boy", embedding_dict=embedding_dict, stopwords=[], tokenizer=word_tokenize)

[[1, 2, 3], [2, 3, 4]]
Array storing the embeddings
 [[1 2 3]
 [2 3 4]]
[3 5 7]


array([0.32929278, 0.5488213 , 0.76834982])

In [48]:
x = [[1, 2, 3], [2, 3, 4]]
x = np.array(x)
x

array([[1, 2, 3],
       [2, 3, 4]])

In [49]:
x = x.sum(axis=0)
x

array([3, 5, 7])

In [50]:
x/np.sqrt((x**2).sum())

array([0.32929278, 0.5488213 , 0.76834982])