## REQUIREMENTS

In [8]:
import sys
sys.path.append("/content/drive/MyDrive/DL_projects/text_classification/src")

In [2]:
import sys 
import json
import torch
import nltk
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import re
import gensim
import config
import helper
from sklearn.model_selection import train_test_split
from model import ClassifierModel
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
sw = stopwords.words('english')
from gensim.models import KeyedVectors
import model
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
config.local_base_dir = "/content/drive/MyDrive/DL_projects/text_classification/"
config.local_train_file = "dataset/imdb_dataset.csv"
config.prep_train_test_vocabed = "imdb_train_test_vocabed.pkl"
config.EMBED_SIZE = 32

In [44]:
data = pd.read_csv(config.local_base_dir +config.local_train_file)

In [45]:
mapping = {"positive" : 1,"negative":0}
data['sentiment'] = data['sentiment'].map(mapping) 
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## PREPROCESSING

In [46]:
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer  = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r"^[A-Za-z]","",text)
    # text = " ".join([x for x in text.split() if x not in sw])
    # text = " ".join([wordnet_lemmatizer.lemmatize(x) for x in text.split() ])
    # # text = " ".join([porter_stemmer.stem(x) for x in text.split()])
    # text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

In [47]:
def data_length(dataframe):
    all_words = set()
    all_sentence = list()
    for sentence in dataframe["prep_data"]:
        all_sentence.append(sentence.lower().split())
        for word in sentence.lower().split():
            all_words.add(word)
    len_all_words = len(all_words)
    len_all_sentence = len(all_sentence)
    print(f"Total number of words : {len_all_words}")
    print(f"Total number of sentence : {len_all_sentence}")
    print()
    return all_words,all_sentence

In [48]:
if __name__ == "__main__":
    prep_data = data["review"].apply(lambda x : preprocess_text(x)) 
    data["prep_data"] = prep_data
    all_words,all_sentence = data_length(data)
    samples_lbl = data["sentiment"]
    X_train, X_test, y_train, y_test = train_test_split(all_sentence,samples_lbl,test_size=0.2,shuffle=True)


Total number of words : 394208
Total number of sentence : 50000



In [49]:
data.head()

Unnamed: 0,review,sentiment,prep_data
0,One of the other reviewers has mentioned that ...,1,ne of the other reviewers has mentioned that a...
1,A wonderful little production. <br /><br />The...,1,wonderful little production. <br /><br />the ...
2,I thought this was a wonderful way to spend ti...,1,thought this was a wonderful way to spend tim...
3,Basically there's a family where a little boy ...,0,asically there's a family where a little boy (...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"etter mattei's ""love in the time of money"" is ..."


In [50]:
data['sentiment']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [51]:

pickle.dump({
    "X_train" : X_train,
    "X_test" : X_test,
    "y_train" : y_train.values.tolist(),
    "y_test" : y_test.values.tolist(),
},open(config.local_base_dir + "dataset/"+ config.prep_train_test_vocabed,'wb'))



In [52]:
config.local_base_dir + "dataset/"+ config.prep_train_test_vocabed

'/content/drive/MyDrive/DL_projects/text_classification/dataset/imdb_train_test_vocabed.pkl'

In [53]:
split_data = pickle.load(open(config.local_base_dir + "dataset/"+ config.prep_train_test_vocabed,'rb'))

# **Word to Vectors**

In [54]:
def build_word2vec(all_sentence,embedding_size):
    w2v_model = gensim.models.Word2Vec(sentences=all_sentence,min_count=1,vector_size= embedding_size)
    w2v_model.build_vocab(all_sentence)
    print("Length of samples : ",w2v_model.corpus_count)
    print("Length of vocab   : ",len(w2v_model.wv.key_to_index))
    print("Training and saving model...")
    w2v_model.train(all_sentence,total_examples=w2v_model.corpus_count,epochs=w2v_model.epochs)
    w2v_model.save(f"{config.local_base_dir}dataset/prep_word2vectors_{config.EMBED_SIZE}.model")
    wordvecs = KeyedVectors.load(f"{config.local_base_dir}dataset/prep_word2vectors_{config.EMBED_SIZE}.model")    
    all_words = list(wordvecs.wv.key_to_index.keys())
# wordvecs.wv.index_to_key = {v:k for k,v in wordvecs.wv.key_to_index.items()}
    word2index = {k:v+1 for k,v in wordvecs.wv.key_to_index.items()}
    index2word = {v:k for k,v in word2index.items()}
    
    matrix_vec = np.zeros((len(word2index)+1,config.EMBED_SIZE))
    for word,idx in word2index.items():
        vector_x = wordvecs.wv[word]
        matrix_vec[idx,:] = vector_x    
    pickle_data = {
    "word2index" : word2index,
    "index2word" : index2word,
    "embedding_vector" : matrix_vec
    }
    pickle.dump(pickle_data,open(config.local_base_dir+"dataset/prep_emb_vec.pkl",'wb'))
    print("Done")
    print(f'Model saved to {config.local_base_dir+"dataset/prep_emb_vec.pkl"}')
    

In [55]:
config.EMBED_SIZE = 32
build_word2vec(all_sentence,config.EMBED_SIZE)

Length of samples :  50000
Length of vocab   :  394208
Training and saving model...
Done
Model saved to /content/drive/MyDrive/DL_projects/text_classification/dataset/prep_emb_vec.pkl


In [2]:
import pickle

In [3]:
pickle_data =pickle.load(open("/content/drive/MyDrive/DL_projects/text_classification/trained_models/seq2seq_hidden_32_embed_32_imdb_word2vec.pt",'rb'))

UnpicklingError: A load persistent id instruction was encountered,
but no persistent_load function was specified.

In [57]:
word2index = pickle_data['word2index']
index2word = pickle_data['index2word']
word_vectors = pd.DataFrame(pickle_data['embedding_vector'])
word_vectors = word_vectors.iloc[:1000,:]
word_vectors.to_csv("imdb_embedding_matrix.tsv",sep ="\t",header = None,index = False)
index2word[0] = "[PAD]"
only_words = [index2word[x] for x in range(len(index2word))]
word_vectors["metadata"] =only_words
word_vectors.to_csv("imdb_meta_data.tsv",sep ="\t",index = False)

In [58]:
word_vectors = word_vectors.iloc[:1000,:]

(1000, 33)

In [67]:
import torch
checkpoint = torch.load("/content/drive/MyDrive/DL_projects/text_classification/trained_models/seq2seq_hidden_32_embed_32_imdb_prep_word2vec.pt")
checkpoint.keys()

dict_keys(['model_state_dict', 'params'])

In [68]:
params = checkpoint['params']
pickle_data =pickle.load(open(params["base_dir"] +params["emb_vec_file"],'rb'))
pickle_data.keys()

word2index = pickle_data['word2index']
index2word = pickle_data['index2word']

In [69]:
# n_vocab,hidden_size,out_hidden,embedding_dim,n_labels,max_seq
import model
my_model = model.ClassifierModel(len(word2index),params["HIDDEN_SIZE"],params["OUT_DIM"],params["EMBED_SIZE"],n_labels = params["n_labels"],max_seq=params['max_seq_len'])

In [70]:
my_model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [71]:
import pandas as pd
emb_matrix = my_model.embedding.weight.detach().numpy()

In [72]:
emb = pd.DataFrame(emb_matrix)

In [73]:
emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.215135,-2.039822,-1.991017,-1.874602,3.37553,-2.805663,1.647659,-2.811729,-3.131666,-1.880979,...,-0.563289,3.812085,-0.895236,0.120245,3.119622,0.190353,-1.900077,-2.479097,-0.22915,-0.14041
2,2.682374,-0.05988,-1.709466,0.929107,-0.586223,-0.812075,3.737223,-1.679718,-4.843754,1.652838,...,-0.728424,5.129432,-1.086249,-1.162141,0.742907,-1.912346,-3.197052,-2.823757,1.761539,-2.172132
3,3.583761,-2.87929,0.511685,-0.695925,3.157705,-0.685953,2.243956,-2.056694,-3.179038,-1.172965,...,0.711902,-0.934894,-2.205631,-0.492827,1.623653,-0.510843,-3.563397,-2.351779,0.207879,-1.253219
4,0.920667,-4.159053,1.667749,-2.287287,0.812242,-2.126994,3.033233,-4.145588,-2.513504,0.050523,...,-1.246631,1.154336,-0.579854,-2.764482,2.871315,-0.273873,-0.568649,-0.442415,-2.348348,-0.9895


In [74]:
emb.shape

(394209, 32)

In [75]:
len(index2word)

394208

In [76]:
emb = emb.iloc[:10000,:]

In [78]:
emb.to_csv("imdb_prep_embedding_matrix.tsv",sep = '\t',header=None,index = False)

In [79]:
pd.read_csv("/content/drive/MyDrive/DL_projects/text_classification/src/interpret/imdb_prep_embedding_matrix.tsv",sep = '\t').head()

Unnamed: 0,0.0,0.0.1,0.0.2,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,...,0.0.22,0.0.23,0.0.24,0.0.25,0.0.26,0.0.27,0.0.28,0.0.29,0.0.30,0.0.31
0,3.215135,-2.039822,-1.991017,-1.874602,3.37553,-2.805663,1.647659,-2.811729,-3.131666,-1.880979,...,-0.563289,3.812085,-0.895236,0.120245,3.119622,0.190353,-1.900078,-2.479097,-0.22915,-0.14041
1,2.682374,-0.05988,-1.709466,0.929107,-0.586223,-0.812075,3.737223,-1.679718,-4.843754,1.652837,...,-0.728424,5.129432,-1.086249,-1.162141,0.742907,-1.912346,-3.197052,-2.823757,1.76154,-2.172132
2,3.583761,-2.87929,0.511685,-0.695925,3.157705,-0.685953,2.243956,-2.056694,-3.179038,-1.172965,...,0.711902,-0.934894,-2.205631,-0.492827,1.623653,-0.510843,-3.563397,-2.351778,0.207879,-1.253219
3,0.920667,-4.159053,1.667749,-2.287288,0.812242,-2.126994,3.033233,-4.145588,-2.513504,0.050523,...,-1.24663,1.154336,-0.579854,-2.764482,2.871315,-0.273873,-0.568649,-0.442415,-2.348348,-0.9895
4,4.590484,-5.857524,-4.149547,1.851173,0.939141,0.385021,0.859584,-6.05591,1.353823,3.357303,...,0.859147,0.668078,0.218401,2.529448,3.724398,0.44508,-4.904037,1.30223,3.461204,1.466099


In [80]:
index2word[0] = "[PAD]"

In [81]:
word_vectors = pd.read_csv("/content/drive/MyDrive/DL_projects/text_classification/src/interpret/imdb_prep_embedding_matrix.tsv",sep = '\t',header = None)
print(word_vectors.shape,len(index2word))
word_vectors.head()
print(word_vectors.shape)

(10000, 32) 394209
(10000, 32)


In [82]:
only_words = [index2word[x] for x in range(len(index2word))]

In [83]:
word_vectors.shape,len(only_words),only_words[-1]

((10000, 32), 394209, 'firm);')

In [84]:
word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.215135,-2.039822,-1.991017,-1.874602,3.37553,-2.805663,1.647659,-2.811729,-3.131666,-1.880979,...,-0.563289,3.812085,-0.895236,0.120245,3.119622,0.190353,-1.900078,-2.479097,-0.22915,-0.14041
2,2.682374,-0.05988,-1.709466,0.929107,-0.586223,-0.812075,3.737223,-1.679718,-4.843754,1.652837,...,-0.728424,5.129432,-1.086249,-1.162141,0.742907,-1.912346,-3.197052,-2.823757,1.76154,-2.172132
3,3.583761,-2.87929,0.511685,-0.695925,3.157705,-0.685953,2.243956,-2.056694,-3.179038,-1.172965,...,0.711902,-0.934894,-2.205631,-0.492827,1.623653,-0.510843,-3.563397,-2.351778,0.207879,-1.253219
4,0.920667,-4.159053,1.667749,-2.287288,0.812242,-2.126994,3.033233,-4.145588,-2.513504,0.050523,...,-1.24663,1.154336,-0.579854,-2.764482,2.871315,-0.273873,-0.568649,-0.442415,-2.348348,-0.9895


In [85]:
only_words = only_words[:10000]

In [86]:
word_vectors["metadata"] =only_words

In [87]:
word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,metadata
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[PAD]
1,3.215135,-2.039822,-1.991017,-1.874602,3.37553,-2.805663,1.647659,-2.811729,-3.131666,-1.880979,...,3.812085,-0.895236,0.120245,3.119622,0.190353,-1.900078,-2.479097,-0.22915,-0.14041,the
2,2.682374,-0.05988,-1.709466,0.929107,-0.586223,-0.812075,3.737223,-1.679718,-4.843754,1.652837,...,5.129432,-1.086249,-1.162141,0.742907,-1.912346,-3.197052,-2.823757,1.76154,-2.172132,a
3,3.583761,-2.87929,0.511685,-0.695925,3.157705,-0.685953,2.243956,-2.056694,-3.179038,-1.172965,...,-0.934894,-2.205631,-0.492827,1.623653,-0.510843,-3.563397,-2.351778,0.207879,-1.253219,and
4,0.920667,-4.159053,1.667749,-2.287288,0.812242,-2.126994,3.033233,-4.145588,-2.513504,0.050523,...,1.154336,-0.579854,-2.764482,2.871315,-0.273873,-0.568649,-0.442415,-2.348348,-0.9895,of


In [88]:
word_vectors.to_csv("imdb_prep_meta_data.tsv",sep ="\t",index = False)

In [89]:
pd.read_csv("imdb_prep_meta_data.tsv",sep ="\t").head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,metadata
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[PAD]
1,3.215135,-2.039822,-1.991017,-1.874602,3.37553,-2.805663,1.647659,-2.811729,-3.131666,-1.880979,...,3.812085,-0.895236,0.120245,3.119622,0.190353,-1.900078,-2.479097,-0.22915,-0.14041,the
2,2.682374,-0.05988,-1.709466,0.929107,-0.586223,-0.812075,3.737223,-1.679718,-4.843754,1.652837,...,5.129432,-1.086249,-1.162141,0.742907,-1.912346,-3.197052,-2.823757,1.76154,-2.172132,a
3,3.583761,-2.87929,0.511685,-0.695925,3.157705,-0.685953,2.243956,-2.056694,-3.179038,-1.172965,...,-0.934894,-2.205631,-0.492827,1.623653,-0.510843,-3.563397,-2.351778,0.207879,-1.253219,and
4,0.920667,-4.159053,1.667749,-2.287288,0.812242,-2.126994,3.033233,-4.145588,-2.513504,0.050523,...,1.154336,-0.579854,-2.764482,2.871315,-0.273873,-0.568649,-0.442415,-2.348348,-0.9895,of


In [86]:
word_vectors

(140808, 17)

In [90]:
test_data = pd.read_csv("/content/drive/MyDrive/DL_projects/text_classification/dataset/imdb_dataset.csv")[:100]

In [91]:
test_data.iloc[0]['review']

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [92]:
for i in range(10):
    if test_data['sentiment'][i] == 'positive':
        print("POSITIVE")
        print(test_data.iloc[i]['review'])
        print()
    else:
        print("NEGATIVE")
        print(test_data.iloc[i]['review'])
        print()

POSITIVE
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due t

In [130]:
for i in range(5):
    if test_data[1][i] == 1:
        print("HAM")
        print(test_data[3][i])
        print()

    else:
        print("SPAM")
        print(test_data[3][i])
        print()


# test_data[3]

HAM
I have done a lot of international travel, both on business and as a tourist. For both types I assure you the best advice is also the oldest: Always drink the wine of the country. In this movie the archangel Michael comes to Earth on business, wraps it up quickly and decides to hang around for a little touring. Boy! Does he "drink the wine of the country."<br /><br />Could man be drunk forever with liquor, love and fights <br /><br />He'd lief rise up of mornings and lief lie down of nights.<br /><br />These are things you can't do in Heaven so he enjoys them while he's here! Of course it turns out he had a couple of other jobs to tackle and, if he is less direct about them than he was about the first one, he is just as successful. The final scene is a little schmaltzy but it is also wonderful. Jean Stapleton gets to dance with John Travolta.

HAM
One of the most frightening game experiences ever that will make you keep the lights on next to your bed. Great storyline with a romanti