In [None]:
# Environment setup: install the third-party packages into the runtime.
# NOTE(review): no versions are pinned, so a fresh run picks up whatever
# releases are current at that time.
!pip install nltk
!pip install gensim
# -U upgrades an existing install, -q keeps pip's output quiet.
!pip install -U -q PyDrive
!pip install keras-tuner

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the current Colab user, then hand the application-default
# Google credentials to PyDrive to build the `drive` client used by the
# download cells.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Reference the Drive file by its id and store its content as 'keywords.txt',
# the text corpus the Word2Vec cell reads.
downloaded = drive.CreateFile({'id':'1jkfp6FnOrhhDzpVeWQrZxJhr3w_n8DMX'})
downloaded.GetContentFile('keywords.txt')

In [None]:
# Reference the Drive file by its id and store its content as 'sample1.csv',
# the labelled data set loaded with pandas further down.
downloaded = drive.CreateFile({'id':'1aji9cGzB0edut2UVqQC_tkewtSAQcf6p'}) # replace the id with the id of the file you want to access
downloaded.GetContentFile('sample1.csv')

In [None]:
import nltk
# Tokenizer data needed by sent_tokenize / word_tokenize.
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt'. The install above is unpinned, so fetch both; download()
# reports failure through its return value rather than raising.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

# Silence all warnings before gensim is imported.
warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

# Read the whole 'keywords.txt' corpus. The context manager closes the file
# handle as soon as the text has been read, even if read() raises.
with open('keywords.txt', "r") as sample:
    s = sample.read()

# Replace newlines with spaces so the text is tokenized as one continuous string
f = s.replace("\n", " ")

# Training corpus for Word2Vec: one inner list of lower-cased word tokens for
# every sentence that sent_tokenize finds in the text.
keywords = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

# gensim 4 renamed Word2Vec's `size` argument to `vector_size` and replaced
# `wv.vocab` with `wv.key_to_index`. The install is unpinned, so pick the
# spelling that matches whichever major version is actually present.
_gensim_4_or_newer = int(gensim.__version__.split(".")[0]) >= 4
_size_kwarg = "vector_size" if _gensim_4_or_newer else "size"

# Create CBOW model: 100-dimensional vectors, context window of 5, and
# min_count = 1 so that words seen only once still receive a vector.
# (Pass sg = 1 as well to train a skip-gram model instead.)
model = gensim.models.Word2Vec(keywords, min_count = 1, window = 5,
                               **{_size_kwarg: 100})

# Both vocabulary containers are keyed by word, so list() yields the words.
_vocab = model.wv.key_to_index if _gensim_4_or_newer else model.wv.vocab
words = list(_vocab)

# Plain dict word -> vector, consumed when the embedding matrix is filled.
embeddings_index = {}
for word in words:
  embeddings_index[word]=model.wv[word]

#for attr, value in embeddings_index.items():
  #print(str(attr)+" "+str(value))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import os, re, csv, math, codecs

# Plot styling for seaborn figures.
sns.set_style("whitegrid")
# Seed NumPy's global random generator.
# NOTE(review): only NumPy is seeded here; whether the Keras backend's own
# randomness is pinned elsewhere cannot be seen from this section — confirm.
np.random.seed(0)

# NOTE(review): not referenced anywhere else in the visible cells.
MAX_NB_WORDS = 100000
# NOTE(review): `tokenizer` is re-bound to a Keras Tokenizer() in a later cell
# before this RegexpTokenizer is used in the visible cells.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Load the labelled samples and pull out the two columns the pipeline uses.
data = pd.read_csv("sample1.csv")
feature_column, label_column = 'ngram feature vector', 'class'
ngrams_list = data[feature_column].tolist()
label_list = data[label_column].tolist()

# Two-column one-hot encoding of the class labels.
one_hot_labels = keras.utils.to_categorical(label_list, num_classes=2)

In [None]:
# Lower-case each n-gram string, split it on whitespace, keep only the tokens
# that are not members of `keywords`, and re-join them with single spaces.
# NOTE(review): `keywords` is built earlier as a list of token *lists*, so a
# string token never equals one of its elements and nothing is filtered out.
# Confirm whether a flat collection of words was intended before changing
# this, because the embedding lookup later expects these tokens to have vectors.
processed_list = [
    " ".join(token for token in sentence.lower().split() if token not in keywords)
    for sentence in tqdm(ngrams_list)
]

100%|██████████| 92/92 [00:00<00:00, 231063.45it/s]


In [None]:
# One row per processed n-gram string, plus its length counted in
# single-space-separated pieces.
df = pd.DataFrame({"ngrams": processed_list})
df['doc_len'] = [len(text.split(" ")) for text in df['ngrams']]

# Sequence length used for padding: mean length plus one standard deviation,
# rounded and cast to an integer.
doc_len_mean = df['doc_len'].mean()
doc_len_std = df['doc_len'].std()
max_seq_len = np.round(doc_len_mean + doc_len_std).astype(int)
max_seq_len

4

In [None]:
# Hold out 30% of the processed samples (and their one-hot labels) for testing.
# No random_state is passed, so this call itself does not pin the shuffle.
x_train, x_test, y_train, y_test = train_test_split(processed_list, one_hot_labels, test_size=0.3)

In [None]:
# Fit a Keras tokenizer on the union of the training and test texts, then turn
# each split into lists of integer word ids.
tokenizer = Tokenizer()
all_texts = x_train + x_test
tokenizer.fit_on_texts(all_texts)

to_sequences = tokenizer.texts_to_sequences
word_seq_train, word_seq_test = to_sequences(x_train), to_sequences(x_test)

# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

dictionary size:  15


In [None]:
# Length of every id sequence (training split first, then test) as an array.
num_tokens = np.array(list(map(len, word_seq_train + word_seq_test)))

In [None]:
# Bring both splits to the common length max_seq_len that the Embedding layer
# is configured with.
word_seq_train, word_seq_test = [
    sequence.pad_sequences(id_sequences, maxlen=max_seq_len)
    for id_sequences in (word_seq_train, word_seq_test)
]

In [None]:
# Drop the last test sample together with its label so inputs and targets stay
# aligned.
# NOTE(review): the reason for discarding this sample is not evident from the
# notebook. The cell re-binds the same names, so every re-execution removes one
# more sample — run it only once per kernel session.
word_seq_test = word_seq_test[:-1]
y_test = y_test[:-1]

In [None]:
#training params
batch_size = 4   # intended mini-batch size
num_epochs = 8   # intended number of training epochs

#model parameters
num_filters = 64      # filters in each Conv1D layer
embed_dim = 100       # Embedding width; has to match the 100-dim Word2Vec vectors
weight_decay = 1e-4   # factor passed to regularizers.l2 on the Dense(32) layer

In [None]:
# Weight matrix for the Embedding layer: one row per tokenizer id, embed_dim
# columns. The extra row covers index 0, which the loop below never writes
# (presumably the padding index — word_index ids appear to start at 1).
embedding_matrix = np.zeros((len(word_index)+1, embed_dim))
for word, index in tokenizer.word_index.items():
    # .get() yields None for a token that has no Word2Vec vector (the Keras
    # tokenizer and the NLTK tokenizer may split text differently). Such rows
    # stay all-zero instead of aborting the cell with a KeyError.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# CNN text classifier declared as one ordered layer list.
# Note: this re-binds `model`, which previously held the Word2Vec model.
model = Sequential([
    # Frozen embedding initialised from the Word2Vec-derived matrix.
    Embedding(len(word_index)+1, embed_dim, weights=[embedding_matrix],
              input_length=max_seq_len, trainable=False),
    Dropout(0.2),
    # Two width-5 convolutions with a pooling step in between.
    Conv1D(num_filters, 5, padding='same', activation='relu', strides=1),
    MaxPooling1D(2),
    Conv1D(num_filters, 5, activation='relu', padding='same'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    # Classification head: L2-regularised hidden layer, 2-way softmax output.
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            1600      
_________________________________________________________________
dropout_3 (Dropout)          (None, 4, 100)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 4, 64)             32064     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2, 64)             0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2, 64)             20544     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)               

In [None]:
# Train with the batch size and epoch count declared in the training-params
# cell. A bare fit() call ignores them and falls back to the Keras defaults
# (the previously recorded run shows a single epoch).
model.fit(word_seq_train, y_train, batch_size=batch_size, epochs=num_epochs)

Epoch 1/1


<keras.callbacks.History at 0x7fe3400c3ba8>

In [None]:
# Score the trained model on the held-out test sequences; the displayed list is
# the compiled loss followed by the accuracy metric.
model.evaluate(word_seq_test, y_test)



[0.6968455910682678, 0.5555555820465088]