# ***Importing Libraries***

In [None]:
!pip install textattack -q -q -q --exists-action i


import pandas as pd
import numpy as np 
import sys
from textattack.augmentation import EasyDataAugmenter

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Dropout, GRU, Lambda, Conv1D, MaxPooling1D


# ***Using NLTK and TextAttack for Augmentation***

In [None]:
import nltk
# Download the NLTK corpora named below (stopwords, WordNet, Open Multilingual WordNet).
# Presumably WordNet/OMW back the synonym operations of the TextAttack augmenter — confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw')
# Extract the downloaded archives in place under the shared nltk_data corpora directory.
! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
! unzip /usr/share/nltk_data/corpora/omw.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Number of augmented variants requested per input text; passed to the
# augmenter as `transformations_per_example`.
NUM = 8

In [None]:
from textattack.augmentation import EasyDataAugmenter
# Shared augmenter used for both the training corpus and the test problems:
# it is asked for NUM variants per text, with pct_words_to_swap set to 0.6.
augmenter = EasyDataAugmenter(
    transformations_per_example=NUM,
    pct_words_to_swap=0.6,
)

# ***Importing Data***

In [None]:
# Read the competition files.
train = pd.read_csv('/kaggle/input/math-problem-categorization/train.csv')
test = pd.read_csv('/kaggle/input/math-problem-categorization/test.csv')
sample_submission = pd.read_csv('/kaggle/input/math-problem-categorization/sample_submission.csv')

# Quick look at the training frame; each report is followed by a blank line.
print(train.head(), end="\n\n")
print(train.info(), end="\n\n")

# Count of duplicated vs. unique rows.
print(train.duplicated().value_counts(), end="\n\n")

# Number of rows whose label is missing.
print("Invalid Enties: ", train['category'].isna().sum(), end="\n\n")

# ***Splitting Data into Train, Val & Test sets & Preprocessing Code***

In [None]:
import re
def clean_text(text):
    """Normalise one raw problem statement before tokenisation.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; remove newline
    characters; collapse every run of digits into a single ``#``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \.) out of Python's own
    # string-escape processing, which rejects them as invalid escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are removed last among the deletions, so the patterns above
    # never match across a line break ('.' does not match '\n').
    text = text.replace('\n', '')
    text = re.sub(r'[0-9]+', '#', text)
    return text

In [None]:
# Cleaned problem statements as plain Python lists, plus the training labels.
x_train = [clean_text(problem) for problem in train['problem']]
x_test = [clean_text(problem) for problem in test["problem"]]
y_train = train['category'].to_numpy()

# Number of distinct labels present in the training data.
num_classes = np.unique(y_train).size

In [None]:
class preprocess():
    """Callable that turns raw text into padded integer sequences.

    The first call is the "fit" call: it augments every (text, label) pair
    with the module-level ``augmenter``, fits a Keras ``Tokenizer`` on the
    augmented corpus and records the vocabulary size and padding length.
    Later calls reuse the fitted tokenizer and only tokenize and pad.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Fitted Keras Tokenizer; None until the first call.
        self.tokenizer = None
        # Padding length, measured in tokens; None until the first call.
        self.max_len = None
        # Vocabulary size (len(word_index) + 1); None until the first call.
        self.total_words = None

    def __call__(self, corpus, y=None):
        """Tokenize and pad ``corpus``, fitting on the first call.

        Args:
            corpus: Iterable of text strings.
            y: Labels aligned with ``corpus``. Required on the first call,
                optional afterwards.

        Returns:
            The padded 2-D array when ``y`` is None, otherwise the list
            ``[padded, y]``. On the first call both entries cover the
            augmented corpus and ``y`` is a Python list of repeated labels.

        Raises:
            ValueError: If the first call is made without labels.
        """
        if self.tokenizer is None:
            if y is None:
                # Previously this surfaced as an opaque TypeError from zip().
                raise ValueError(
                    "the first call must provide labels so the augmented "
                    "corpus can be built"
                )

            newcorpus = []
            newlabels = []
            for problem, category in zip(corpus, y):
                # Keep the original text and add its augmented variants,
                # all sharing the original label.
                demo = [problem] + augmenter.augment(problem)
                newcorpus.extend(demo)
                newlabels.extend([category] * len(demo))
                print(len(newcorpus), len(newlabels))

            corpus = newcorpus
            y = newlabels

            # filters='' keeps every character (including the '#' digit marker).
            self.tokenizer = Tokenizer(oov_token='<OOV>', char_level=False, filters='')
            self.tokenizer.fit_on_texts(corpus)
            self.total_words = len(self.tokenizer.word_index) + 1

            sequences = self.tokenizer.texts_to_sequences(corpus)
            # Pad to the longest sequence in TOKENS. Using the character
            # length of the longest string (as before) padded every sample to
            # several times the length any sequence can actually have.
            self.max_len = max(len(sequence) for sequence in sequences)
        else:
            sequences = self.tokenizer.texts_to_sequences(corpus)

        padded = pad_sequences(sequences, maxlen=self.max_len, padding='pre')

        # `is None` rather than `== None`: comparing a NumPy label array with
        # `==` yields an array, which cannot be used as an `if` condition.
        if y is None:
            return padded
        return [padded, y]

# ***Final Augmented Data***

In [None]:
# Hold out the validation problems BEFORE augmenting. The preprocessor's first
# call augments every text it is given, so splitting afterwards put variants of
# the same problem on both sides of the split and inflated validation accuracy.
# NOTE: stratifying on the raw labels needs at least two problems per class.
raw_train, raw_val, labels_train, labels_val = train_test_split(
    x_train, y_train, test_size=0.2, stratify=y_train
)

preprocessor = preprocess()
# First call: augments the training problems and fits the tokenizer on them.
x_train, y_train = preprocessor(raw_train, labels_train)
# Later calls only tokenize and pad, so validation data stays un-augmented.
x_val = preprocessor(raw_val)
y_val = labels_val

# NOTE(review): this padded x_test is not read again in the visible code; the
# test-augmentation cell rebuilds x_test from `test`. Confirm it is still needed.
new_x_test = []
for problem in x_test:
    new_x_test += [problem] + augmenter.augment(str(problem))
x_test = new_x_test

x_test = preprocessor(x_test)

x_train = np.array(x_train)
y_train = np.array(y_train)
x_val = np.array(x_val)
y_val = np.array(y_val)


# ***Model Checkpoint for saving most suitable weights***

In [None]:
# Writes the model weights to a per-epoch file (epoch number zero-padded to
# two digits in the file name).
# NOTE(review): save_best_only is not set, so monitor/mode presumably do not
# restrict which epochs get written — confirm against the Keras docs.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'checkpoints/weights.{epoch:02d}.hdf5',
    monitor='val_accuracy',
    mode='max',
    save_weights_only=True,
)

# ***Getting Glove Embeddings***

In [None]:
# !wget https://nlp.stanford.edu/data/glove.840B.300d.zip
# !unzip glove*.zip

In [None]:
import gc
# Force a garbage-collection pass; presumably to free memory before the large
# embedding file is loaded in the next cell — confirm.
gc.collect()

In [None]:
# Location of the GloVe vector file that is read below.
EMBEDDING_FILE = '/kaggle/working/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Build one ``(word, vector)`` pair from the fields of an embedding line.

    Args:
        word: The token (first field of the line).
        *arr: The remaining fields, converted to float32.

    Returns:
        A tuple of the word and a float32 array holding at most the first
        300 converted values.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector[:300]
# Parse the embedding file into {word: float32 vector}. The context manager
# closes the file handle (it was previously left open), and the encoding is
# explicit so parsing does not depend on the platform's default locale.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in glove_file)

# np.stack needs a real sequence; a dict view is deprecated and rejected by
# newer NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
# NOTE(review): these look like precomputed mean/std of the embedding values
# (all_embs.mean() / all_embs.std() would recompute them) — confirm.
emb_mean,emb_std = -0.005838499,0.48782197
embed_size = all_embs.shape[1]

In [None]:
def load_glove(word_index):
    """Build the embedding matrix for a tokenizer vocabulary.

    Reads the module-level ``max_features``, ``emb_mean``, ``emb_std``,
    ``embed_size`` and ``embeddings_index``.

    Args:
        word_index: Mapping of word -> integer index.

    Returns:
        Array of shape ``(min(max_features, len(word_index) + 1), embed_size)``.
        Every row starts as a draw from a normal distribution with mean
        ``emb_mean`` and standard deviation ``emb_std``; rows whose word (or
        its capitalised form) is found in ``embeddings_index`` are overwritten
        with that vector.
    """
    row_count = min(max_features, len(word_index) + 1)
    matrix = np.random.normal(emb_mean, emb_std, (row_count, embed_size))

    for token, row in word_index.items():
        # Indices beyond the vocabulary cap have no row in the matrix.
        if row >= max_features:
            continue

        vector = embeddings_index.get(token)
        if vector is None:
            # Fall back to the capitalised spelling of the word.
            vector = embeddings_index.get(token.capitalize())

        if vector is not None:
            matrix[row] = vector

    return matrix


# Vocabulary size recorded by the fitted preprocessor; load_glove reads it as
# a module-level global, so it must be assigned before the call below.
max_features = preprocessor.total_words
# Embedding matrix with one row per index of the fitted tokenizer's vocabulary.
embedding_matrix = load_glove(preprocessor.tokenizer.word_index)

# ***Model Architecture***

In [None]:
# Embedding (initialised from the GloVe matrix) -> bidirectional LSTM ->
# dropout -> softmax over the problem categories.
model = Sequential([
    Embedding(
        max_features,
        300,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    ),
    Bidirectional(LSTM(32, dropout=0.5)),
    tf.keras.layers.Flatten(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Sparse loss: labels are passed as integers rather than one-hot vectors.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# ***Training Model***

In [None]:
# Train for 60 epochs, evaluating on the validation split after each epoch and
# running the checkpoint callback.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=60,
    callbacks=[model_checkpoint_callback],
)

# ***Plotting History***

In [None]:
# Side-by-side curves of loss and accuracy for the train and validation sets.
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (16, 4)
mylen = range(len(history.history["loss"]))

panels = (
    ("loss", "val_loss", "Loss"),
    ("accuracy", "val_accuracy", "Accuracy"),
)
for position, (train_key, val_key, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(mylen, history.history[train_key], label="Train")
    plt.plot(mylen, history.history[val_key], label="Val")
    plt.title(title)
    plt.legend()

plt.show()


# ***Loading Most Suitable Weights***

In [None]:
# Pick the epoch with the highest validation accuracy from the recorded
# training history instead of hard-coding an epoch number, which only suited
# one particular run. Checkpoint file names are numbered from 1, hence the +1.
best_epoch = int(np.argmax(history.history['val_accuracy'])) + 1
model.load_weights(f'checkpoints/weights.{best_epoch:02d}.hdf5')

# ***Augmenting Test Dataset***

Each sample is augmented multiple times, and I take the mode of the resulting predictions.

In [None]:
# For every test problem, build one padded batch holding the original text
# plus its augmented variants, and remember how many rows that batch has.
x_test = [clean_text(problem) for problem in test["problem"]]
new_x_test = []
for problem in x_test:
    variants = [problem] + augmenter.augment(str(problem))
    batch = preprocessor(variants)
    new_x_test.append([batch, len(batch)])
    print([batch.shape, len(batch)])
    print("New Length:", len(new_x_test))
    
    
    

# ***Predicting Results***

In [None]:
# `stats` was used below without ever being imported (the only import is in a
# commented-out cell), which raised NameError on the first iteration.
from scipy import stats

# One final label per test problem: predict a class for the original text and
# each augmented variant, then keep the most frequent class.
soln = []
for sample, length in new_x_test:
    # Row vector with one predicted class index per variant of this problem.
    total_preds = np.argmax(model(sample), axis=1).reshape(-1, length)
    ans = stats.mode(total_preds, axis=1, keepdims=False)[0].reshape(-1)
    print(ans, end="")
    soln.append(ans[0])

In [None]:
# Attach the voted labels to the test frame and write it out without the index.
test = test.assign(category=soln)
test.to_csv("Raunaq.csv", index=False)

In [None]:
# from scipy import stats
# total_preds = np.argmax(model(x_test),axis = 1).reshape(-1,NUM)
# total_preds = stats.mode(total_preds,axis = 1)[0].reshape(-1)