# Pushkar's dataset

In [None]:
import tensorflow as tf
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves mini-batches from a feature matrix.

    Rows of ``X`` are selected per batch and, when the selection exposes a
    ``toarray`` method (e.g. a SciPy sparse matrix), converted to a dense
    array just before being returned, so only one batch is dense at a time.

    Args:
        X: Feature matrix exposing ``shape[0]`` and indexable by an integer
            array of row numbers.
        y: Labels aligned with the rows of ``X``, indexable the same way.
        batch_size: Maximum number of samples per batch.
        num_classes: Stored on the instance; not used by the generator itself.
        shuffle: Whether to reshuffle the sample order at the end of each epoch.
    """

    def __init__(self, X, y, batch_size=32, num_classes=None, shuffle=True):
        super().__init__()
        self.batch_size = batch_size
        self.X = X
        self.y = y
        self.indices = np.arange(X.shape[0])
        self.num_classes = num_classes
        self.shuffle = shuffle
        # Build the initial (optionally shuffled) sample order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, including a final partial one."""
        # Ceiling division: flooring would silently drop the trailing samples
        # and report zero batches when the data is smaller than one batch.
        return (len(self.indices) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        start = index * self.batch_size
        # ``self.index`` holds the epoch's ordering of positions into
        # ``self.indices``; the slice is simply shorter for the last batch.
        batch = self.indices[self.index[start:start + self.batch_size]]
        return self.__get_data(batch)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is enabled."""
        self.index = np.arange(len(self.indices))
        if self.shuffle:
            np.random.shuffle(self.index)

    def __get_data(self, batch):
        """Gather the rows listed in ``batch`` and densify sparse features."""
        X = self.X[batch]
        # ``toarray`` gives a plain ndarray (``todense`` would give the legacy
        # ``np.matrix``); inputs that are already dense are passed through.
        if hasattr(X, "toarray"):
            X = X.toarray()
        return X, self.y[batch]

In [None]:
import numpy as np
# Load the cleaned training array; column 0 is read as the text and column 1
# as the class label.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only safe for
# trusted files.
data = np.load(
    "/kaggle/input/cleaned-product-browse-node-classification/train_cleaned.npy",
    allow_pickle=True,
)
X, y = data[:, 0], data[:, 1]

In [None]:
import pandas as pd
# Assemble the raw arrays into a two-column frame: document text and label.
df = pd.DataFrame(columns=['text', 'class'])
df['text'] = X
df['class'] = y
# Replace missing text with an empty string. Only the 'text' column is filled:
# assigning the whole filled frame (both columns) to a single column is invalid.
df['text'] = df['text'].fillna("")

In [None]:
# Labels of the classes that occur in more than 170 rows.
classes = df['class'].value_counts().loc[lambda counts: counts > 170].index

In [None]:
# Keep only the rows whose class is among the selected classes.
df = df.loc[df['class'].isin(classes)]

In [None]:
# Display the number of rows per class in df.
df['class'].value_counts()

In [None]:
# Pull the text and label columns back out of the frame as NumPy arrays.
X, y = df["text"].to_numpy(), df["class"].to_numpy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
# Vectorise the documents as TF-IDF features, keeping at most 15000 terms.
cv = TfidfVectorizer(max_features=15000)
# Learn the vocabulary from X and replace X with the resulting feature matrix.
X = cv.fit_transform(X)


In [None]:
# Display the dimensions of the TF-IDF feature matrix.
X.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit an encoder on the labels; it is used later to turn labels into integer
# ids for training and to map predicted ids back to the original labels.
le = LabelEncoder()
le.fit(y)

In [None]:
from tensorflow.keras.layers import Dense, Reshape, BatchNormalization, Dropout
from tensorflow.keras.models import Sequential

In [None]:
# Import the submodule explicitly; a bare `import sklearn` does not guarantee
# that `sklearn.model_selection` is loaded. This still binds the `sklearn` name.
import sklearn.model_selection

# Hold out 10% of the rows for validation.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)
# Train only on the training split: building this generator from the full X/y
# would leak the held-out rows into training and inflate validation accuracy.
train_datagen = DataGenerator(X_train, le.transform(y_train), shuffle=True, batch_size=2048)
test_datagen = DataGenerator(X_test, le.transform(y_test), shuffle=True, batch_size=2048)

In [None]:
def mish(x):
    """Apply ``x * tanh(log(1 + exp(x)))`` to ``x`` through a Keras ``Lambda`` layer."""

    def _activation(t):
        softplus = tf.math.log(1 + tf.exp(t))
        return t * tf.tanh(softplus)

    return tf.keras.layers.Lambda(_activation)(x)

In [None]:
# Fully connected classifier: four swish-activated hidden blocks
# (Dense -> BatchNormalization -> Dropout) followed by a softmax output with
# one unit per distinct label in y.
# NOTE(review): the last hidden block is followed by a second
# BatchNormalization before the output layer -- confirm this is intended.
model = Sequential([
    # Hidden block 1
    Dense(2048, tf.keras.activations.swish),
    BatchNormalization(),
    Dropout(0.2),
    # Hidden block 2
    Dense(2048, tf.keras.activations.swish),
    BatchNormalization(),
    Dropout(0.2),
    # Hidden block 3
    Dense(2048, tf.keras.activations.swish),
    BatchNormalization(),
    Dropout(0.2),
    # Hidden block 4
    Dense(1024, tf.keras.activations.swish),
    BatchNormalization(),
    Dropout(0.2),
    # Output head
    BatchNormalization(),
    Dense(len(np.unique(y)), "softmax"),
])
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs on train_datagen, using test_datagen as validation data.
model.fit(train_datagen, epochs=10, validation_data = test_datagen)

In [None]:
# Load the cleaned test array; later cells read column 0 as the product id and
# column 1 as the text.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only safe for
# trusted files.
test = np.load("/kaggle/input/cleaned-product-browse-node-classification/test_cleaned.npy", allow_pickle=True)

In [None]:
# Empty frame that will hold the test product ids and their text.
test_df = pd.DataFrame(columns=["PRODUCT_ID", "text"])

In [None]:
# Populate the frame from the test array: ids from column 0, text from column 1.
test_df["PRODUCT_ID"], test_df["text"] = test[:, 0], test[:, 1]

In [None]:
# Vectorise the test text with the vocabulary fitted on the training data.
# Missing text is replaced with "" first, matching the training preprocessing,
# because the vectoriser cannot process NaN/None documents.
X_test = cv.transform(pd.Series(test[:, 1]).fillna(""))

In [None]:
# Predict in chunks of 1000 rows so only one chunk is densified at a time.
pred = []
for i in range(0, X_test.shape[0], 1000):
    # `Sequential.predict_classes` was removed in TensorFlow 2.6; the argmax of
    # the softmax output yields the same class ids on every TF 2.x release.
    # `toarray` returns a plain ndarray rather than the legacy `np.matrix`.
    p = np.argmax(model.predict(X_test[i:i + 1000].toarray()), axis=-1)
    pred.extend(p)

In [None]:
# Map the predicted integer class ids back to the original labels.
pred = le.inverse_transform(pred)

In [None]:
# Attach the predicted labels, drop the raw text column and write the
# submission file without the index.
test_df["BROWSE_NODE_ID"] = pred
test_df.drop(columns=["text"], inplace=True)
test_df.to_csv("output27.csv", index=False)