<a href="https://colab.research.google.com/github/smrigank/Technical-Discussion/blob/main/Industry_classification_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:

class TransformerBlock(layers.Layer):
    """Transformer encoder block: multi-head self-attention plus a feed-forward net.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Width of the token representations. Used as the attention
            `key_dim` and as the output width of the feed-forward net, so the
            residual additions line up (assumes the input's last dimension is
            also `embed_dim`).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward net.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard `keras.layers.Layer` keyword arguments (e.g. `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply self-attention and the feed-forward net to `inputs`.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers. Defaults to None so the
                block can be called as `block(x)` without relying on the
                framework to supply the flag.

        Returns:
            Tensor produced by the second layer normalization.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:

class TokenAndPositionEmbedding(layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of distinct positions the position table holds.
        vocab_size: Number of distinct token ids the token table holds.
        embed_dim: Output width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # The sequence length is read from the input's last axis at call time.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
# Colab-only step: open the browser upload widget so the dataset CSV can be
# saved into the runtime before it is read below.
from google.colab import files 
uploaded = files.upload()

Saving Industry_classification.csv to Industry_classification.csv


In [None]:
import pandas as pd

# Read the uploaded dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/Industry_classification.csv')

In [None]:
# Class distribution: number of rows per industry label.
data['Industry'].value_counts()

Manufacturing      424
Agriculture        319
Infrastructure     284
Travel&Tourism     186
Energy_Fuel        176
Services           151
BFI                144
Others             125
Pharma             109
Construction        91
IT_ITES             91
Mining              85
Consumer_goods      73
Food_Processing     69
Education           68
Communication       68
Gems_Jewellry       47
Name: Industry, dtype: int64

In [None]:
# Distinct industry names, in order of first appearance, as a plain list.
categories = data['Industry'].unique().tolist()
print(type(categories))
print(categories)

<class 'list'>
['Manufacturing', 'BFI', 'Agriculture', 'Travel&Tourism', 'Infrastructure', 'Food_Processing', 'Energy_Fuel', 'Communication', 'Construction', 'Services', 'Mining', 'Pharma', 'Consumer_goods', 'Education', 'IT_ITES', 'Gems_Jewellry', 'Others']


In [None]:
from sklearn import preprocessing

# Learn the set of industry names, then overwrite the column with their
# integer codes.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['Industry'])
data['Industry'] = label_encoder.transform(data['Industry'])
data['Industry'].unique()

array([11,  1,  0, 16, 10,  7,  6,  2,  3, 15, 12, 14,  4,  5,  9,  8, 13])

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and test partitions.
tuple(frame.shape for frame in (train_df, test_df))

((2008, 2), (502, 2))

In [None]:
# Pull the text column and the encoded label column out of each partition as
# plain Python lists.
training_docs, training_label = (
    train_df[column].tolist() for column in ('Content', 'Industry')
)
testing_docs, testing_label = (
    test_df[column].tolist() for column in ('Content', 'Industry')
)

In [None]:
# Sanity check: the test documents were converted with .tolist(), so this
# should display `list`.
type(testing_docs)

list

In [None]:
# Text-preprocessing settings shared by the tokenizer, the padding step and
# the model's input/embedding layers.
vocab_size = 20000  # Only consider the top 20k words
maxlen = 300  # Length passed to pad_sequences; also the model's input length
oov_tok = "<OOV>"  # Token given to the Tokenizer as its out-of-vocabulary marker

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the word index on the training documents only; the test documents are
# encoded with that same index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_docs)

# Convert every document into a sequence of integer word ids.
training_sequences = tokenizer.texts_to_sequences(training_docs)
testing_sequences = tokenizer.texts_to_sequences(testing_docs)

# Bring every sequence to exactly `maxlen` ids.
x_train, x_val = (
    keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (training_sequences, testing_sequences)
)

In [None]:

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
# One softmax unit per encoded industry label. Derived from the fitted encoder
# instead of a hard-coded 17, so the head stays correct if the label set changes.
num_classes = len(label_encoder.classes_)

# Token ids -> token + position embeddings -> one transformer block.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Average over the sequence axis, then a small dense classification head.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Pass labels as NumPy arrays rather than plain Python lists: the features are
# NumPy arrays, and mixing them with a list of ints is likely to be rejected
# by Keras' input handling (NOTE(review): confirm on the target TF version).
y_train = np.asarray(training_label)
y_val = np.asarray(testing_label)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # labels are integer class ids
    metrics=["accuracy"],
)
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    # x_val is built from the test documents, so it must be paired with the
    # test labels. The original passed training_label here, whose length does
    # not match x_val.
    validation_data=(x_val, y_val),
)

ValueError: ignored