In [None]:
#@title Mount drive files
from google.colab import drive

# Mount Google Drive under /content/drive/; the data-processing cell reads its
# CSV archives from /content/drive/MyDrive/... so this must run first.
drive.mount('/content/drive/')

In [None]:
#@title install libraries

# NOTE(review): no version is pinned on any of these installs, so a fresh
# runtime may resolve different releases than the ones this notebook was
# written against.
# NOTE(review): `transformers` and `stopwords_guilannlp` are not imported by
# any cell visible in this notebook -- confirm they are still needed.
!pip install transformers
!pip install hazm
# presumably provides the `cleantext` module imported in the next cell
!pip install -q clean-text[gpl]
!pip install stopwords_guilannlp


In [None]:
#@title Load libraries

import os
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import plotly.express as px
import plotly.graph_objects as go

from tensorflow import keras
from cleantext import clean
from sklearn.model_selection import train_test_split

from hazm import *

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Dropout
from keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.metrics import categorical_accuracy, categ


In [None]:
#@title Define data cleaning/processing functions 

def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed individually. ``.`` does
    not match a newline here, so a tag that is split across lines is left
    in place.
    """
    return re.sub(r'<.*?>', '', raw_html)


# Character class of emoji, pictographs and invisible directional marks that
# cleaning() strips. Compiled once here instead of on every call.
_WEIRD_CHARS_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"  (zero-width non-joiner is intentionally NOT stripped)
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# hazm Normalizer shared by all cleaning() calls; created on first use so that
# merely defining this cell does not require hazm to be importable yet.
_CLEANING_NORMALIZER = None


def cleaning(text):
    """Clean one raw description string and return the cleaned text.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. run clean-text's ``clean`` with the flags below (lower-casing,
         removing line breaks, and replacing URLs, e-mails, phone numbers and
         currency symbols with the empty string; numbers, digits and
         punctuation are kept);
      3. delete HTML tags with ``cleanhtml``;
      4. normalize with a default-configured hazm ``Normalizer``;
      5. delete emoji / pictograph / directional-mark characters;
      6. delete ``#`` characters and collapse whitespace runs to one space.

    Args:
        text: the raw text; must be a ``str`` (``.strip()`` is called on it).

    Returns:
        The cleaned string. A single leading or trailing space can remain
        after step 6 because the result is not stripped again.
    """
    global _CLEANING_NORMALIZER
    if _CLEANING_NORMALIZER is None:
        # Building a Normalizer per row is wasted work when this function is
        # applied over a whole DataFrame column, so build it once and reuse it.
        _CLEANING_NORMALIZER = Normalizer()

    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    text = _CLEANING_NORMALIZER.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks)
    text = _WEIRD_CHARS_PATTERN.sub('', text)

    # removing hashtags and extra spaces; the whitespace pattern is a raw
    # string so that "\s" is not treated as an (invalid) string escape
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text


def data_gl_than(data, less_than=100.0, greater_than=0.0,
                 col='description_len'):
    """Print the share of rows whose ``col`` value is in ``(greater_than, less_than]``.

    Args:
        data: a mapping of column name to a pandas column (e.g. a DataFrame);
            ``data[col].values`` is iterated.
        less_than: inclusive upper bound of the counted range.
        greater_than: exclusive lower bound of the counted range.
        col: name of the column holding the lengths.

    Returns:
        None. The percentage is only printed.
    """
    data_length = data[col].values
    total = len(data_length)

    data_glt = sum(1 for length in data_length if greater_than < length <= less_than)

    # An empty column has nothing in range: report 0% rather than dividing by zero.
    data_glt_rate = (data_glt / total) * 100 if total else 0.0

    print(f'''Texts with word length of greater than {greater_than} and
           less than {less_than} includes {data_glt_rate:.2f}% of the whole!''')


def _keep_rows_in_length_range(data, col):
    """Return the rows of ``data`` whose ``col`` is in ``(minlim, maxlim]``, re-indexed from 0."""
    in_range = (data[col] > minlim) & (data[col] <= maxlim)
    return data[in_range].reset_index(drop=True)


def process_data(data, data_type='train',
                 remove_description=False) -> pd.DataFrame:
    """Add length and cleaned-text columns to a raw dataset frame.

    Reads the module-level ``minlim`` and ``maxlim`` word-count bounds, so
    both must be defined before this function is called.

    Args:
        data: frame with a ``description_fa`` column and, for training data,
            a ``label`` column. The frame passed in is not modified.
        data_type: when ``'train'``, rows with a missing ``label`` or
            ``description_fa`` are dropped and ``label`` is cast to ``str``.
            For any other value no rows are dropped.
            NOTE(review): non-train data must therefore already be free of
            missing descriptions -- the tokenizer is applied to every row.
        remove_description: when true, rows whose raw or cleaned word count
            falls outside ``(minlim, maxlim]`` are removed.

    Returns:
        A new DataFrame with three added columns: ``description_len`` (word
        count of the raw text), ``cleaned_description`` (output of
        ``cleaning``) and ``cleaned_desc_len`` (word count of the cleaned text).
    """
    # Work on a copy so that the caller's frame keeps its original contents.
    data = data.copy()

    if data_type == 'train':
        # Drop incomplete rows BEFORE the string cast: astype(str) turns a
        # missing label into the literal 'nan', which dropna cannot see.
        data = data.dropna(subset=['label', 'description_fa'])
        data = data.reset_index(drop=True)
        data['label'] = data['label'].astype(str)

    # word count of the raw description
    data['description_len'] = data['description_fa'].apply(lambda t: len(word_tokenize(t)))
    data_gl_than(data, maxlim, minlim)

    if remove_description:
        # remove descriptions whose raw word count is outside (minlim, maxlim]
        data = _keep_rows_in_length_range(data, 'description_len')

    data['cleaned_description'] = data['description_fa'].apply(cleaning)

    # word count of the cleaned description
    data['cleaned_desc_len'] = data['cleaned_description'].apply(lambda t: len(word_tokenize(t)))

    if remove_description:
        # cleaning can shorten a text, so apply the same bounds a second time
        data = _keep_rows_in_length_range(data, 'cleaned_desc_len')
    return data

In [None]:
#@title data processing

# Fraction of the training rows held out by train_test_split further down
# (a float, not an int).
TEST_SIZE: float = 0.1
# Word-count bounds; process_data reads these two names as globals.
minlim, maxlim = 3, 320

# Folder on the mounted Drive that holds both dataset archives.
DATA_DIR = '/content/drive/MyDrive/datacamp/classification'

train_data = pd.read_csv(os.path.join(DATA_DIR, 'train_set.zip'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_set.zip'))

train = process_data(train_data)
test = process_data(test_data, data_type='test')


In [None]:
#@title plot dataset
# Histogram of the raw description word counts in the training set.
fig = go.Figure(go.Histogram(x=train['description_len']))

fig.update_layout(
    title_text='Distribution of word counts within description',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()


# Number of training rows per label.
groupby_rate = train.groupby('label')['label'].count()
label_counts = groupby_rate.tolist()

fig = go.Figure(go.Bar(
    # Labels and counts are read from the same Series, so every bar is
    # guaranteed to sit under the label it was counted for.
    x=groupby_rate.index.tolist(),
    y=label_counts,
    text=label_counts,
    textposition='auto'
))

fig.update_layout(
    title_text='Distribution of label within description',
    xaxis_title_text='Label',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [None]:
#@title clean documnet and tokenization

# Punctuation strings deleted from every token by clean_doc.
puncs = ['،', '.', ',', ':', ';', '"']
# hazm helpers shared by every clean_doc call.
normalizer = Normalizer()
lemmatizer = Lemmatizer()


def _strip_puncs(token):
    """Return ``token`` with every string listed in ``puncs`` deleted."""
    for mark in puncs:
        token = token.replace(mark, '')
    return token


def clean_doc(doc):
    """Turn one document into a space-joined string of lemmatized tokens.

    The document is normalized and tokenized, punctuation from ``puncs`` is
    deleted from each token, tokens of length <= 1 and all-digit tokens are
    discarded, and the survivors are lemmatized and joined with single
    spaces. Stop words are not removed.
    """
    raw_tokens = word_tokenize(normalizer.normalize(doc))
    stripped = [_strip_puncs(tok) for tok in raw_tokens]
    kept = [tok for tok in stripped if len(tok) > 1 and not tok.isdigit()]
    lemmas = [lemmatizer.lemmatize(tok) for tok in kept]
    return ' '.join(lemmas)

In [None]:
#@title prepare data

NUM_WORDS: int = 2000   # tokenizer keeps only word indices below this value
NUM_CLASSES: int = 10   # width of the one-hot label vectors
tokenizer = Tokenizer(num_words=NUM_WORDS)

x_train = np.array(train['cleaned_description'])
# Labels arrive as strings (process_data casts them); to_categorical below is
# relied on to convert them to integer class ids.
y_train = np.array(train['label'])

x_test = np.array(test['cleaned_description'])

# Apply the clean_doc preprocessing step to training and test documents.
train_docs = np.array([clean_doc(document) for document in x_train], dtype=object)
test_docs = np.array([clean_doc(document) for document in x_test], dtype=object)

# The vocabulary and the padded length are derived from the training set only.
tokenizer.fit_on_texts(train_docs)
max_length = max(len(s.split()) for s in train_docs)

# Encode documents as word-index sequences and pad them to a common length.
encoded_docs = tokenizer.texts_to_sequences(train_docs)
encoded_docs_test = tokenizer.texts_to_sequences(test_docs)

x_train_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
x_test_padded = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

# Number of rows the Embedding layer needs. With num_words set, the tokenizer
# only emits indices 1..NUM_WORDS-1 (0 is the padding value), so NUM_WORDS rows
# are enough; if the corpus has fewer distinct words, the largest index is
# len(word_index), hence the +1.
vocab_size = min(NUM_WORDS, len(tokenizer.word_index) + 1)

# Prepare input data to the model: one-hot labels and a held-out split.
categorical_y_train = to_categorical(y_train, NUM_CLASSES)
in_train, in_test, out_label, out_test = train_test_split(x_train_padded,
                                                          categorical_y_train,
                                                          test_size=TEST_SIZE,
                                                          random_state=42)

In [None]:
#@title Build model

from keras.optimizers import schedules, Adam
from keras.regularizers import l2

def _conv(kernel_size):
    """Conv1D with 64 ReLU filters, 'same' padding and an L2(0.1) kernel penalty."""
    return Conv1D(filters=64, kernel_size=kernel_size, activation='relu',
                  padding='same', kernel_regularizer=l2(0.1))


# Embedding -> three conv stages with growing kernel size (4, 8, 16) ->
# global max pooling -> dropout -> dense head -> softmax over the classes.
# (A second Dropout(0.4)/Dense(100, relu) stage and an ExponentialDecay
# learning-rate schedule are not part of this model.)
model_cnn = Sequential([
    Embedding(vocab_size, 400, input_length=max_length),
    _conv(4),
    MaxPooling1D(pool_size=2),
    _conv(8),
    MaxPooling1D(pool_size=2),
    _conv(16),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(300, activation="sigmoid"),
    Dense(NUM_CLASSES, activation='softmax'),
])

# Adam with a fixed learning rate of 2e-3.
opt = Adam(
    learning_rate=2e-3,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
)

# One-hot targets, hence categorical cross-entropy and categorical accuracy.
model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=[categorical_accuracy],
)

In [None]:
#@title Training

# Colab form fields: the `#@param` markers expose these two values in the UI.
BATCH_SIZE =  64#@param {type:"number"}
EPOCHS =   8#@param {type:"number"}

# Train model on the training part of the train_test_split output.
# validation_split=0.1 asks fit() to hold out a further 10% of in_train for
# per-epoch validation; in_test/out_test are not used here.
# The returned history object is kept in history_cnn.
history_cnn = model_cnn.fit(in_train, out_label,
                         batch_size=BATCH_SIZE, epochs=EPOCHS,
                         validation_split=0.1,
                         shuffle=True)


In [None]:
# Persist the trained model to 'my_model.h5' in the current working directory.
# NOTE(review): the relative path is not under the mounted Drive, so the file
# presumably does not outlive the Colab runtime -- confirm that is intended.
model_cnn.save('my_model.h5')

In [None]:
#@title Model Evaluation

BATCH_SIZE = 64 #@param {type:"number"}

from sklearn.metrics import balanced_accuracy_score

# Loss and compiled metrics on the split held out by train_test_split.
eval_scores = model_cnn.evaluate(
    x=in_test,
    y=out_test,
    batch_size=BATCH_SIZE
)

# out_test is one-hot (it comes from to_categorical), so both the truth and
# the predicted probabilities are reduced to class ids before scoring.
true_val = np.argmax(out_test, axis=-1)
predicted_val = np.argmax(
    model_cnn.predict(in_test, batch_size=BATCH_SIZE), axis=-1)

# Show a compact summary instead of dumping the raw one-hot label array.
evaluation = dict(zip(model_cnn.metrics_names, eval_scores))
evaluation['balanced_accuracy'] = balanced_accuracy_score(true_val, predicted_val)
evaluation

In [None]:
#@title Predict test labels

BATCH_SIZE = 128 #@param {type:"number"}

# Run inference once, with the configured batch size, and reuse the result.
test_probabilities = model_cnn.predict(
    x_test_padded,
    batch_size=BATCH_SIZE
)

# Class id with the highest predicted probability for each test document.
test_labels = np.argmax(test_probabilities, axis=-1)

In [None]:
# Build the submission table. test_labels has one entry per row of `test`
# (the processed frame the model inputs were built from), so the ids are taken
# from `test` by position rather than from the raw frame by index. A length
# mismatch raises here instead of silently producing misaligned rows.
test_y = pd.DataFrame({
    'app_id': test['app_id'].values,
    'label': test_labels,
})
test_y.to_csv('prediction.csv', index=False)
test_y.head(10)