## Machathon 2.0 Qualification Round
### Arabic Poetry Meter

In [None]:
import os
import io
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
from random import shuffle
from pyarabic import araby
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
!pip install pyarabic

### Reading Data

In [None]:
# Load the raw training CSV, treating its first row as the header.
df = pd.read_csv('../input/poemdatasett/train.csv', header=0)

# Round-trip through an in-memory, header-less CSV so the frame comes back
# with positional integer column labels instead of the original names.
headerless_csv = u"" + df.to_csv(header=None, index=False)
df = pd.read_csv(io.StringIO(headerless_csv), header=None)

# Export as a space-separated text file for the line-based parser below.
# NOTE(review): with sep=' ' pandas' default (minimal) quoting wraps any field
# that contains a space in double quotes, so verses are probably written
# quoted -- confirm the downstream parsing is meant to see those quotes.
df.to_csv(
    "trainFile.txt",
    header=None,
    index=None,
    sep=' ',
    mode='w',
)

In [None]:
# Load the meter names, one per line. A name's position in the list is the
# integer class id used to index it later on.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('../input/poemdatasett/labels.txt', 'r', encoding='utf-8') as f:
    data_labels = [name.rstrip('\n') for name in f]

### Preprocessing Arabic Data

In [None]:
def splitting(path, thresh = 70, on_shatrs = False):
    """Parse a labelled verse file into shuffled samples and integer labels.

    The whole file is stripped of tashkeel (diacritics) and of a fixed set of
    punctuation / control characters. Every remaining line longer than one
    character is then split at its first space into ``<int label>`` and the
    verse text.

    Side effect: rebinds the module-level ``ALL_WORDS`` to the sorted list of
    distinct characters found in the space-joined samples (despite its name it
    holds characters, not words).

    Args:
        path: Path of the text file to parse; read as UTF-8.
        thresh: Unused; kept so existing callers keep working.
        on_shatrs: When True, each verse is split on ``'#'`` and every piece
            becomes its own sample carrying the verse's label. When False the
            whole verse is one sample.

    Returns:
        ``(X, y)``: the sample strings and their integer labels, shuffled
        together with the same permutation.

    Raises:
        ValueError: If a kept line has no space after the label, or the label
            is not an integer.
    """
    global ALL_WORDS

    # Context manager closes the handle; explicit UTF-8 keeps Arabic text from
    # being decoded with whatever the platform default happens to be.
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    removed = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
    # One C-level pass instead of rebuilding the string character by character.
    cleaned = araby.strip_tashkeel(raw_text).translate(str.maketrans('', '', removed))

    X = []
    y = []
    for line in cleaned.split('\n'):
        if len(line) <= 1:
            continue
        label_text, verse = line.split(' ', 1)
        label = int(label_text)
        verse = verse.strip()
        if on_shatrs:
            for shatr in verse.split('#'):
                X.append(shatr.strip())
                y.append(label)
        else:
            X.append(verse)
            y.append(label)

    # Character vocabulary over every sample (the joiner adds the space).
    ALL_WORDS = sorted(set(' '.join(X)))
    X, y = shuffle(X, y)

    return X, y

In [None]:
# Parse the exported training file, keeping each verse as a single sample.
X, y = splitting(
    "./trainFile.txt",
    on_shatrs=False,
)

### Showing the First Five Abyat (Verses)

In [None]:
# Show up to five (verse, meter name) pairs. Slicing instead of indexing a
# fixed range avoids an IndexError when fewer than five samples are loaded.
for verse, label in zip(X[:5], y[:5]):
    print(verse, ' ', data_labels[label])

### Splitting to train & validation data

In [None]:
# Hold out 18% of the samples for validation; the seed is fixed so the split
# is reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.18,
    random_state=30,
)

### Converting Arabic Characters to Indices

In [None]:
# Map every vocabulary character to a positive integer id. Numbering starts at
# 1, which leaves 0 unused by any character.
convert_char_idx = {
    char: index
    for index, char in enumerate(ALL_WORDS, start=1)
}

### Making Sequences

In [None]:
def padding(X):
    """Encode each string as character ids and pad to a fixed length of 100.

    Every character is looked up in the module-level ``convert_char_idx``; a
    character missing from it raises ``KeyError``. Shorter sequences are
    filled with 0 at the end; longer ones are cut by ``pad_sequences`` using
    its default truncation mode.

    Args:
        X: Iterable of strings.

    Returns:
        The array produced by ``pad_sequences``, one row of 100 ids per string.
    """
    lookup = convert_char_idx
    encoded_lines = [[lookup[symbol] for symbol in verse] for verse in X]
    return pad_sequences(encoded_lines, maxlen=100, padding='post', value=0)

In [None]:
# Encode both text splits as fixed-length id sequences ...
X_train, X_validation = padding(X_train), padding(X_validation)
# ... and turn the label lists into NumPy arrays for Keras.
y_train, y_validation = np.array(y_train), np.array(y_validation)

### Building The Model

In [None]:
# Character-level classifier: embedding -> four stacked bidirectional GRUs ->
# small dense head with one softmax unit per label.
classifier = Sequential()
layer_stack = [
    Input((100,)),
    # +1 because character ids start at 1 and 0 is used as the padding value.
    Embedding(len(convert_char_idx)+1, 256),
    Bidirectional(GRU(units = 512, return_sequences=True)),
    Bidirectional(GRU(units = 256, return_sequences=True)),
    Bidirectional(GRU(units = 256, return_sequences=True)),
    # Last recurrent layer returns only its final output.
    Bidirectional(GRU(units = 256)),
    Dense(64, activation = 'relu'),
    Dropout(0.3),
    Dense(len(data_labels), activation = 'softmax'),
]
for layer in layer_stack:
    classifier.add(layer)

# Labels are plain integer class ids, hence the sparse loss.
classifier.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
classifier.summary()

In [None]:
# Smoke test: run a dummy batch of 10 all-zero sequences of length 100
# through the model and display the output shape.
dummy_batch = tf.zeros((10, 100))
classifier(dummy_batch).shape

In [None]:
# Multiply the learning rate by 0.1 after 2 epochs without a val_loss
# improvement of at least 1e-4, never going below 1e-4.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    min_delta=0.0001,
    min_lr=0.0001,
)
# Save the model to 'MAAN_Model' whenever val_accuracy reaches a new maximum.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'MAAN_Model',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)
CB = [lr_on_plateau, best_checkpoint]

### Training The Model

In [None]:
# Train for 17 epochs in batches of 64, evaluating on the validation split
# after each epoch and running the callbacks in CB.
classifier.fit(
    X_train,
    y_train,
    validation_data=(X_validation, y_validation),
    epochs=17,
    batch_size=64,
    shuffle=True,
    callbacks=CB,
)

### Saving The Model

In [None]:
# Persist the model in its current (end-of-training) state to an HDF5 file.
classifier.save('MAAN_Model_Final.h5')

### Loading Model

In [None]:
# Reload the model from the HDF5 file written after training.
# NOTE(review): this is the end-of-training model, not the best-val_accuracy
# checkpoint that ModelCheckpoint writes to 'MAAN_Model' -- confirm that is
# the one intended for inference.
classifier = tf.keras.models.load_model('MAAN_Model_Final.h5')

### Classifier's Preprocessing Function

In [None]:
def classifier_preprocess_data(text, thresh = 70, on_shatrs = False):
    """Clean one raw verse the same way the training text is cleaned.

    Tashkeel (diacritics) and a fixed set of punctuation / control characters
    are removed, then surrounding whitespace is stripped.

    Args:
        text: Raw verse string.
        thresh: Unused; kept so existing callers keep working.
        on_shatrs: When True, only the part before the first ``'#'`` is
            returned (stripped); when False the whole cleaned verse is
            returned.

    Returns:
        The cleaned string.
    """
    removed = 'ـ!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
    # One translate() pass replaces the char-by-char string concatenation.
    cleaned = araby.strip_tashkeel(text).translate(str.maketrans('', '', removed))
    cleaned = cleaned.strip()
    if on_shatrs:
        # Only the first shatr was ever returned, so skip building the rest.
        return cleaned.split('#')[0].strip()
    return cleaned

In [None]:
# Turn the list of meter names into a {class id: name} lookup.
# Guarded so that re-running this cell does not enumerate the dict's own keys
# and replace every name with its integer id.
if not isinstance(data_labels, dict):
    data_labels = dict(enumerate(data_labels))

### Prediction Function

In [None]:
def main_classify_func(sentence):
    """Predict the meter class of one verse, printing its name and score.

    The verse is cleaned with ``classifier_preprocess_data``, encoded with
    ``convert_char_idx``, padded (post, value 0) to ``X_train.shape[1]`` and
    passed to ``classifier.predict``.

    Args:
        sentence: Raw verse string.

    Returns:
        The index of the highest-scoring class, as a NumPy integer.
    """
    sentence = classifier_preprocess_data(sentence, on_shatrs=False)
    # Characters absent from the training vocabulary have no id; drop them
    # rather than aborting the whole prediction run with a KeyError.
    sequence = [convert_char_idx[char] for char in sentence if char in convert_char_idx]
    sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)
    scores = classifier.predict(sequence)[0]
    # Compute the winning class once and reuse it for both print and return.
    pred = np.argmax(scores, 0).astype('int')
    print(data_labels[pred], np.max(scores))
    return pred

### Reading and Classifying Test Data

In [None]:
# Load the test set and classify its first verse as a quick sanity check.
test_data_df = pd.read_csv('../input/finaltestdata/test (2).csv')
first_verse = test_data_df['data'][0]
main_classify_func(first_verse)

In [None]:
# Classify every test verse, keyed by its row number in the test frame.
test_data_dictionary = {
    row_number: main_classify_func(verse)
    for row_number, verse in enumerate(test_data_df['data'])
}
# Display the first key as a quick check.
list(test_data_dictionary)[0]

In [None]:
# Build the two-column predictions table (this rebinds test_data_df, which
# previously held the raw test set) and preview its first rows.
prediction_rows = list(test_data_dictionary.items())
test_data_df = pd.DataFrame(prediction_rows, columns=['id', 'labels'])
test_data_df.head()

### Saving Final Predictions to CSV File

In [None]:
# Write the predictions with a header row and without the index column.
test_data_df.to_csv(
    'MAAN_Final.csv',
    header=True,
    index=False,
)