In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout ,GlobalAveragePooling1D

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the competition files: labelled training data, the test set to score,
# and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/feedback-prize-effectiveness/train.csv',
        '../input/feedback-prize-effectiveness/test.csv',
        '../input/feedback-prize-effectiveness/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Frequency of each discourse type, then of each effectiveness label,
# separated by one blank line.
print(
    train['discourse_type'].value_counts(),
    train['discourse_effectiveness'].value_counts(),
    sep='\n\n',
)

In [None]:
# TODO: add exploratory graphs here.

In [None]:
# One-hot encode the target once, then copy each class indicator onto the
# frame. Every label column must be filled from the dummy column of the SAME
# name; filling 'Effective' from the 'Ineffective' dummies would make two of
# the three targets identical and leave the Effective class unlearned.
effectiveness_dummies = pd.get_dummies(train['discourse_effectiveness'])
for label in ('Adequate', 'Effective', 'Ineffective'):
    train[label] = effectiveness_dummies[label]

In [None]:
# Check the frame again now that the one-hot label columns have been added.
train.head()

In [None]:
# Look at one raw discourse text (the entry with index label 10).
train['discourse_text'][10]

In [None]:
# Length of that same entry (number of characters, given it is a string).
len(train['discourse_text'][10])

In [None]:


def custom_standardization(input_data):
  """Normalize raw text before tokenization.

  Steps, in order:
    1. lower-case everything;
    2. delete bracketed spans such as "[citation]" (brackets and content);
    3. turn newlines into spaces;
    4. delete ASCII punctuation;
    5. delete any word that contains a digit.

  Args:
    input_data: a string or string tensor accepted by `tf.strings.lower`.

  Returns:
    The cleaned text, as returned by `tf.strings.regex_replace`.
  """
  text = tf.strings.lower(input_data)

  # Remove bracketed spans (the brackets and whatever is between them).
  text = tf.strings.regex_replace(text, r'\[.*?\]', '')

  # Replace newlines with a space rather than nothing, so the last word of one
  # line and the first word of the next are not glued into a single token.
  text = tf.strings.regex_replace(text, r'\n', ' ')

  # Remove punctuation. This runs before the digit filter, and nothing after
  # it can introduce punctuation, so a single pass is enough.
  text = tf.strings.regex_replace(text, '[%s]' % re.escape(string.punctuation), '')

  # Remove words containing numbers.
  text = tf.strings.regex_replace(text, r'\w*\d\w*', '')

  return text

In [None]:
# Sanity-check the standardization function on a single training text.
custom_standardization(train['discourse_text'][1])

In [None]:
# Maximum vocabulary size (passed to TextVectorization as max_tokens).
max_features = 10000
# Fixed output length, in tokens, of every vectorized text
# (passed to TextVectorization as output_sequence_length).
sequence_length = 500

In [None]:
# Layer that cleans each text with custom_standardization and maps it to a
# fixed-length sequence of integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [None]:
# Show the layer's current vocabulary and its size. Note this cell sits before
# the `adapt` call, so the vocabulary has not yet been built from the data.
vocabulary = np.array(vectorize_layer.get_vocabulary())
print(vocabulary)
print(len(vocabulary))

In [None]:
# The raw training texts as an array (the input given to the vectorizer).
train['discourse_text'].values

In [None]:
# Fit the vectorizer's vocabulary on the training texts.
vectorize_layer.adapt(train['discourse_text'].values)

In [None]:
# Convert every training text to its integer token sequence and display
# the result.
vectorize_text = vectorize_layer(train['discourse_text'].values)
vectorize_text

In [None]:
# Drop the identifier/metadata columns and the raw label column; the one-hot
# label columns and the text stay on the frame.
columns_to_be_removed = [
    'discourse_id',
    'essay_id',
    'discourse_type',
    'discourse_effectiveness',
]
train = train.drop(columns=columns_to_be_removed)

In [None]:
# Check the frame after the column drop.
train.head()

In [None]:
# Features are the vectorized texts; targets are every remaining column of
# `train` other than the raw text.
X = vectorize_text
y = train.drop(columns=['discourse_text'])

In [None]:
embedding_dim = 64

# Averaged-embedding classifier: embed each token id, average the embeddings
# over the sequence, and score three outputs with a sigmoid each. Dropout is
# applied after the embedding and after the pooling.
classifier_layers = [
    Embedding(input_dim=max_features + 1, output_dim=embedding_dim),
    Dropout(rate=0.2),
    GlobalAveragePooling1D(),
    Dropout(rate=0.2),
    Dense(units=3, activation='sigmoid'),
]
model = Sequential(classifier_layers)

model.summary()

In [None]:
# The network's output layer already applies a sigmoid, so the loss receives
# probabilities. from_logits must therefore be False: with True the loss would
# treat those probabilities as raw logits and apply a second sigmoid.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# Train for 3 epochs on all of X/y (no validation data is passed here).
history = model.fit(X,y,epochs=3,verbose=1)

In [None]:
# Vectorize the test texts with the same fitted layer, then score them.
# Each row of `Predict` holds the model's three outputs for one test text.
x_test = vectorize_layer(test['discourse_text'].values)
Predict = model.predict(x_test)
Predict

In [None]:
# Label the model outputs with the target columns in the exact order the model
# was trained on. The outputs follow `y`'s column order, so taking the names
# from `y` guarantees each probability lands under its own class; a hand-typed
# list in a different order would silently swap the classes.
prediction = pd.DataFrame(Predict, columns=list(y.columns))

In [None]:
# Attach the ids from the sample submission and arrange the columns in the
# order listed in `titles`.
titles = ['discourse_id','Ineffective', 'Adequate', 'Effective']
prediction = (
    prediction
    .assign(discourse_id=sample_submission["discourse_id"])
    .reindex(columns=titles)
)
prediction

In [None]:
# Write the predictions to submission.csv (without the index column) and echo
# the frame.
submission = pd.DataFrame(prediction)
submission.to_csv(path_or_buf='submission.csv', index=False)
print("My competition submission: \n\n", submission)