**Importing Packages**

In [1]:
#importing required libaries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import string

import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

#keras
from keras import *
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import  Adam
from keras import regularizers

**Tamil Dataset**

In [4]:
#reading the Tamil dataset
train=pd.read_csv('/content/drive/MyDrive/offensive language/Tamil dataset/Tamil-Codemixed_offensive_Training-Tweet.tsv',sep='\t', index_col=[0])
test=pd.read_csv('/content/drive/MyDrive/offensive language/Tamil dataset/Tamil_hasoc_tanglish_test_withlabels(1).tsv',sep='\t', index_col=[0]) 

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Removing punctuation**

In [6]:
import string
def remove_punctuations(txt):
    text_nopunc="".join([c for c in txt if c not in string.punctuation])
    return text_nopunc

train['Text']=train['Text'].apply(lambda x: remove_punctuations(x))
train

Unnamed: 0,Text,Category
TA_HL100,Iyaooo Kovam pattutene sothula visatha vachuru...,NOT
TA_HL101,Asha Apo neenga atha government ku theriya pad...,NOT
TA_HL102,Bala sundar ayyo sorryantha line ah explain pa...,NOT
TA_HL105,kalimuthu ne ena lusayaaru edhu panaalum en da...,NOT
TA_HL109,1st baby ku neat ah feed panunga plzz ipdi iru...,NOT
...,...,...
TA_TW6620,Yaroda body structure semaya irukum Sema mood ...,OFF
TA_TW3336,Yenda naangala politics varom nu pala varusham...,OFF
TA_HL1105,Yepdithan seruppala adichalum arasiyalvathikku...,OFF
TA_TW1915,USER Paithiyam ena unga vanthu full ah forward...,OFF


# **Spliting to Dev Dataset**

In [8]:
X_train, X_dev, y_train, y_dev = train_test_split(train['Text'], train['Category'], test_size=0.30, random_state=42)

X_test= test['Text']
y_test= test['Category']

# **Encoding**

In [9]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.transform(y_test)
y_dev = Encoder.transform(y_dev)

# **Long Short Term Memory(LSTM)**

In [10]:
from keras.preprocessing.text import one_hot

#use onehot in train
voc_size = 1000

train_onehot = [one_hot(words, voc_size)for words in X_train]
dev_onehot = [one_hot(words, voc_size)for words in X_dev]
test_onehot = [one_hot(words, voc_size)for words in X_test]

In [11]:
#performing pad_sequences
from keras.preprocessing.sequence import pad_sequences

sent_length=100
X_train=pad_sequences(train_onehot,padding='pre',maxlen=sent_length)
X_dev=pad_sequences(dev_onehot,padding='pre',maxlen=sent_length)
X_test = pad_sequences(test_onehot,padding='pre',maxlen=sent_length)

In [12]:
dim=40
model=Sequential()

#embedding layer
model.add(Embedding(voc_size,dim,input_length=sent_length))

#input layer
model.add(LSTM(1000, input_shape=(1000,1), return_sequences=False))

#hidded layer
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01) ))

#output layer
model.add(Dense(5, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False),
              metrics=['accuracy'])

#model.compile('adam','mse')

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [13]:
#summary of LSTM model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 40)           40000     
_________________________________________________________________
lstm (LSTM)                  (None, 1000)              4164000   
_________________________________________________________________
dense (Dense)                (None, 500)               500500    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2505      
Total params: 4,707,005
Trainable params: 4,707,005
Non-trainable params: 0
_________________________________________________________________


In [14]:
history = model.fit(X_train, y_train, 
                    batch_size=64,
                    epochs=10, validation_data=(X_dev, y_dev)                 
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
#classified with test set
y_pred_test_LSTM = model.predict(X_test)

In [17]:
from sklearn import metrics

pred_LSTM = model.predict(X_test)
#predictions = np.around(predictions)
y_test_non_category = y_test
y_predict_non_category = [ np.argmax(t) for t in pred_LSTM ]

In [18]:
#classification report
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test_non_category, y_predict_non_category)) 
print(classification_report(y_test_non_category, y_predict_non_category, zero_division=0))

[[374  91]
 [181 294]]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73       465
           1       0.76      0.62      0.68       475

    accuracy                           0.71       940
   macro avg       0.72      0.71      0.71       940
weighted avg       0.72      0.71      0.71       940



# **MLP**

In [20]:
#Simple Neural network
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising
MLP = Sequential()

# Adding input layer and the first hidden layer
MLP.add(Dense(units = len(train.Category.value_counts()), kernel_initializer = 'uniform', activation = 'relu', input_dim = sent_length))

# Adding second hidden layer
MLP.add(Dense(units = len(train.Category.value_counts()), kernel_initializer = 'uniform', activation = 'relu'))

# Adding output layer
MLP.add(Dense(units = 2, kernel_initializer = 'uniform', activation = 'softmax'))

# Compiling the ANN
MLP.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
MLP.fit(X_train, y_train, batch_size =50 , epochs = 10)

# Predicting the Test set results
y_pred = MLP.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
from sklearn import metrics

pred_MLP = MLP.predict(X_test)

y_test_non_category = y_test
y_predict_non_category = [ np.argmax(t) for t in pred_MLP ]

In [24]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test_non_category, y_predict_non_category)) 
print(classification_report(y_test_non_category, y_predict_non_category, zero_division=0))

[[  0 465]
 [  0 475]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       465
           1       0.51      1.00      0.67       475

    accuracy                           0.51       940
   macro avg       0.25      0.50      0.34       940
weighted avg       0.26      0.51      0.34       940

