<a href="https://colab.research.google.com/github/sailkargutkar/R-Projects/blob/HELBNTODR100369/HELBNTODR100369.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install scattertext

In [30]:
!pip install "git+https://github.com/facebookresearch/fastText.git"

In [31]:
import fasttext.util
import numpy as np
import pandas as pd
import re
import scattertext as st
import spacy

from keras import layers
from keras.layers import Dropout 
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from scattertext import CorpusFromPandas, produce_scattertext_explorer
from sklearn.model_selection import train_test_split

In [32]:
# Read the training CSV with explicit column names and show the first record
# as a quick sanity check of the parsed columns.
df = pd.read_csv(
    '/content/agr_en_train.csv',
    sep=',',
    names=['unique_id', 'text', 'aggression-level'],
)
print(df.iloc[0])

In [34]:
# True if any cell of the frame is missing.
df.isna().to_numpy().any()

DefaultCredentialsError: ignored

In [None]:
# Number of comments per aggression class.
df.loc[:, 'aggression-level'].value_counts()

In [None]:
# Parse every comment with spaCy and build a scattertext corpus keyed by the
# aggression level, with spaCy's stop words removed from the vocabulary.
# The 'en' shortcut is no longer resolved by spaCy 3; load the English
# pipeline by its package name instead.
nlp = spacy.load('en_core_web_sm')
df['parsed'] = df.text.apply(nlp)
data = st.CorpusFromParsedDocuments(df, category_col='aggression-level',
                                    parsed_col='parsed').build().remove_terms(nlp.Defaults.stop_words, ignore_absences=True)

freq_df = data.get_term_freq_df()


def _top_terms(category):
    """Return the '<category> freq' column of freq_df, most frequent first.

    The column is selected by name rather than by position so the result does
    not depend on the order in which scattertext emits the category columns.
    """
    column = category + ' freq'
    return freq_df.sort_values(by=[column], ascending=False)[[column]]


oag_tw = _top_terms('OAG')
nag_tw = _top_terms('NAG')
cag_tw = _top_terms('CAG')

print(oag_tw.head())
print(nag_tw.head())
print(cag_tw.head())

In [None]:
# Map the three class names to integer ids in place, then one-hot encode the
# ids into a (n_samples, 3) target matrix.
df['aggression-level'] = df['aggression-level'].replace(
    {'OAG': 0, 'NAG': 1, 'CAG': 2}
)
labels = to_categorical(df['aggression-level'].values, num_classes=3)

In [None]:
# Fetch the English fastText model into the working directory.
# NOTE(review): if_exists='ignore' presumably skips the download when the file
# is already present - confirm against fasttext.util.
fasttext.util.download_model('en', if_exists='ignore') 
# Load the model from the file name used in this notebook for the download.
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# Shape parameters shared by the embedding and model cells.
dims = ft.get_dimension()   # size of one fastText word vector
data_count = len(df)        # number of comments in the dataset
review_length = 100         # tokens kept per comment

In [None]:
def text_to_vector(text):
    """Turn one raw comment into a fixed-size matrix of fastText word vectors.

    The text is normalised ('&' becomes ' and ', '@' becomes ' at ', every
    character outside the range 0x41-0x7f becomes a space), lower-cased and
    split on whitespace. Only the last `review_length` tokens are kept; each
    is looked up with `ft.get_word_vector` and written to consecutive rows.
    Rows beyond the number of tokens stay zero.

    Args:
        text: The comment to embed. A non-string value (for example NaN from
            an empty CSV field) is treated as an empty comment.

    Returns:
        A float32 array of shape (review_length, dims).
    """
    # A non-string value has no .replace(); embed it as an all-zero matrix
    # instead of raising AttributeError.
    if not isinstance(text, str):
        text = ''

    text = text.replace('&', ' and ')
    text = text.replace('@', ' at ')
    # The kept range 0x41-0x7f covers ASCII letters but also [ \ ] ^ _ ` { | } ~
    # and DEL; digits and all other punctuation are turned into spaces.
    text = re.sub(r'[^\x41-\x7f]', r' ', text)
    tokens = text.lower().split()

    # Keep the tail of the comment when it is longer than the window.
    window = tokens[-review_length:]

    # Allocate float32 directly: the word vectors are float32 and the caller
    # stores the result in a float32 tensor, so float64 only wasted memory.
    vectors = np.zeros((review_length, dims), dtype='float32')
    for i, word in enumerate(window):
        vectors[i, :] = ft.get_word_vector(word)

    return vectors


In [None]:
def create_word_embedding(df):
    """Embed every comment in df['text'] into one float32 tensor.

    Args:
        df: Frame with a 'text' column holding the raw comments.

    Returns:
        Array of shape (len(df), review_length, dims) whose i-th slice is
        text_to_vector applied to the i-th comment.
    """
    tensor = np.zeros((len(df), review_length, dims), dtype='float32')

    comments = df['text'].values
    for row, comment in enumerate(comments):
        tensor[row] = text_to_vector(comment)

    return tensor

In [None]:
# Embed the whole dataset; this is the model's input tensor.
embedding = create_word_embedding(df=df)

In [None]:
# Hold out 20% of the samples for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    embedding,
    labels,
    random_state=42,
    test_size=0.20,
)

In [None]:
def cnn_text_classifier():
    """Build, compile and summarise the 1-D CNN aggression classifier.

    Architecture: Conv1D(128 filters, kernel 5, relu) over an input of shape
    (review_length, dims), then global average pooling, Dropout(0.5),
    Dense(10, relu) and a 3-unit softmax output. Compiled with Adam,
    categorical cross-entropy and the accuracy metric. The model summary is
    printed as a side effect.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(layers.Conv1D(128, 5, activation='relu', input_shape=(review_length, dims)))
    model.add(layers.GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(layers.Dense(10, activation='relu'))
    # The three classes are mutually exclusive and one-hot encoded, and the
    # loss is categorical cross-entropy, so the output must be a probability
    # distribution over the classes: softmax, not independent sigmoids.
    model.add(layers.Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Build the classifier and train it silently for 10 epochs, tracking the
# held-out split as validation data after every epoch.
model = cnn_text_classifier()
modHistory = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=10,
    verbose=False,
)

In [None]:
# Report accuracy on the training split first, then on the held-out split.
for template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(template.format(accuracy))

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score

# Collapse predicted probabilities and one-hot targets to class ids.
y_pred = model.predict(X_test).argmax(axis=1)
# Despite its name, y_pred2 holds the ground-truth class ids of the test split.
y_pred2 = y_test.argmax(axis=1)

# Both metrics take the true labels first and the predictions second.
cm = confusion_matrix(y_pred2, y_pred)
print(cm)

score = f1_score(y_pred2, y_pred, average="micro")
print("F1 score : ", score)

In [None]:
# Training curves: one figure for accuracy and one for loss, each comparing
# the training split with the held-out split across epochs.
import matplotlib.pyplot as plt

for train_key, val_key, title, y_label in (
    ('accuracy', 'val_accuracy', 'model accuracy', 'accuracy'),
    ('loss', 'val_loss', 'model loss', 'loss'),
):
    plt.plot(modHistory.history[train_key])
    plt.plot(modHistory.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()