In [None]:
%%capture
# Install the pinned tensorflow_text release that the notebook imports further
# down; %%capture keeps pip's log out of the cell output.
!pip install tensorflow_text==2.6.0

In [None]:
import os

# Set in its own cell ahead of the cell that imports TensorFlow, so the flag
# is already in the process environment by then.
os.environ.update({'TF_FORCE_GPU_ALLOW_GROWTH': 'true'})

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px

from numpy import newaxis
from wordcloud import WordCloud, STOPWORDS

from tqdm import tqdm

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

import xgboost as xgb
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Activation, GRU, BatchNormalization
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from tensorflow.keras.optimizers import Adam

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

plt.rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42

nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the multilingual Universal Sentence Encoder (large, v3) from TF Hub.
# `use` is called directly on review text in the embedding cells below.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
use = hub.load(module_url)

In [None]:
# Load the toxic-comment training set (a zipped CSV) from Google Drive.
# NOTE(review): no cell in this notebook mounts Drive, so this path presumably
# relies on a manual mount in the Colab session -- confirm before Run All.
df_toxic_reviews = pd.read_csv("/content/drive/MyDrive/train.csv.zip")
df_toxic_reviews


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
# Work with the first 10,000 comments only, to keep embedding time manageable.
# Slicing by position replaces the previous in-place drop of
# index[10000:159570]: that hard-coded end index would silently leave any rows
# beyond position 159569 in the frame, and it mutated the loaded data in place.
N_REVIEWS = 10_000
df_toxic_reviews = df_toxic_reviews.iloc[:N_REVIEWS].copy()
df_toxic_reviews

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
9995,1a790ff1007a10e3,Numbers may be either listed separately at the...,0,0,0,0,0,0
9996,1a7a4868968e2b9e,"Those two love to disagree, don't they? 206.17...",0,0,0,0,0,0
9997,1a7c3bec9a71415d,"""I have changed """"Lance Thomas"""" to """"Lance Th...",0,0,0,0,0,0
9998,1a7c9c14b0cf0fe0,states \n\nCourts: I have been putting all art...,0,0,0,0,0,0


In [None]:
# Remove the id and the fine-grained toxicity labels that are not used here;
# every other column of the loaded frame (comment text, `toxic` flag) is kept.
unused_columns = ['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df = df_toxic_reviews.drop(columns=unused_columns)
df

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
9995,Numbers may be either listed separately at the...,0
9996,"Those two love to disagree, don't they? 206.17...",0
9997,"""I have changed """"Lance Thomas"""" to """"Lance Th...",0
9998,states \n\nCourts: I have been putting all art...,0


In [None]:
# Derive a readable label from the binary flag (0 -> "good", anything else ->
# "bad") and rename the text column to `review`.
df = (
    df
    .assign(review_type=df['toxic'].map(lambda flag: "good" if flag == 0 else "bad"))
    .rename(columns={'comment_text': 'review'})
)
df

Unnamed: 0,review,toxic,review_type
0,Explanation\nWhy the edits made under my usern...,0,good
1,D'aww! He matches this background colour I'm s...,0,good
2,"Hey man, I'm really not trying to edit war. It...",0,good
3,"""\nMore\nI can't make any real suggestions on ...",0,good
4,"You, sir, are my hero. Any chance you remember...",0,good
...,...,...,...
9995,Numbers may be either listed separately at the...,0,good
9996,"Those two love to disagree, don't they? 206.17...",0,good
9997,"""I have changed """"Lance Thomas"""" to """"Lance Th...",0,good
9998,states \n\nCourts: I have been putting all art...,0,good


In [None]:
# Class balance check: count of "good" vs "bad" reviews, with the counts
# printed on the bars (text_auto).
fig = px.histogram(df, x="review_type", title='Review Type Distribution', text_auto=True)
fig.show()


In [None]:
# Separate the two classes so they can be balanced independently below.
good_reviews = df.loc[df["review_type"].eq("good")]
bad_reviews = df.loc[df["review_type"].eq("bad")]


In [None]:
# One space-separated string per class, built from all reviews of that class.
good_reviews_text = " ".join(good_reviews["review"].tolist())
bad_reviews_text = " ".join(bad_reviews["review"].tolist())

In [None]:
# Balance the classes by drawing as many "good" reviews as there are "bad" ones.
n_bad = len(bad_reviews)
good_df = good_reviews.sample(
    n=n_bad,
    random_state=RANDOM_SEED,
    # Sample without replacement whenever enough "good" rows exist: duplicated
    # rows could otherwise land on both sides of the train/test split and
    # inflate the test score. Replacement is only a fallback.
    replace=len(good_reviews) < n_bad,
)

# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.0; ignore_index=True gives the same fresh 0..n-1 index as the old
# .reset_index(drop=True).
df_review_resampled = pd.concat([good_df, bad_reviews], ignore_index=True)
df_review_resampled.shape


(1942, 3)

In [None]:
# Inspect the balanced frame: sampled "good" rows first, then all "bad" rows.
df_review_resampled


Unnamed: 0,review,toxic,review_type
0,"""|decline=You don't seem to understand the con...",0,good
1,I fixed the link. It took all of about 3 secon...,0,good
2,"""\n\n NRHP potential \n\nNoticed you are doing...",0,good
3,"Eleemosynary, you're clearly a biased editor. ...",0,good
4,"""\n\nAs far as Yates....the whole Yates thing ...",0,good
...,...,...,...
1937,Ahah! The looser is still complaining me just ...,1,bad
1938,Dude this guy is so gay.,1,bad
1939,"""==She is doing porn now!!!!!==\nsimply go to ...",1,bad
1940,"""\n\n Up to your old tricks I see \n\nThankful...",1,bad


In [None]:
# Turn the string labels into integers. LabelEncoder numbers classes in sorted
# order, so "bad" -> 0 and "good" -> 1 (check label_enc.classes_ to confirm).
label_enc = preprocessing.LabelEncoder()
review_type_values = df_review_resampled["review_type"].values
encoded_review = label_enc.fit_transform(review_type_values)

In [None]:
# Hold out 25% of the balanced reviews for testing; the split is seeded so it
# is repeatable.
train_reviews, test_reviews, y_train, y_test = train_test_split(
    df_review_resampled["review"],
    encoded_review,
    random_state=RANDOM_SEED,
    test_size=0.25,
)

In [None]:
# Embed the training reviews one at a time with the sentence encoder and
# flatten each result to a 1-D vector; rows of X_train follow train_reviews.
X_train = np.array([
    tf.reshape(use(review), [-1]).numpy()
    for review in tqdm(train_reviews)
])


100%|██████████| 1456/1456 [03:57<00:00,  6.12it/s]


In [None]:
# Same per-review embedding as for the training set, applied to the held-out
# reviews; rows of X_test follow test_reviews.
X_test = np.array([
    tf.reshape(use(review), [-1]).numpy()
    for review in tqdm(test_reviews)
])


100%|██████████| 486/486 [01:25<00:00,  5.67it/s]


In [None]:
# Sanity check: (n_reviews, embedding_dim) for the train and test matrices.
print(X_train.shape, X_test.shape)


(1456, 512) (486, 512)


In [None]:
# Sanity check: one label per embedded review in each split.
print(y_train.shape, y_test.shape)


(1456,) (486,)


In [None]:
# The LSTM layers expect (samples, timesteps, features): insert a length-1
# time axis so each review embedding becomes a single-step sequence.
X_train_reshaped = np.expand_dims(X_train, axis=1)
X_test_reshaped = np.expand_dims(X_test, axis=1)


In [None]:
# Confirm the (samples, 1, features) layout produced above.
X_train_reshaped.shape


(1456, 1, 512)

In [None]:
# The per-sample input shape handed to the first LSTM layer: (timesteps, features).
1,X_train_reshaped.shape[2]


(1, 512)

In [None]:
def build_model_lstm_2(input_dim=None):
    """Build and compile the stacked-LSTM binary classifier.

    The network is three LSTM layers (256 -> 128 -> 64 units, ReLU activation,
    dropout 0.2 on the last two) followed by a single sigmoid unit, compiled
    with binary cross-entropy, the accuracy metric and Adam (lr=0.0005).

    Args:
        input_dim: Number of features per timestep. When omitted, it is read
            from the notebook-level ``X_train_reshaped`` array, which is what
            the function did before this parameter existed.

    Returns:
        The compiled ``Sequential`` model. It expects inputs of shape
        ``(batch, 1, input_dim)``.

    NOTE(review): every LSTM keeps the length-1 time axis
    (``return_sequences=True``), so predictions come out as ``(batch, 1, 1)``
    rather than ``(batch, 1)``. Consider ``return_sequences=False`` on the last
    LSTM if no consumer of the saved model depends on the current shape.
    """
    if input_dim is None:
        # Backward-compatible default: infer the feature size from the global
        # training array prepared earlier in the notebook.
        input_dim = X_train_reshaped.shape[2]

    model = Sequential([
        LSTM(256, activation='relu', return_sequences=True,
             input_shape=(1, input_dim)),
        LSTM(128, dropout=0.2, activation='relu', return_sequences=True),
        LSTM(64, dropout=0.2, activation='relu', return_sequences=True),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        metrics=['accuracy'],
        optimizer=Adam(learning_rate=0.0005),
    )
    return model

In [None]:
# Create a fresh, untrained model; re-run this cell to reset the weights.
model_lstm_2 = build_model_lstm_2()


In [None]:
%%time
# Train for 10 epochs in mini-batches of 16, holding out 10% of the training
# data for validation. `history` keeps the returned training history object.
history = model_lstm_2.fit(
    X_train_reshaped, y_train, 
    epochs=10, 
    batch_size=16, 
    validation_split=0.1, 
    verbose=1, 
    shuffle=True
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 28.3 s, sys: 8.65 s, total: 36.9 s
Wall time: 45.7 s


In [None]:
# Score the held-out reviews; the result is [loss, accuracy], matching the
# loss and metric the model was compiled with.
model_lstm_2.evaluate(X_test_reshaped, y_test)




[0.46191123127937317, 0.8518518805503845]

In [None]:
# Persist the trained model to an .h5 file on Google Drive.
# NOTE(review): this overwrites any existing model1.h5 at that path.
model_lstm_2.save("/content/drive/MyDrive/model1.h5")

In [None]:
# Show the raw encoder output for one sentence.
# The original cell passed `text`, which in a fresh kernel is only bound to the
# `tensorflow.keras.preprocessing.text` module imported at the top, so it
# relied on a variable from a since-deleted cell. Use an explicit sample.
sample_text = "You, sir, are my hero."
print(use(sample_text))

tf.Tensor(
[[ 0.01407899 -0.03945087  0.02271953  0.04180859  0.09394487 -0.02589601
  -0.04134088  0.0539793   0.0390438   0.05521133 -0.00673443  0.03615097
  -0.03455393 -0.05423264 -0.05371225 -0.00300766  0.0249855   0.08600383
   0.00111257  0.02411407 -0.04326477 -0.03045344 -0.00482911  0.02431514
   0.04649521  0.05267535 -0.0141102  -0.00050071  0.04340602 -0.04011422
  -0.08771198  0.04500392 -0.01147889  0.00407531 -0.02030614 -0.05575892
  -0.08626752 -0.09791625  0.02717641 -0.01532246  0.07460616 -0.03253679
  -0.04225469  0.03342643 -0.00567515 -0.03237273 -0.04600116  0.02082347
   0.0225313  -0.0207896  -0.02519247  0.00872947 -0.07531843 -0.01162797
  -0.0368044  -0.02891224  0.08491161 -0.00193501  0.06602315 -0.01287555
   0.01000101 -0.00796193 -0.01986385 -0.0204124  -0.07634273  0.02042953
   0.00065007  0.02294465 -0.02863895  0.05058826 -0.00653607 -0.00559826
  -0.05194468  0.01672265  0.00499019  0.00701351 -0.01038735 -0.02275423
  -0.03466718  0.00605941 -