## Method 1: binary bag-of-words features with a one-hidden-layer Keras classifier

In [17]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
# Read the comment sample (a comma-separated file) into a DataFrame.
df = pd.read_csv(
    './Datasets/train_comment_small_100.csv',
    sep=',',
)

In [54]:
def text_cleaning(text):
    """Strip HTML-like tags, double quotes and line breaks from a comment.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The text with every ``<...>`` tag replaced by a space, every
        backslash-escaped or bare double quote removed, and every newline
        replaced by a space.
    """
    # Tags become a space so the words on either side stay separate tokens.
    text = re.sub(r'<[^<]+?>', ' ', text)
    # Remove escaped quotes first so the backslash is dropped along with them.
    text = text.replace('\\"', '').replace('"', '')
    # Newlines become a space as well: deleting them outright would glue the
    # last word of one line onto the first word of the next.
    return text.replace('\n', ' ')

In [55]:
# Discard rows containing missing values, then derive the cleaned text column
# from the surviving comments in a single chained expression.
df = df.dropna().assign(
    cleaned_comment_text=lambda frame: frame['comment_text'].apply(text_cleaning)
)

In [57]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every score computed from it) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_comment_text'],
    df['toxic'],
    test_size=0.2,
    random_state=0,
)

In [58]:
# Binary (presence/absence) bag-of-words encoder with English stop words
# removed; the vocabulary is learned from the training comments only.
vectorizer = CountVectorizer(
    binary=True,
    stop_words=stopwords.words('english'),
    lowercase=True,
    min_df=3,
    max_df=0.2,
    max_features=5000,
)
vectorizer.fit(X_train)
X_train_onehot = vectorizer.transform(X_train)

In [59]:
from keras.layers import Dense
from keras.models import Sequential

In [60]:
# Feed-forward classifier: one 500-unit ReLU hidden layer followed by a single
# sigmoid output unit.
# The input width is read from the fitted vocabulary instead of
# get_feature_names(), which was removed in scikit-learn 1.2.
nn = Sequential()
nn.add(Dense(units=500, activation='relu', input_dim=len(vectorizer.vocabulary_)))
nn.add(Dense(units=1, activation='sigmoid'))

In [61]:
# Configure training: Adam optimiser, binary cross-entropy loss, and accuracy
# as the reported metric.
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [62]:
# Print the layer-by-layer architecture and parameter counts of the model.
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 500)               57500     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 501       
Total params: 58,001
Trainable params: 58,001
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Train for 10 epochs on every training row except the last 40; those last 40
# rows are used as the validation set.
nn.fit(
    x=X_train_onehot[:-40],
    y=y_train[:-40],
    validation_data=(X_train_onehot[-40:], y_train[-40:]),
    epochs=10,
    batch_size=128,
    verbose=1,
)

Train on 39 samples, validate on 40 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7ff251800940>

In [64]:
# Score the model on the held-out comments, encoded with the vocabulary that
# was fitted on the training split. Index 1 of the result is the accuracy
# metric requested at compile time.
scores = nn.evaluate(
    x=vectorizer.transform(X_test),
    y=y_test,
    verbose=1,
)
print('Accuracy:', scores[1])

Accuracy: 1.0


## Method 2: regex-cleaned bag-of-words counts with a two-hidden-layer Keras classifier

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Reload the comment sample so this method starts again from the raw rows.
df = pd.read_csv(filepath_or_buffer='./Datasets/train_comment_small_100.csv')

In [None]:
# Drop rows with missing values, then renumber the index 0..n-1. Without the
# renumbering, dropped rows leave gaps in the index labels, and a label lookup
# such as df['comment_text'][i] raises KeyError for a missing label.
df = df.dropna().reset_index(drop=True)

In [None]:
# Build the cleaned corpus: keep ASCII letters only, lower-case the text, and
# collapse runs of whitespace to single spaces.
# The comments are iterated by value rather than looked up with
# df['comment_text'][i]: that lookup is label-based and raises KeyError as soon
# as dropna() has removed a row and left a gap in the index.
# Stop-word removal and Porter stemming are not applied at this stage.
corpus = []
for comment in df['comment_text']:
    letters_only = re.sub('[^a-zA-Z]', ' ', comment)
    corpus.append(' '.join(letters_only.lower().split()))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer with the vocabulary capped at 5000 terms.
cv =  CountVectorizer(max_features=5000)

In [None]:
# Dense document-term count matrix with one row per entry of `corpus`.
X = cv.fit_transform(corpus).toarray()
# Select the target by column name, the same 'toxic' column Method 1 trains on,
# instead of by position.
# NOTE(review): the previous df.iloc[:, 1] assumed the label was the second
# column - confirm against the CSV header.
y = df['toxic']
# Keep exactly as many labels as there are feature rows. The previous
# hard-coded y[:99] only lined up with X when the frame had 99 rows.
y1 = y.iloc[:X.shape[0]]

In [None]:
from sklearn import preprocessing

# Encode the target values as consecutive integer class ids: learn the set of
# classes first, then map every label to its id.
labelencoder_y = preprocessing.LabelEncoder()
labelencoder_y.fit(y1)
y = labelencoder_y.transform(y1)

In [None]:
# Reproducible 80/20 train/test split of the count matrix and encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# Binary classifier: two 20-unit ReLU hidden layers and one sigmoid output.
# - The output activation is sigmoid, not softmax: softmax over a single unit
#   always yields 1.0, so the network could never predict the negative class.
# - `units` / `kernel_initializer` replace the Keras 1 names `output_dim` /
#   `init`, matching the Dense(units=...) calls used for Method 1.
# - The input width comes from the training matrix instead of
#   get_feature_names(), which was removed in scikit-learn 1.2.
classifier = Sequential()
classifier.add(Dense(units=20, kernel_initializer='random_uniform', activation='relu',
                     input_dim=X_train.shape[1]))
classifier.add(Dense(units=20, kernel_initializer='random_uniform', activation='relu'))
classifier.add(Dense(units=1, kernel_initializer='random_uniform', activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 5 passes over the training split. `epochs` is the Keras 2 name for
# the Keras 1 `nb_epoch` argument and matches the nn.fit(...) call in Method 1.
classifier.fit(X_train, y_train, batch_size=128, epochs=5)

In [None]:
# Raw model outputs for the held-out rows, kept for the confusion matrix.
y_pred = classifier.predict(X_test)
# evaluate() returns the loss first and the compiled metrics after it, so the
# accuracy is at index 1 (index 0 is the loss value).
scores = classifier.evaluate(X_test, y_test, verbose=1)
print('Accuracy', scores[1])

In [None]:
from sklearn.metrics import confusion_matrix

# y_pred holds one continuous model output per row, while confusion_matrix
# needs discrete class labels (it raises ValueError on a binary/continuous
# mix). Threshold at 0.5 and flatten to a 1-D label vector first.
y_pred_labels = (y_pred > 0.5).astype(int).ravel()
cm = confusion_matrix(y_test, y_pred_labels)
cm