In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving diabetes.csv to diabetes.csv
Saving youtube_comments.csv to youtube_comments.csv
User uploaded file "diabetes.csv" with length 23873 bytes
User uploaded file "youtube_comments.csv" with length 445205 bytes


In [2]:
import random
import re
import io

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow import keras

In [6]:
names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(io.StringIO(uploaded['diabetes.csv'].decode('utf-8')))
df = df.fillna(df.mean())
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# 1
for column in names[1:]:
    df[column] = (df[column] - df[column].mean())/df[column].std()
Y = df.iloc[:, 0].values.reshape(-1, 1)
Y = OneHotEncoder().fit_transform(Y).toarray()
X = df.iloc[:, 1:].values
num_samples = X.shape[0]
train_samples = int(0.75*num_samples)
indexes = np.arange(num_samples)
random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
X_train, Y_train = X[:train_samples, :], Y[:train_samples]
X_test, Y_test = X[train_samples:, :], Y[train_samples:]
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(576, 8) (576, 17)
(192, 8) (192, 17)


In [15]:
inputs = keras.Input(shape=(8,))
x = keras.layers.Dense(64, activation="relu")(inputs)
outputs = keras.layers.Dense(17, activation="softmax")(x)
fc_model = keras.Model(inputs=inputs, outputs=outputs, name="fc_model")
fc_model.summary()

Model: "fc_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                576       
_________________________________________________________________
dense_9 (Dense)              (None, 17)                1105      
Total params: 1,681
Trainable params: 1,681
Non-trainable params: 0
_________________________________________________________________


In [16]:
fc_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
fc_model.fit(X_train, Y_train, batch_size=8, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7a6c5a44a8>

In [18]:
test = fc_model.evaluate(X_test, Y_test, verbose=2)
print("Loss test:", test[0])
print("Accuracy test:", test[1])

6/6 - 0s - loss: 2.2913 - accuracy: 0.2083
Loss test: 2.2912962436676025
Accuracy test: 0.2083333283662796


In [32]:
# 2
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
print(x_train.shape)
inputs = keras.Input(shape=(28, 28, 1))
x = keras.layers.Conv2D(64, 3, activation="relu")(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Flatten()(x)
outputs = keras.layers.Dense(10)(x)
cnn_model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_model")
cnn_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28)
Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 26, 26, 64)        640       
_________________________________________________________________
batch_normalization_6 (Batch (None, 26, 26, 64)        256       
_________________________________________________________________
flatten_6 (Flatten)          (None, 43264)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 10)                432650    
Total params: 433,546
Trainable params: 433,418
Non-trainable params: 128
_________________________________________________________________


In [33]:
cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
# training
cnn_model.fit(x_train, y_train, batch_size=256, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7a6a694898>

In [35]:
test = cnn_model.evaluate(x_test, y_test, verbose=2)
print("Loss test:", test[0])
print("Accuracy test:", test[1])

313/313 - 2s - loss: 0.2269 - accuracy: 0.9714
Loss test: 0.226945698261261
Accuracy test: 0.9714000225067139


In [34]:
# 3
df = pd.read_csv(io.StringIO(uploaded['youtube_comments.csv'].decode('utf-8')))
df = df.fillna(df.mean())
df.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [38]:
import nltk
nltk.download('stopwords')

def remove_stopwords(str_x):
    words = str_x.split(' ')
    neutral_words = ['edit', 'got', 'thing', 'want', 'make', 'is', 'do']
    new_words = list()
    for word in words:
        if word not in stopwords.words('english') or word not in neutral_words:
            new_words.append(word)
    return ' '.join(new_words)

df["comment_text"] = df["comment_text"].map(lambda x: re.sub(r'[^\w]', ' ', x))
df["comment_text"] = df["comment_text"].map(lambda x: x.lower())
df["comment_text"] = df["comment_text"].map(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [39]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["comment_text"].values).toarray()
Y = df.iloc[:, 3:].values
num_samples = X.shape[0]
train_samples = int(0.75*num_samples)
indexes = np.arange(num_samples)
random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
x_train, y_train = X[:train_samples, :], Y[:train_samples]
x_test, y_test = X[train_samples:, :], Y[train_samples:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(750, 9372) (750, 6)
(250, 9372) (250, 6)


In [42]:
inputs = keras.Input(shape=(9372, 1))
x = keras.layers.LSTM(128)(inputs)
outputs = keras.layers.Dense(6, activation="softmax")(x)
rnn_model = keras.Model(inputs=inputs, outputs=outputs, name="rnn_model")
rnn_model.summary()

Model: "rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 9372, 1)]         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               66560     
_________________________________________________________________
dense_18 (Dense)             (None, 6)                 774       
Total params: 67,334
Trainable params: 67,334
Non-trainable params: 0
_________________________________________________________________


In [44]:
rnn_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)

rnn_model.fit(x_train, y_train, batch_size=4, epochs=1, validation_split=0.2)



<tensorflow.python.keras.callbacks.History at 0x7f7a6c5c49e8>

In [46]:
test = rnn_model.evaluate(x_test, y_test, verbose=2)
print("Loss test:", test[0])
print("Accuracy test:", test[1])

8/8 - 16s - loss: 0.2473 - accuracy: 1.0000
Loss test: 0.24734418094158173
Accuracy test: 1.0
