In [1]:
# importing all the needed standard libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import seaborn as sns
sns.set(style="darkgrid")
import sklearn.metrics
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# NOTE(review): google.colab is imported here, so this cell presumably only runs inside Colab — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training and test CSVs from the mounted Drive folder.
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')
# Cell output: (rows, columns) of the training frame.
train_df.shape
# Uncomment to preview the first 50 training rows:
# train_df[:50]

In [None]:
# Hold out 10% of the labelled rows for validation and train on the other 90%.
# random_state pins the shuffle so the split is the same on every run.
training_data, validation_data = train_test_split(train_df, test_size=0.1, random_state=25)
print(training_data.shape, validation_data.shape)
# Preview a few rows instead of dumping the whole frame into the cell output.
training_data.head()

In [None]:
# Baseline model: a fixed keyword rule, nothing is learned from the data.
def disaster_baseline(text):
    """Label a tweet as a disaster when it contains a hashtag or a link.

    Args:
        text: The tweet text.

    Returns:
        1 if ``text`` contains "#" or "http", otherwise 0.
    """
    has_marker = any(marker in text for marker in ("#", "http"))
    return int(has_marker)

# Baseline predictions for the training and validation splits.
train_preds = [disaster_baseline(tweet) for tweet in training_data['text']]
val_preds = [disaster_baseline(tweet) for tweet in validation_data['text']]

# Same three reports for each split, training first: accuracy, F1, then the
# per-class precision/recall table.
for labels, preds in ((training_data['target'], train_preds),
                      (validation_data['target'], val_preds)):
    print('accuracy', sklearn.metrics.accuracy_score(labels, preds))
    print('f1', sklearn.metrics.f1_score(labels, preds))
    print(sklearn.metrics.classification_report(labels, preds))

In [None]:
#Output CSV
def output_kaggle_test(preds, filename='baseline.csv'):
  """Write a Kaggle submission file pairing each test id with its prediction.

  Args:
    preds: One predicted label per row of ``test_df``, in row order. May be a
      list, a 1-D array, or an (n, 1) array such as a thresholded
      ``model.predict`` result; it is flattened to one value per row.
    filename: Path of the CSV to write. Defaults to 'baseline.csv'; pass a
      different name to avoid overwriting an earlier submission.
  """
  kaggle_df = test_df[['id']].copy()
  # Flatten so a column-shaped (n, 1) prediction array becomes a plain column.
  kaggle_df['target'] = np.asarray(preds).reshape(-1)
  kaggle_df.to_csv(filename, index=False)
  display(kaggle_df)

# preds = [disaster_baseline(x) for x in test_df['text']]
# # output_kaggle_test(preds)

The Kaggle score is: 0.60435

# **VISUALIZATION**

In [None]:
# Histogram of the training labels (the `target` column), not of model predictions.
plt.hist(train_df['target'])
plt.show()
# NOTE(review): the original note states that non-disaster (target == 0) is the majority class — confirm against the plot.

In [None]:
# Character length of every training tweet, split by class.
disaster_lengths = train_df.loc[train_df['target'] == 1, 'text'].map(len).tolist()
non_disaster_lengths = train_df.loc[train_df['target'] == 0, 'text'].map(len).tolist()

# Overlay the two length distributions. density=True normalises each histogram
# to unit area, so the y-axis is a density rather than a raw count.
plt.hist(disaster_lengths, bins=50, alpha=0.7, label='Disaster Tweets', density=True)
plt.hist(non_disaster_lengths, bins=50, alpha=0.5, label='Non-Disaster Tweets', density=True)
plt.xlabel('Length of Tweet')
plt.ylabel('Density')
plt.legend(loc='upper left')
plt.show()


# The aim here is to see whether tweet length alone helps to predict the class.
# NOTE(review): the original note says longer tweets are more often non-disaster — confirm against the plot.

In [None]:
import collections

# All whitespace-separated words from the training tweets labelled as disasters.
disaster_words = ' '.join(train_df[train_df['target']==1]['text']).split()
set_disaster_words = set(disaster_words)

# The same for the training tweets labelled as non-disasters.
non_disaster_words = ' '.join(train_df[train_df['target']==0]['text']).split()
set_non_disaster_words = set(non_disaster_words)

# Words that occur in disaster tweets and never in non-disaster tweets.
only_disaster_word = set_disaster_words - set_non_disaster_words

# Frequency of every word within the disaster tweets.
disaster_word_counts = collections.Counter(disaster_words)

# Restrict those frequencies to the disaster-only words.
only_disaster_word_freq = {
    word: disaster_word_counts[word] for word in only_disaster_word}

# Most frequent first. Ties are broken alphabetically so the ranking does not
# depend on the iteration order of the set above.
sorted_only_disaster_word_freq = sorted(
    only_disaster_word_freq.items(), key=lambda item: (-item[1], item[0]))

# The 20 most frequent disaster-only words and their frequencies.
top_words = sorted_only_disaster_word_freq[:20]
print(top_words)
words = [word for word, count in top_words]
counts = [count for word, count in top_words]

# Bar chart of the 20 most frequent words that never appear in non-disaster tweets.
plt.bar(words, counts)
plt.title('Top 20 Most Frequent Disaster Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

# Below we can see the 20 most frequent words that are not present in the non-disaster tweets.

In [None]:
# Build the vocabulary that maps each word of the training tweets to an integer id.
from collections import Counter

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every whitespace-separated word in the training tweets.
all_words = ' '.join(train_df['text']).split()
print(len(all_words))

# Case-insensitive frequency of each word. These are counts, not token ids, so
# no id offset is applied to them.
word_counts = dict(Counter(word.lower() for word in all_words))
print(Counter(word_counts).most_common(20))

# Fit the Keras tokenizer on the words; '<OOV>' stands in for unknown words.
tokenizer = Tokenizer(num_words=None, oov_token='<OOV>', lower=True)
tokenizer.fit_on_texts(all_words)

# word_index maps each token to the integer id assigned by the tokenizer.
word_index = tokenizer.word_index
# Preview a few entries instead of printing the whole vocabulary.
print(list(word_index.items())[:20])

# The largest id in use is the largest value stored in word_index.
max_id = max(word_index.values())
print('Largest ID:', max_id)

In [None]:
def encode(text):
  """Convert one tweet into a list of integer token ids.

  The text is run through the fitted Keras ``tokenizer`` so that it is
  lower-cased and split the same way as the words the vocabulary was built
  from. A plain ``text.lower().split()`` keeps punctuation attached to the
  words (for example a leading "#"), and such strings are not keys of
  ``word_index``, so they would all be encoded as out-of-vocabulary.

  Args:
    text: The tweet text.

  Returns:
    A list of ints, one per token. Tokens that are not in the vocabulary get
    the id of the tokenizer's "<OOV>" token.
  """
  # The vocabulary was fitted on whitespace-split words, so collapse every run
  # of whitespace to a single space before tokenizing.
  normalized = ' '.join(text.split())
  return tokenizer.texts_to_sequences([normalized])[0]
# Encode every training tweet, then show one tweet next to its token ids.
encoded_tweets = list(map(encode, train_df['text']))
sample_tweet = train_df['text'][1]
print(sample_tweet)
print(encode(sample_tweet))

In [None]:
# Take an explicit copy of the two columns we need: assigning to a column of a
# slice of train_df triggers pandas' SettingWithCopyWarning.
train_data = train_df[['text', 'target']].copy()
train_data['text'] = train_data['text'].apply(encode)

# Split the encoded data with the same test_size and random_state as the
# raw-text split earlier in the notebook.
training_data, validation_data = train_test_split(train_data, test_size=0.1, random_state=25)
# Preview a few rows instead of printing the whole frame.
validation_data.head()

In [None]:
# As the length histogram shows, the encoded tweets are variable-length lists of token ids.
# Since fixed-length arrays are easier to work with in TensorFlow, we add special padding tokens at the end of
# each tweet until they are all the same length.

# We also use this operation to limit the number of token positions by truncating all tweets to a specified length.
# In the code below, as an example, we pad all training inputs to length 300.

def pad_data(sequences, max_length):
  """Pad or truncate token-id sequences to a common length.

  Args:
    sequences: Iterable of token-id lists.
    max_length: Number of ids in every returned row.

  Returns:
    A numpy array with one row of ``max_length`` ids per input sequence;
    shorter sequences are filled with 0 at the end.
  """
  padded_rows = tf.keras.preprocessing.sequence.pad_sequences(
      sequences, maxlen=max_length, padding='post', value=0)
  # Rebuild through a list so the result is a numpy array of rows.
  return np.array(list(padded_rows))

# Pad and truncate to 300 tokens.
train_data_padded = pad_data(training_data['text'], max_length=300)

# Compare the first example before and after padding. .iloc[0] picks it by
# position; len(training_data['text']) would be the number of rows instead.
print('Length of X_train[0]:', len(training_data['text'].iloc[0]))
print('Length of X_train_padded[0]:', len(train_data_padded[0]))
print(train_data_padded[0])

In [None]:
def limit_vocab(sequences, max_token_id, oov_id=1):
  """Map every token id at or above ``max_token_id`` to ``oov_id``.

  Args:
    sequences: Array (or nested list) of token ids.
    max_token_id: Smallest id that counts as out of vocabulary.
    oov_id: Replacement id for out-of-vocabulary tokens.

  Returns:
    A modified copy of ``sequences``; the input itself is left untouched.
  """
  capped = np.array(sequences, copy=True)
  too_large = capped >= max_token_id
  capped[too_large] = oov_id
  return capped
# Cap the 300-token padded training matrix at a vocabulary of 1000 ids:
# every id >= 1000 is replaced by limit_vocab's default oov_id (1).
X_train_reduced = limit_vocab(train_data_padded, max_token_id=1000)

# Show the first reduced example.
print(X_train_reduced[0])

In [None]:
# One-hot encode the training tweets: pad to 20 tokens, cap ids below 1000,
# then expand every id into a one-hot vector with Keras' utility.
X_train_padded = pad_data(training_data['text'], max_length=20)
X_train_reduced = limit_vocab(X_train_padded, max_token_id=1000)
# Pin the one-hot width to the vocabulary size. Without num_classes,
# to_categorical infers the width from the largest id present in the data.
X_train_one_hot = tf.keras.utils.to_categorical(X_train_reduced, num_classes=1000)
print(X_train_reduced[0])
print('X_train_one_hot shape:', X_train_one_hot.shape)

In [None]:
def build_onehot_model(average_over_positions=False):
  """Build and compile a small dense classifier for one-hot encoded inputs.

  Args:
    average_over_positions: If True, the first layer is a
      GlobalAveragePooling1D; if False, it is a Flatten that concatenates
      everything into one vector.

  Returns:
    A compiled tf.keras Sequential model: the pooling/flatten layer, a
    50-unit sigmoid hidden layer, and a single sigmoid output unit, trained
    with binary cross-entropy and Adam and reporting accuracy.
  """
  # Reset Keras state and seed the random generators before creating layers.
  tf.keras.backend.clear_session()
  tf.keras.utils.set_random_seed(0)

  if average_over_positions:
    reduce_positions = tf.keras.layers.GlobalAveragePooling1D()
  else:
    reduce_positions = tf.keras.layers.Flatten()

  model = tf.keras.Sequential([
      reduce_positions,
      # Hidden layer.
      tf.keras.layers.Dense(units=50, activation="sigmoid"),
      # One sigmoid unit for the binary label.
      tf.keras.layers.Dense(units=1, activation="sigmoid"),
  ])

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model

In [None]:
def plot_history(history):
  """Plot training and validation loss per epoch.

  Args:
    history: Object indexable by 'loss' and 'val_loss', each holding one value
      per epoch — for example a DataFrame built from ``History.history`` or
      that dict itself.
  """
  # Count the epochs directly. The earlier len(history['loss'] + 1) added 1 to
  # the loss values before measuring them, which fails for plain lists.
  num_epochs = len(history['loss'])

  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  # One tick per plotted point; the points sit at x = 0 .. num_epochs - 1.
  plt.xticks(range(num_epochs))
  plt.plot(history['loss'], label="training", marker='o')
  plt.plot(history['val_loss'], label="validation", marker='o')
  plt.legend()
  plt.show()

In [None]:
# Train the one-hot model and plot its loss curves.
model = build_onehot_model()

# Fit for 5 epochs in mini-batches of 64, holding out 10% of the examples for
# validation, and keep the per-epoch metrics as a DataFrame.
history = pd.DataFrame(
    model.fit(
        x=X_train_one_hot,            # one-hot training data
        y=training_data['target'],    # corresponding binary labels
        epochs=5,
        batch_size=64,
        validation_split=0.1,
        verbose=1,                    # show progress while training
    ).history)
plot_history(history)

In [None]:
def build_embeddings_model(average_over_positions=False,
                           vocab_size=1000,
                           sequence_length=20,
                           embedding_dim=2):
  """Build and compile a dense classifier on top of a learned embedding.

  Args:
    average_over_positions: If True, the embedding output goes through a
      GlobalAveragePooling1D; if False, through a Flatten.
    vocab_size: Number of distinct token ids (the embedding's input_dim).
    sequence_length: Number of token ids per example.
    embedding_dim: Size of each embedding vector.

  Returns:
    A compiled tf.keras Sequential model: embedding, pooling/flatten, a
    50-unit sigmoid hidden layer, and a single sigmoid output unit, trained
    with binary cross-entropy and Adam and reporting accuracy.
  """
  # Reset Keras state and seed the random generators before creating layers.
  tf.keras.backend.clear_session()
  tf.keras.utils.set_random_seed(0)

  embedding = tf.keras.layers.Embedding(
      input_dim=vocab_size,
      output_dim=embedding_dim,
      input_length=sequence_length)

  if average_over_positions:
    combine_positions = tf.keras.layers.GlobalAveragePooling1D()
  else:
    combine_positions = tf.keras.layers.Flatten()

  model = tf.keras.Sequential([
      embedding,
      combine_positions,
      # Hidden layer.
      tf.keras.layers.Dense(units=50, activation="sigmoid"),
      # One sigmoid unit for the binary label.
      tf.keras.layers.Dense(units=1, activation='sigmoid'),
  ])

  model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
  return model

In [None]:
# Train the averaged-embeddings model on the integer-encoded tweets.
model = build_embeddings_model(average_over_positions=True,
                               vocab_size=1000,
                               sequence_length=20,
                               embedding_dim=64)

# Fit for 5 epochs in mini-batches of 64, holding out 10% of the examples for
# validation, and keep the per-epoch metrics as a DataFrame.
history = pd.DataFrame(
    model.fit(
        x=X_train_reduced,            # padded token ids, capped at the vocab size
        y=training_data['target'],    # corresponding binary labels
        epochs=5,
        batch_size=64,
        validation_split=0.1,
        verbose=1,                    # show progress while training
    ).history)
plot_history(history)

In [None]:
#Output CSV
print(test_df.head())
# Keep the encoded text in its own variable. Writing it back into test_df would
# make a second run of this cell call encode() on lists instead of strings.
test_encoded = test_df['text'].apply(encode)
test_data_padded = pad_data(test_encoded, max_length=20)
X_test_reduced = limit_vocab(test_data_padded, max_token_id=1000)

predicted_probabilities = model.predict(X_test_reduced)
print(predicted_probabilities[:10])
# Threshold at 0.5 and flatten to one label per test row before writing.
predicted_classes = (predicted_probabilities > 0.5).astype(int).ravel()
output_kaggle_test(predicted_classes)

In [None]:
def build_bow_model(vocab_size=1000,
                    sequence_length=20,
                    embedding_dim=2):
  """Build and compile an averaged-embedding classifier with two hidden layers.

  Args:
    vocab_size: Number of distinct token ids (the embedding's input_dim).
    sequence_length: Number of token ids per example.
    embedding_dim: Size of each embedding vector.

  Returns:
    A compiled tf.keras Sequential model: embedding, GlobalAveragePooling1D,
    two 32-unit sigmoid layers, and a single sigmoid output unit, trained with
    binary cross-entropy and Adam and reporting accuracy.
  """
  layers = [
      tf.keras.layers.Embedding(
          input_dim=vocab_size,
          output_dim=embedding_dim,
          input_length=sequence_length),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(units=32, activation='sigmoid'),
      tf.keras.layers.Dense(units=32, activation='sigmoid'),
      # One sigmoid unit for the binary label.
      tf.keras.layers.Dense(units=1, activation='sigmoid'),
  ]
  model = tf.keras.Sequential(layers)

  model.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  return model

bow_model = build_bow_model(
    vocab_size=1000, sequence_length=20, embedding_dim=16)

# Integer-encoded inputs: 20 tokens per tweet, ids capped below 1000.
# The one-hot tensor is not rebuilt here because this model never uses it.
X_train_padded = pad_data(training_data['text'], max_length=20)
X_train_reduced = limit_vocab(X_train_padded, max_token_id=1000)

# Train the model built above. Calling fit on `model` here would keep training
# the earlier embeddings model and leave bow_model untrained.
history = bow_model.fit(
  x = X_train_reduced,  # our sparse padded training data
  y = training_data['target'],          # corresponding binary labels
  epochs=5,             # number of passes through the training data
  batch_size=64,        # mini-batch size
  validation_split=0.1, # use a fraction of the examples for validation
  verbose=1             # display some progress output during training
  )

history = pd.DataFrame(history.history)
plot_history(history)

In [None]:
# Functional Keras API
def build_cnn_model(vocab_size,
                    sequence_length,
                    embedding_dim):
  """Build and compile a 1-D convolutional text classifier.

  Three parallel Conv1D branches (kernel sizes 3, 4 and 5, 32 filters each)
  read the same embeddings. Their outputs are concatenated, passed through
  dropout, max-pooled over the whole sequence, flattened, and fed to a single
  sigmoid unit.

  Args:
    vocab_size: Number of distinct token ids (the embedding's input_dim).
    sequence_length: Number of token ids per example.
    embedding_dim: Size of each embedding vector.

  Returns:
    A compiled tf.keras Model trained with binary cross-entropy and Adam and
    reporting accuracy.
  """
  # shape must be a tuple; (sequence_length) without the comma is just an int.
  x = tf.keras.Input(shape=(sequence_length,))

  emb = tf.keras.layers.Embedding(input_dim=vocab_size,
                                  output_dim=embedding_dim,
                                  input_length=sequence_length)(x)

  # padding='same' keeps every branch at sequence_length steps, so the
  # branches can be concatenated and pooled together below.
  branches = [
      tf.keras.layers.Conv1D(
          filters=32, kernel_size=kernel_size, padding='same',
          activation='relu')(emb)
      for kernel_size in (3, 4, 5)
  ]

  y = tf.keras.layers.Concatenate()(branches)
  y = tf.keras.layers.Dropout(rate=0.05)(y)
  # A pool as wide as the sequence leaves one maximum per filter.
  y = tf.keras.layers.MaxPool1D(pool_size=sequence_length)(y)
  y = tf.keras.layers.Flatten()(y)
  y = tf.keras.layers.Dense(units=1, activation='sigmoid')(y)

  model = tf.keras.Model(inputs=x, outputs=y, name='imdb_cnn')

  model.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  return model

cnn_model = build_cnn_model(
    vocab_size=1000, sequence_length=20, embedding_dim=128)

# Integer-encoded inputs: 20 tokens per tweet, ids capped below 1000.
# The one-hot tensor is not rebuilt here because this model never uses it.
X_train_padded = pad_data(training_data['text'], max_length=20)
X_train_reduced = limit_vocab(X_train_padded, max_token_id=1000)

# Train the CNN built above. Calling fit on `model` here would keep training
# the earlier embeddings model and leave cnn_model untrained.
history = cnn_model.fit(
  x = X_train_reduced,  # our sparse padded training data
  y = training_data['target'],          # corresponding binary labels
  epochs=5,             # number of passes through the training data
  batch_size=64,        # mini-batch size
  validation_split=0.1, # use a fraction of the examples for validation
  verbose=1             # display some progress output during training
  )

history = pd.DataFrame(history.history)
plot_history(history)

In [None]:
def create_neural_network():
    """Build and compile a small 2-D convolutional binary classifier.

    The network takes 28x28 single-channel inputs and stacks two
    Conv2D/MaxPooling2D stages, a 128-unit ReLU layer, and one sigmoid unit.

    NOTE(review): nothing in the visible notebook calls this function, and its
    input shape does not match the padded token-id matrices used by the other
    models — confirm whether it is still needed.

    Returns:
        A compiled tf.keras Sequential model trained with binary cross-entropy
        and Adam and reporting accuracy.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
        tf.keras.layers.MaxPooling2D(2,2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        # Softmax over a single unit always outputs 1.0, so use a sigmoid
        # for the one-unit output.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Binary cross-entropy pairs with the single sigmoid output, matching the
    # other model builders in this notebook.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model
# NOTE(review): this model is built with build_cnn_model, not with
# create_neural_network defined just above — confirm that is intended.
neural_model = build_cnn_model(
    vocab_size=1000, sequence_length=20, embedding_dim=128)

# Integer-encoded inputs: 20 tokens per tweet, ids capped below 1000.
# The one-hot tensor is not rebuilt here because this model never uses it.
X_train_padded = pad_data(training_data['text'], max_length=20)
X_train_reduced = limit_vocab(X_train_padded, max_token_id=1000)

history = neural_model.fit(
  x = X_train_reduced,  # our sparse padded training data
  y = training_data['target'],          # corresponding binary labels
  epochs=5,             # number of passes through the training data
  batch_size=64,        # mini-batch size
  validation_split=0.1, # use a fraction of the examples for validation
  verbose=1             # display some progress output during training
  )

history = pd.DataFrame(history.history)
plot_history(history)