<a href="https://colab.research.google.com/github/sazad007/NLP-Partly_Sunny_with_a_Chance_of_Hashtags/blob/main/Partly_Sunny_with_a_Chance_of_Hashtags.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
import numpy as np
import pandas as pd
import re
import nltk
from google.colab import drive
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import KLDivergence

In [59]:
# Mount Google Drive at /content/drive; the word2vec model file loaded later
# in this notebook is read from /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
!pip install gensim



In [61]:
from gensim.models import KeyedVectors

# Word vectors stored on the mounted Drive (the file name suggests the
# Google News word2vec model -- confirm against how the file was saved).
WORD2VEC_PATH = '/content/drive/MyDrive/word2vec-google-news.model'
word2vec = KeyedVectors.load(WORD2VEC_PATH)

In [62]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, minus a few words that are deliberately kept so they
# still contribute to the sentence vector built by _tokenizer.
KEPT_WORDS = {'not', 'no', 'very'}
stop_words = set(stopwords.words('english')) - KEPT_WORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
# Every column from this position onward is used as a target value.
# NOTE(review): presumably 24 label columns (5 + 4 + 15) -- confirm against train.csv.
LABEL_START_COL = 4

data = pd.read_csv('train.csv')
X = data['tweet'].values                       # raw tweet text
y = data.iloc[:, LABEL_START_COL:].values      # target matrix

In [64]:
def _tokenizer(sentence):
  sentence = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
  words = [word for word in sentence if word in word2vec and word not in stop_words]
  if not words:
    return np.zeros(word2vec.vector_size)
  else:
    return np.mean([word2vec[word] for word in words], axis=0)


In [65]:
# One fixed-length sentence vector per tweet, stacked into a 2-D feature matrix.
X_vec = np.array(list(map(_tokenizer, X)))

In [66]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

# Column boundaries between the three target groups:
# columns [0, 5) -> *_s, [5, 9) -> *_w, [9, end) -> *_k.
HEAD_BOUNDARIES = [5, 9]

y_train_s, y_train_w, y_train_k = np.split(y_train, HEAD_BOUNDARIES, axis=1)
y_test_s, y_test_w, y_test_k = np.split(y_test, HEAD_BOUNDARIES, axis=1)

In [71]:
# Shared trunk: sentence vector -> 600 ReLU -> dropout -> 300 ReLU.
input_layer = Input(shape=(word2vec.vector_size,))
x = Dense(600, activation='relu')(input_layer)
x = Dropout(0.5)(x)
x = Dense(300, activation='relu')(x)

# Three output heads on top of the shared trunk. The two softmax heads are
# trained with categorical cross-entropy, the sigmoid head with binary
# cross-entropy (see `losses` below).
sentiment_ = Dense(5, activation='softmax', name='sentiment')(x)
when_ = Dense(4, activation='softmax', name='when')(x)
kind_ = Dense(15, activation='sigmoid', name='kind')(x)

model = Model(inputs=input_layer, outputs=[sentiment_, when_, kind_])

# Per-head loss, keyed by the layer names given above.
losses = dict(
    sentiment='categorical_crossentropy',
    when='categorical_crossentropy',
    kind='binary_crossentropy',
)

# Every head reports the same metric.
metrics = {head: 'accuracy' for head in losses}

model.compile(optimizer='adam', loss=losses, metrics=metrics)

# Targets are passed in the same order as the model's outputs list.
train_targets = [y_train_s, y_train_w, y_train_k]
val_targets = [y_test_s, y_test_w, y_test_k]

model.fit(
    x=X_train,
    y=train_targets,
    validation_data=(X_test, val_targets),
    epochs=15,
    batch_size=64,
)

Epoch 1/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - kind_accuracy: 0.4719 - kind_loss: 0.2248 - loss: 2.1663 - sentiment_accuracy: 0.5802 - sentiment_loss: 1.1644 - when_accuracy: 0.7776 - when_loss: 0.7771 - val_kind_accuracy: 0.7270 - val_kind_loss: 0.1341 - val_loss: 1.8784 - val_sentiment_accuracy: 0.6504 - val_sentiment_loss: 1.0426 - val_when_accuracy: 0.7966 - val_when_loss: 0.7016
Epoch 2/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - kind_accuracy: 0.7290 - kind_loss: 0.1339 - loss: 1.8865 - sentiment_accuracy: 0.6453 - sentiment_loss: 1.0519 - when_accuracy: 0.7990 - when_loss: 0.7007 - val_kind_accuracy: 0.7725 - val_kind_loss: 0.1197 - val_loss: 1.8431 - val_sentiment_accuracy: 0.6528 - val_sentiment_loss: 1.0278 - val_when_accuracy: 0.7974 - val_when_loss: 0.6955
Epoch 3/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - kind_accuracy: 0.7551 - kind_loss: 0.1239 - loss: 

<keras.src.callbacks.history.History at 0x7ae6d83dab10>

In [72]:
# Score the held-out split; `results` is the flat list of loss and metric
# values returned by Keras for the three heads.
results = model.evaluate(x=X_test, y=[y_test_s, y_test_w, y_test_k])
print("Test loss and accuracy:", results)

[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - kind_accuracy: 0.8004 - kind_loss: 0.1061 - loss: 1.7657 - sentiment_accuracy: 0.6817 - sentiment_loss: 0.9851 - when_accuracy: 0.8062 - when_loss: 0.6745
Test loss and accuracy: [1.7778160572052002, 0.994020402431488, 0.6788996458053589, 0.10500539094209671, 0.8043617606163025, 0.6819756031036377, 0.806735098361969]


In [73]:
# Smoke-test the trained model on a single hand-written sentence.
sentence = 'Very bad weather today'
X_v = np.array([_tokenizer(sentence)])

y_pred = model.predict(X_v)

# One single-row frame per output head, in the order of the model's outputs.
s, w, k = map(pd.DataFrame, y_pred)
print(s, w, k, sep='\n')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
          0         1         2         3         4
0  0.039956  0.785734  0.065022  0.080945  0.028343
          0         1         2         3
0  0.923357  0.006972  0.027897  0.041774
        0         1         2         3         4         5        6   \
0  0.02167  0.001254  0.001406  0.002118  0.000848  0.000177  0.93524   

         7       8         9         10        11       12        13        14  
0  0.000265  0.1118  0.007221  0.000143  0.002793  0.00513  0.000384  0.000643  


In [74]:
# Vectorise the test tweets with the same tokenizer used for training,
# then predict all three heads at once.
test_data = pd.read_csv('test.csv')
X_t = np.array(list(map(_tokenizer, test_data['tweet'].values)))

y_pred = model.predict(X_t)
s, w, k = y_pred

[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [75]:
# Column names s1..s5, w1..w4, k1..k15, generated instead of typed out by hand.
label_columns = [
    f'{prefix}{i}'
    for prefix, count in (('s', 5), ('w', 4), ('k', 15))
    for i in range(1, count + 1)
]

# id column followed by the three prediction blocks, side by side.
submission = np.column_stack((test_data['id'], s, w, k))

np.savetxt(
    'output.csv',
    submission,
    header=','.join(['id'] + label_columns),
    comments='',  # no '# ' prefix in front of the header line
    delimiter=',',
    fmt=['%d'] + ['%.3f'] * len(label_columns),  # integer id, 3-decimal predictions
)