<a href="https://colab.research.google.com/github/sazad007/NLP-CrowdFlower-Weather-Prediction-with-Twitter/blob/main/Partly_Sunny_with_a_Chance_of_Hashtags.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re

import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.losses import KLDivergence
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

In [2]:
# Mount Google Drive at /content/drive; the word2vec model loaded in a later
# cell is read from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install gensim



In [4]:
from gensim.models import KeyedVectors

# Load the saved keyed vectors from the mounted Drive folder; `word2vec` is the
# lookup table used by `_tokenizer` and sizes the model's input layer.
word2vec = KeyedVectors.load(
    '/content/drive/MyDrive/word2vec-google-news.model'
)

In [5]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, minus 'not', 'no' and 'very', which are kept as tokens.
stop_words = set(stopwords.words('english')).difference({'not', 'no', 'very'})

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Training set: raw tweet text in the 'tweet' column; every column from
# position 4 onwards is used as a target.
data = pd.read_csv('train.csv')
X = data.loc[:, 'tweet'].values
y = data.iloc[:, 4:].values

In [7]:
def _tokenizer(sentence):
  sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
  sentence = re.sub(r'http\S+', ' ', sentence)
  sentence = re.sub(r'\s+', ' ', sentence)
  sentence = sentence.lower().split()
  words = [word for word in sentence if word in word2vec and word not in stop_words]
  if not words:
    return np.zeros(word2vec.vector_size)
  else:
    return np.mean([word2vec[word] for word in words], axis=0)


In [8]:
# One averaged embedding per training tweet, stacked row-wise.
X_vec = np.array(list(map(_tokenizer, X)))

In [9]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

# Cut the target matrix column-wise into the three groups fed to the model's
# output heads: columns [0:5] -> *_s, [5:9] -> *_w, [9:] -> *_k.
y_train_s, y_train_w, y_train_k = np.split(y_train, [5, 9], axis=1)
y_test_s, y_test_w, y_test_k = np.split(y_test, [5, 9], axis=1)

In [10]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable, in line with the seeded
# train/test split.
set_random_seed(42)

# Shared trunk: sentence embedding -> 600 -> dropout -> 300.
input_layer = Input(shape=(word2vec.vector_size,))
x = Dense(units=600, activation='relu')(input_layer)
x = Dropout(0.5)(x)
x = Dense(units=300, activation='relu')(x)

# Three output heads on top of the shared trunk. 'sentiment' and 'when' use
# softmax (outputs sum to 1); 'kind' uses independent sigmoids per unit.
sentiment_ = Dense(units=5, activation='softmax', name='sentiment')(x)
when_ = Dense(units=4, activation='softmax', name='when')(x)
kind_ = Dense(units=15, activation='sigmoid', name='kind')(x)


model = Model(
    inputs=input_layer,
    outputs=[sentiment_, when_, kind_]
)

# Loss per head, keyed by the output layer names above.
losses = {
    'sentiment': 'categorical_crossentropy',
    'when': 'categorical_crossentropy',
    'kind': 'binary_crossentropy'
}

metrics = {
    'sentiment': 'accuracy',
    'when': 'accuracy',
    'kind': 'accuracy'
}

model.compile(optimizer='adam', loss=losses, metrics=metrics)

# Keep the History object instead of letting its repr become the cell output;
# it holds the per-epoch curves for later inspection.
history = model.fit(
    x=X_train,
    y=[y_train_s, y_train_w, y_train_k],
    epochs=15,
    validation_data=(X_test, [y_test_s, y_test_w, y_test_k]),
    batch_size=64,
)

Epoch 1/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - kind_accuracy: 0.4649 - kind_loss: 0.2314 - loss: 2.1760 - sentiment_accuracy: 0.5828 - sentiment_loss: 1.1636 - when_accuracy: 0.7771 - when_loss: 0.7810 - val_kind_accuracy: 0.7298 - val_kind_loss: 0.1318 - val_loss: 1.8911 - val_sentiment_accuracy: 0.6405 - val_sentiment_loss: 1.0587 - val_when_accuracy: 0.7960 - val_when_loss: 0.7005
Epoch 2/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - kind_accuracy: 0.7271 - kind_loss: 0.1332 - loss: 1.8754 - sentiment_accuracy: 0.6467 - sentiment_loss: 1.0455 - when_accuracy: 0.7970 - when_loss: 0.6966 - val_kind_accuracy: 0.7666 - val_kind_loss: 0.1189 - val_loss: 1.8450 - val_sentiment_accuracy: 0.6539 - val_sentiment_loss: 1.0309 - val_when_accuracy: 0.7996 - val_when_loss: 0.6951
Epoch 3/15
[1m975/975[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - kind_accuracy: 0.7516 - kind_loss: 0.1241 - loss: 

<keras.src.callbacks.history.History at 0x7a1d7a7b7050>

In [11]:
# Score the held-out split on all three heads; `results` is the flat list of
# loss/metric values returned by Keras.
results = model.evaluate(
    x=X_test,
    y=[y_test_s, y_test_w, y_test_k],
)
print("Test loss and accuracy:", results)

[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - kind_accuracy: 0.8046 - kind_loss: 0.1058 - loss: 1.7619 - sentiment_accuracy: 0.6851 - sentiment_loss: 0.9856 - when_accuracy: 0.8099 - when_loss: 0.6705
Test loss and accuracy: [1.7731682062149048, 0.994252622127533, 0.6743028163909912, 0.10482947528362274, 0.8028864860534668, 0.6799871921539307, 0.8106478452682495]


In [16]:
# Sanity check: run one hand-written sentence through the trained model.
sentence = 'it was a bad stormy day yesterday'
X_v = np.array([_tokenizer(sentence)])

# Demo-specific names on purpose: `y_pred`, `s`, `w` and `k` hold the full
# test-set predictions used by the CSV export cell, and this cell must not
# overwrite them if it is re-run afterwards.
demo_s, demo_w, demo_k = model.predict(X_v)

# One single-row table per output head (sentiment, when, kind).
for demo_scores in (demo_s, demo_w, demo_k):
  print(pd.DataFrame(demo_scores))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
          0         1         2         3         4
0  0.059585  0.708592  0.071413  0.016249  0.144162
          0         1         2         3
0  0.468103  0.034783  0.057896  0.439217
         0         1        2         3         4         5         6   \
0  0.137086  0.107765  0.00181  0.019486  0.001018  0.000589  0.119888   

         7         8         9         10        11        12       13  \
0  0.010288  0.068035  0.074769  0.001715  0.573925  0.100686  0.00024   

         14  
0  0.022258  


In [13]:
# Embed every tweet of the test file with the same pipeline used for training.
test_data = pd.read_csv('test.csv')
X_t = np.array(list(map(_tokenizer, test_data['tweet'].values)))

# The model returns one array per head, in the order sentiment, when, kind.
s, w, k = y_pred = model.predict(X_t)

[1m1318/1318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step


In [14]:
# Write the export file: the id column as an integer followed by the 24
# predicted scores (5 + 4 + 15), each rounded to three decimals.
np.savetxt(
    'output.csv',
    np.column_stack((test_data['id'], s, w, k)),
    header='id,s1,s2,s3,s4,s5,w1,w2,w3,w4,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15',
    comments='',
    delimiter=',',
    fmt=['%d'] + ['%.3f'] * 24,
)