<a href="https://colab.research.google.com/github/suhailamohammed/nlp-disaster-tweet-classifier/blob/main/disaster_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install kaggle



In [None]:
import os
from getpass import getpass

# The kaggle CLI invoked below authenticates through these two environment variables.
# Reuse values that are already exported and only prompt when one is missing, so
# real credentials are never overwritten by placeholders or stored in the notebook.
if not os.environ.get('KAGGLE_USERNAME'):
  os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ').strip()
if not os.environ.get('KAGGLE_KEY'):
  # getpass keeps the API key out of the cell output.
  os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ').strip()

In [None]:
!kaggle competitions download -c nlp-getting-started
!unzip nlp-getting-started.zip

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 598MB/s]
Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
# Load the labelled training tweets extracted from the competition archive.
train_df = pd.read_csv('train.csv')

In [None]:
# Preview the first five rows of the training data.
print(train_df.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [None]:
# Text of every tweet whose target is 0 (the non-disaster class).
train_df.loc[train_df["target"] == 0, "text"]

Unnamed: 0,text
15,What's up man?
16,I love fruits
17,Summer is lovely
18,My car is so fast
19,What a goooooooaaaaaal!!!!!!
...,...
7581,@engineshed Great atmosphere at the British Li...
7582,Cramer: Iger's 3 words that wrecked Disney's s...
7584,These boxes are ready to explode! Exploding Ki...
7587,Sirens everywhere!


In [None]:
# Class balance: number of tweets per target value (1 = disaster, 0 = non-disaster).
target_counts = train_df["target"].value_counts()
count_disaster_text = int(target_counts.get(1, 0))
count_non_disaster_text = int(target_counts.get(0, 0))

print(count_disaster_text, count_non_disaster_text)

3271 4342


In [None]:
# Separate the labels from the features.
# The guard keeps the cell re-runnable: once "target" has been removed, executing
# the cell again leaves train_target and train_df untouched instead of raising
# on the missing column.
if "target" in train_df.columns:
  train_target = train_df["target"]

  train_df = train_df.drop(columns="target")

In [None]:
# Inspect the feature frame and the label series after the split.
print(train_df, train_target)

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  
0     Our Deeds are the Reason of this #earthquake M...  
1                Forest fire near La Ronge Sask. Canada  
2     All residents asked to 'shelter in place' are ...  
3     13,000 people receive #wildfires evacuation or...  
4     Just got sent this photo from Ruby #Alaska as ...  
...                                                 ...  
7608  Two giant cranes holding a bridge collapse int...  
7609  @aria_ahrary @TheTawniest The out of control w...  
7610  M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...  
7611  Police investigating after an

## Dataset cleaning

In [None]:
import re

def clean_text(text):
  """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

  Steps, in order: lowercase, drop '#' characters (the tagged word is kept),
  remove URLs starting with "http", remove @mentions, drop every character
  that is not a-z or whitespace, then trim and collapse whitespace.

  Args:
    text: Raw tweet text. Non-string values (e.g. NaN coming from pandas)
      are treated as empty text.

  Returns:
    The cleaned string with no leading/trailing whitespace and exactly one
    space between words; "" when nothing remains.
  """
  if not isinstance(text, str):
    return ""

  # lower-case first so the a-z filter below keeps letters that were capitals
  text = text.lower()

  # remove the hashtag symbol but keep the tagged word
  text = text.replace("#", '')

  # remove urls (anything starting with "http" up to the next whitespace)
  text = re.sub(r"http\S+", "", text)

  # remove @mentions
  text = re.sub(r"@\w+", "", text)

  # remove digits, punctuation and every other non a-z character
  text = re.sub(r"[^a-z\s]", "", text)

  # Trim and collapse whitespace last: the removals above leave leading,
  # trailing and doubled spaces behind, so this must run after them.
  return " ".join(text.split())

# Store the cleaned version of every tweet next to the raw text.
train_df["cleaned_text"] = train_df["text"].apply(clean_text)

In [None]:
# Preview the cleaned text of the first five tweets.
print(train_df.cleaned_text.head())

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: cleaned_text, dtype: object


## Converting texts into vectors

In [None]:
!pip install gensim

### Using the Word2Vec method

In [None]:
import gensim.downloader as api

# Load pre-trained Word2Vec embeddings through the gensim downloader.
# The returned object is indexed by token (`word2vec_model[token]`), supports
# `token in word2vec_model`, and exposes `vector_size`; all three are used below.
word2vec_model = api.load("word2vec-google-news-300")




In [None]:
def tweet_to_vectors(text, keyed_vectors=None):
  """Embed a tweet as the mean of its in-vocabulary word vectors.

  Args:
    text: Cleaned tweet; tokens are obtained by splitting on whitespace.
    keyed_vectors: Optional embedding lookup supporting `token in obj`,
      `obj[token]` and `obj.vector_size`. Defaults to the notebook-level
      `word2vec_model`.

  Returns:
    A 1-D array of length `vector_size`: the element-wise mean of the vectors
    of the known tokens, or a float32 zero vector when no token is known.
  """
  if keyed_vectors is None:
    keyed_vectors = word2vec_model

  vectors = [keyed_vectors[token] for token in text.split() if token in keyed_vectors]

  if not vectors:
    # float32 rather than NumPy's float64 default: a float64 fallback row would
    # upcast the whole matrix when rows are stacked.
    # Assumes the embeddings are float32 (typical for pre-trained Word2Vec) - confirm.
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

  return np.mean(vectors, axis=0)

# One averaged word vector per cleaned training tweet (a Series holding one array per row).
text_to_vectors = train_df["cleaned_text"].apply(tweet_to_vectors)

In [None]:
# Preview the first five averaged vectors.
print(text_to_vectors.head())

0    [0.07192993, 0.072657265, 0.11334864, 0.144185...
1    [0.08525391, -0.011193847, 0.0027526855, 0.136...
2    [-0.0104747955, -0.0020635696, 0.088192895, 0....
3    [0.15774973, -0.069492884, 0.024989536, 0.1913...
4    [0.061075848, 0.021800868, -0.027213542, 0.076...
Name: cleaned_text, dtype: object


### Using BERT embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Load the tokenizer and the encoder from the same pre-trained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model_bert = AutoModel.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
import numpy as np
import torch

def get_cls_embeddings_batch(texts, batch_size=32, max_length=128):
    """Encode texts with the notebook-level model and return first-token embeddings.

    Uses the module-level `tokenizer` and `model_bert`.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...).
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D NumPy array with one row per text, taken from position 0 of
        `last_hidden_state` (the [CLS] position for BERT-style tokenizers).
        An empty input yields an array with zero rows.
    """
    # The tokenizer expects a plain list of str, so materialize any iterable first.
    texts = list(texts)

    if not texts:
        # np.vstack([]) raises, so return a correctly shaped empty matrix instead.
        # float32 assumes the model's default output dtype - confirm if the model is cast.
        return np.empty((0, model_bert.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model currently lives on (CPU unless it was moved).
    device = next(model_bert.parameters()).device
    embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        tokens = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)

        # Inference only: skip gradient tracking to save memory and time.
        with torch.no_grad():
            outputs = model_bert(**tokens)

        # .cpu() is a no-op on CPU and is required before .numpy() on an accelerator.
        cls_batch = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_batch)

    return np.vstack(embeddings)

# Embed every cleaned training tweet; the column is converted to a plain list of strings first.
tweet_embeddings = get_cls_embeddings_batch(train_df.cleaned_text.tolist(), batch_size=32)

In [None]:
# Show the embedding matrix (NumPy abbreviates large arrays when printing).
print(tweet_embeddings)

[[-0.00865033  0.18199196 -0.32401228 ...  0.0197432   0.5983778
  -0.07121403]
 [-0.41640204  0.19298902 -0.5031334  ... -0.20960745  0.13687709
   0.3023416 ]
 [-0.12580562 -0.13072777  0.30824253 ... -0.22407363  0.00092162
   0.3074311 ]
 ...
 [-0.32509398  0.06208956  0.08049329 ... -0.1205381   0.37776268
   0.42699742]
 [-0.3837271  -0.35905495 -0.1316195  ... -0.32599795  0.11911261
   0.35433948]
 [-0.18678047 -0.20065098  0.05963195 ... -0.21261446  0.5797842
   0.01546859]]


## Train the model (Logistic regression)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier; random_state is fixed for reproducible fits.
model = LogisticRegression(
    solver="lbfgs",
    C=1.0,
    max_iter=200,
    random_state=42,
)

In [None]:
# Feature matrix built from the averaged Word2Vec vectors (one row per tweet).
# NOTE: the following cell reassigns x to the DistilBERT embeddings, so when the
# notebook is run top to bottom these Word2Vec features are not the ones passed to fit.
x = np.vstack(text_to_vectors)
y = train_target.values

In [None]:
# Feature matrix built from the DistilBERT first-token embeddings; this replaces any
# x assigned earlier, so it is what model.fit receives on a top-to-bottom run.
x = tweet_embeddings
y = train_target.values

In [None]:
# Fit the classifier on whichever x / y were assigned last.
model.fit(x, y)

## Test the model

In [None]:
# Load the competition test set (no target column) and preview its first rows.
test_df = pd.read_csv('test.csv')

test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Clean the test tweets and average their Word2Vec vectors, mirroring the training pipeline.
text_to_vectors_test = test_df["text"].apply(clean_text).apply(tweet_to_vectors)

In [None]:
# Clean the test tweets and embed them with DistilBERT, mirroring the training pipeline.
test_tweet_embeddings = get_cls_embeddings_batch((test_df["text"].apply(clean_text)).tolist())

In [None]:
# The classifier can only score features with the dimensionality it was fitted on.
# Predict from the Word2Vec vectors only when the model was trained on them;
# otherwise skip instead of raising a feature-count error during a full run.
word2vec_test_features = np.vstack(text_to_vectors_test)
if word2vec_test_features.shape[1] == model.n_features_in_:
  y_pred = model.predict(word2vec_test_features)
else:
  print(
      "Skipping Word2Vec predictions: the model was fitted on "
      f"{model.n_features_in_}-dimensional features, not "
      f"{word2vec_test_features.shape[1]}."
  )

In [None]:
# The classifier can only score features with the dimensionality it was fitted on.
# Predict from the DistilBERT embeddings only when the model was trained on them;
# otherwise skip instead of raising a feature-count error.
if test_tweet_embeddings.shape[1] == model.n_features_in_:
  y_pred = model.predict(test_tweet_embeddings)
else:
  print(
      "Skipping DistilBERT predictions: the model was fitted on "
      f"{model.n_features_in_}-dimensional features, not "
      f"{test_tweet_embeddings.shape[1]}."
  )

In [None]:
# Load the submission template (id and target columns).
sample_submission = pd.read_csv('sample_submission.csv')

In [None]:
# Predictions are assigned by position, so the template rows must be in the same
# order as the test tweets the predictions were computed from.
assert np.array_equal(sample_submission["id"].to_numpy(), test_df["id"].to_numpy()), \
    "sample_submission.csv and test.csv ids are not aligned"
sample_submission["target"] = y_pred

In [None]:
# Preview the first rows of the submission.
print(sample_submission.head())

   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1


In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv("submission.csv", index=False)