Competition description

Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g. disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

But, it’s not always clear whether a person’s words are actually announcing a disaster.

Acknowledgments

This dataset was created by the company figure-eight and originally shared on their ‘Data For Everyone’ website here.

Import libraries

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

List the available input files (train, test, and sample submission)

In [None]:
import os

# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

Read

In [None]:
# Load the three competition files. `train` still contains the `target` column
# at this point; `submission` is reused later as the frame that is written out.
train  = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Display the training frame as loaded.
train

In [None]:
# Display the test frame as loaded.
test

In [None]:
# Display the sample submission frame as loaded.
submission

Combine train and test

In [None]:
# Separate the label column from the feature columns.
# The guard keeps the cell re-runnable: once `target` has been split off, a
# second execution would otherwise fail because `train` no longer has the column.
if 'target' in train.columns:
    target = train['target']
    # Rebind to a new frame instead of dropping in place.
    train = train.drop(columns='target')
train

In [None]:
# Stack train on top of test so the text cleaning is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row labels.
combi = pd.concat([train, test])
combi

Impute any null values

In [None]:
# Count missing values per column of the combined frame.
combi.isnull().sum()

In [None]:
# Replace missing values with the placeholder string "not listed"
# (a fixed marker, not the column mode).
for column in ('location', 'keyword'):
    combi[column] = combi[column].fillna("not listed")

In [None]:
# Re-count missing values per column to confirm none remain in the filled columns.
combi.isnull().sum()

Compare tweets in train and test file

In [None]:
# Compare the tweet-length distribution (number of characters) of train vs. test.
length_train = train['text'].str.len()
length_test = test['text'].str.len()

fig, ax = plt.subplots()
# Semi-transparent bars keep the first histogram visible where the two overlap.
ax.hist(length_train, bins=20, alpha=0.6, label="train_tweets")
ax.hist(length_test, bins=20, alpha=0.6, label="test_tweets")
ax.set(xlabel="tweet length (characters)", ylabel="number of tweets")
ax.legend()
plt.show()

Identify percentage of disaster tweets

In [None]:
# Number of training rows per target value.
target.value_counts()

In [None]:
# Share of each target value, expressed as a percentage of the training rows.
class_counts = target.value_counts()
percentage_disaster = class_counts.div(len(train)).mul(100)
percentage_disaster

In [None]:
# `target` is a class label, so show per-class counts rather than a density
# estimate; this also avoids the deprecated sns.distplot.
sns.countplot(x=target);

Preprocessing raw text and getting it ready for machine learning

In [None]:
# Work on the raw tweet text of the combined (train + test) frame.
tweets = combi['text']

# Total number of \w+ tokens before any cleaning.
count_words = tweets.str.count(r'(\w+)')
print(count_words.sum())

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Keep the stop-word set under its own name: rebinding `stopwords` would shadow
# the imported corpus module and make this cell fail when it is run a second time.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Cleaning tweets. Start from the raw text column rather than from `tweets`
# itself so that re-running the cell does not clean already-cleaned text.
tweets = combi['text'].str.lower()

# stem the text: keep letters only, drop stop words, stem each remaining word
tweets = tweets.apply(lambda x: " ".join([
    stemmer.stem(i)
    for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words
]).lower())

# lemmatize the text (same letter-only / stop-word filtering as above)
tweets = tweets.apply(lambda x: " ".join([
    lemmatizer.lemmatize(i)
    for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in stop_words
]).lower())

# removing special characters and numbers
# (raw string: "\s" in a normal string literal is an invalid escape sequence)
tweets = tweets.apply(lambda x: re.sub(r"[^a-z]\s", "", x))

# remove hash tags; "#" is matched literally, not as a regular expression
tweets = tweets.str.replace("#", "", regex=False)

# keep only words that are 3 to 7 characters long
tweets = tweets.apply(lambda x: ' '.join([w for w in x.split() if 2 < len(w) < 8]))

# removing stopwords
tweets = tweets.apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

# Total number of \w+ tokens left after cleaning.
count_words = tweets.str.findall(r'(\w+)').str.len()
print(count_words.sum())

In [None]:
# Total number of \w+ tokens currently held in `tweets`.
count_words = tweets.str.count(r'(\w+)')
print(count_words.sum())

Remove frequently used words

In [None]:
# The 25 most common words across all tweets (a Series indexed by word).
all_words = ' '.join(tweets).lower().split()
most_freq_words = pd.Series(all_words).value_counts().head(25)

# Drop those words from every tweet; membership is tested against the Series
# index, which holds the words themselves.
frequent = most_freq_words.index
tweets = tweets.apply(lambda x: " ".join(word for word in x.split() if word not in frequent))
print(most_freq_words)

# Total number of \w+ tokens left after removing the frequent words.
count_words = tweets.str.count(r'(\w+)')
print(count_words.sum())

Remove rare words

In [None]:
from collections import Counter
from itertools import chain

# split each tweet into its list of words
v = tweets.str.split().tolist()
# count how often every word occurs across all tweets
c = Counter(chain.from_iterable(v))
# keep only words seen more than once; `tweets` becomes a plain list of strings
tweets = [' '.join(w for w in words if c[w] > 1) for words in v]

# total number of words left after removing the rare ones
total_word = sum(len(tweet.split()) for tweet in tweets)
print(total_word)

Create tokens in spacy

In [None]:
import spacy
import spacy.cli

# Load the vectors model, downloading it only when it is not installed yet.
# spacy.load raises OSError for a model it cannot find, so a re-run of this
# cell no longer triggers a fresh download every time.
try:
    nlp = spacy.load('en_vectors_web_lg')
except OSError:
    spacy.cli.download("en_vectors_web_lg")
    nlp = spacy.load('en_vectors_web_lg')

In [None]:
import spacy
import en_vectors_web_lg

# Load the model from its installed package and run the first cleaned tweet through it.
nlp = en_vectors_web_lg.load()
document = nlp(tweets[0])
print("Document : ",document)
print("Tokens : ")
# One token per line.
for tok in document:
    print(tok.text)

Token to vector

In [None]:
# Show each token of the first tweet next to the shape of its vector.
document = nlp(tweets[0])
print(document)
for tok in document:
    print(tok.text, tok.vector.shape)

Sentence to vector using pipe

In [None]:
# Stream every tweet through the pipeline and collect each document's `.vector`
# into one array with a row per tweet.
document = nlp.pipe(tweets)
doc_vectors = [doc.vector for doc in document]
tweets_vector = np.array(doc_vectors)
print(tweets_vector.shape)

Define X, y and X_test

In [None]:
# define X, y and X_test: the combined frame was built as train followed by
# test, so the first len(train) vectors are the training rows and the rest
# are the test rows
n_train = len(train)
y = target
X, X_test = tweets_vector[:n_train], tweets_vector[n_train:]

Split the training set into training and validation subsets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation, stratified on the label;
# the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    shuffle=True,
    random_state=42,
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

Define and train the model

In [None]:

from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split, then print its score on that same split.
model = LogisticRegression(C=10, max_iter=1000)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
print(train_score)


Predict on validation set

In [None]:
# Predict labels for the validation split and print the model's score on it.
y_pred = model.predict(X_val)
val_score = model.score(X_val, y_val)
print(val_score)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of actual (first argument) vs. predicted validation labels.
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)

In [None]:
# Side-by-side view of the actual and predicted labels for the validation split.
df=pd.DataFrame({'Actual': y_val, 'Predicted':y_pred})
df

Predict on test set

In [None]:
# Predict labels for the test-set vectors with the fitted model.
test_pred = model.predict(X_test)
test_pred

Submission

In [None]:
# Put the test predictions into the submission frame and write it to CSV
# without the index column.
submission = submission.assign(target=test_pred)
submission.to_csv('submission.csv', index=False)

# Read the file back to display what was actually saved.
submission = pd.read_csv("submission.csv")
submission