In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Load the training tweets and the test tweets of the competition.
path = "/kaggle/input/nlp-getting-started/train.csv"
dtrain = pd.read_csv(path)
path1 = "/kaggle/input/nlp-getting-started/test.csv"
dtest = pd.read_csv(path1)

# Stack both sets so every preprocessing step is applied to them identically.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of dtest would repeat those of dtrain, which makes any
# label-based lookup or index-aligned operation on df ambiguous.
df = pd.concat((dtrain, dtest), ignore_index=True)

**Data Exploration**

In [None]:
# Display the combined frame built from dtrain and dtest.
df

In [None]:
# Share of positive (1) tweets in the training data, printed as a fraction
# between 0 and 1; the share of negative (0) tweets is 1 minus this value.
print(dtrain['target'].mean())

In [None]:
# How often each keyword occurs in the combined frame.
df['keyword'].value_counts()

In [None]:
# Fraction of rows with a missing value, first for 'keyword', then for 'location'.
for column in ('keyword', 'location'):
    is_missing = df[column].isna()
    print(is_missing.sum() / len(is_missing))

In [None]:
# Mean target value per keyword.
# NOTE(review): rows coming from dtest presumably have no target and are ignored by mean() - confirm.
df.groupby(['keyword'])['target'].mean()

In [None]:
# How often each raw location string occurs in the combined frame.
df['location'].value_counts()

**Feature extraction**

1.) convert to lowercase

In [None]:
# Overwrite the tweet text with its lower-cased version.
df['text'] = (df['text']).str.lower()

2.) Handling location data

In [None]:
# Missing locations and keywords become the explicit category 'None'.
df = df.fillna({'location': 'None', 'keyword': 'None'})

# Every distinct location string, in order of first appearance.
loc = df['location'].unique()

Make new list for locations which appear at least 10 times, all other rows will get location 'Unknown'

In [None]:
# Minimum number of rows a location needs in order to keep its own category.
MIN_LOCATION_COUNT = 10

# Count every location once up front. Calling value_counts() inside the loop
# would recount the whole column for each unique location.
location_counts = df['location'].value_counts()

# Keep the order of `loc` (order of first appearance) for the resulting list.
listt = [name for name in loc if location_counts[name] >= MIN_LOCATION_COUNT]

print(listt)

In [None]:
# Locations that are not in the frequent list are merged into 'Unknown'.
is_frequent = df['location'].isin(listt)
df['location'] = df['location'].where(is_frequent, 'Unknown')

# Show the remaining categories and how many there are.
remaining_locations = df['location'].unique()
print(remaining_locations)
print(len(remaining_locations))

Closer inspection of the locations kept shows that some locations appear multiple times and can be grouped like 'US' with 'USA' or 'New York' and 'New York, NY'

In [None]:
# Different spellings of the same place are mapped onto one canonical name.
# No canonical name is itself an alias, so a single lookup per row is enough.
location_aliases = {
    'USA': 'US',
    'United States': 'US',
    'United Kingdom': 'UK',
    'London, UK': 'London',
    'London, England': 'London',
    'San Francisco, CA': 'San Francisco',
    'Washington, D.C.': 'Washington, DC',
    'Los Angeles, CA': 'Los Angeles',
    'New York, NY': 'New York',
    'New York City': 'New York',
    'NYC': 'New York',
    'California, USA': 'California',
    'Chicago, IL': 'Chicago',
    'Denver, CO': 'Denver',
    'Denver, Colorado': 'Denver',
    'Seattle, WA': 'Seattle',
}
df['location'] = df['location'].map(lambda name: location_aliases.get(name, name))

In [None]:
# Mean target value for each of the remaining location categories.
print(df.groupby(['location'])['target'].mean())

Based on the mean of the target value per location in the training set (the test rows have no target), the location data seems to be useful.

3.) Since both Location and Keyword are categorical features they need to be OneHotEncoded to be useful. This can be done using pd.get_dummies

In [None]:
# One-hot encode the two categorical columns; drop_first removes one category
# per column so the remaining indicators are not perfectly collinear.
# dtype is pinned to an unsigned integer: recent pandas versions default to
# bool indicators, and a frame mixing bool with numeric columns converts to an
# object array, which the network cannot consume.
dummies = pd.get_dummies(df['location'], drop_first=True, dtype=np.uint8)
dummies1 = pd.get_dummies(df['keyword'], drop_first=True, dtype=np.uint8)

Next we define helper functions to clean a text (remove URLs, hashtag signs, mentions and punctuation) and to count its words and characters.

In [None]:
def remove_punc(text):
    """Return ``text`` without punctuation.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is deleted.
    """
    return re.sub(r'[^\w\s]', '', text)

def remove_mentions(text):
    """Return ``text`` with every ``@mention`` removed.

    A mention is an ``@`` followed by one or more non-whitespace characters,
    so punctuation glued to the handle (e.g. ``@user,``) is removed with it.
    The surrounding whitespace is left untouched.
    """
    # Raw string: "\S" in a normal string literal is an invalid escape sequence.
    return re.sub(r"@\S+", "", text)

def remove_url(text, strip_rest_of_line=True):
    """Return ``text`` with http/https URLs removed.

    Parameters
    ----------
    text : str
        The text to clean.
    strip_rest_of_line : bool, default True
        If True (the original behaviour), everything from the start of the
        first URL on a line up to the end of that line is deleted, together
        with the line break(s) that follow. Words written after a URL on the
        same line are therefore lost as well.
        If False, only the URL itself (the run of non-whitespace characters
        starting at ``http://`` or ``https://``) is deleted.

    Returns
    -------
    str
        The text without URLs.
    """
    # Raw strings: "\/" in a normal string literal is an invalid escape sequence.
    if strip_rest_of_line:
        return re.sub(r"https?://.*[\r\n]*", "", text)
    return re.sub(r"https?://\S+", "", text)

def remove_hashtag(text):
    """Return ``text`` with every ``#`` sign deleted; the tagged word itself is kept."""
    return text.replace("#", "")

def clean_text(text):
    """Run the full cleaning pipeline on one text and return the result.

    The steps are applied in this order: URLs, hashtag signs, @mentions,
    punctuation. Punctuation is removed last because the earlier steps rely on
    the characters ':', '/', '#' and '@' still being present.
    """
    cleaning_steps = (remove_url, remove_hashtag, remove_mentions, remove_punc)
    for step in cleaning_steps:
        text = step(text)
    return text

# A token counts as a word only if it contains at least one ASCII letter or
# digit. Compiled once here instead of on every call.
_WORD_TOKEN = re.compile('.*[A-Za-z0-9].*')


def _word_tokens(col):
    """Tokenize ``col`` and return only the tokens that contain a letter or digit."""
    tokens = word_tokenize(col, language='english')
    return [token for token in tokens if _WORD_TOKEN.match(token)]


def leng(col):
    """Return the number of word tokens in the text ``col``.

    Tokens made up purely of punctuation or other symbols are not counted.
    """
    return len(_word_tokens(col))


def char(col):
    """Return the total number of characters in the word tokens of ``col``.

    Whitespace and tokens without any letter or digit do not contribute.
    """
    return sum(len(token) for token in _word_tokens(col))

In [None]:
# Quick check of clean_text on a sample containing hashtags, punctuation, a mention and a URL.
clean_text('#flood #disaster Hello World!!.? @ElonMusk, https://kaggle.com')

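Using these functions we can get the word and character counts of a tweet. We store them in new columns and compute the training-set mean and standard deviation needed to standardize them (the standardization step itself is currently commented out).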

In [None]:
# Inspect the frame again before the count features are added.
df

In [None]:
# Word and character counts per tweet, for the combined frame and for the
# training frame (the latter is only used for the statistics below).
df['wordCount'] = df['text'].apply(leng)
dtrain['wordCount'] = dtrain['text'].apply(leng)

df['charCount'] = df['text'].apply(char)
dtrain['charCount'] = dtrain['text'].apply(char)

# Average word length. A tweet without any countable token gives 0/0 = NaN,
# and a single NaN in the feature matrix turns the network loss into NaN, so
# undefined ratios are replaced by 0.
df['char/wrd'] = (
    (df['charCount'] / df['wordCount'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

# Training-set mean and standard deviation of both counts. They are only
# needed by the standardization lines at the end of this cell.
wrdcm = dtrain['wordCount'].mean()
wrdcs = dtrain['wordCount'].std()

chm = dtrain['charCount'].mean()
chsd = dtrain['charCount'].std()

# For each feature print the mean over target == 1 rows, then over target == 0 rows.
for column in ('char/wrd', 'charCount', 'wordCount'):
    print(df.loc[df['target'] == 1, column].mean())
    print(df.loc[df['target'] == 0, column].mean())

# Standardization is deliberately left disabled; the counts enter the model unscaled.
#df['wordCount'] = (df['wordCount']-wrdcm)/wrdcs #standardized
#df['charCount'] = (df['charCount']-chm)/chsd

The average word count is very similar for disaster and non-disaster tweets, but disaster tweets have more characters per word on average.

Next apply lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; the lemmatizer() helper defined later uses it too.
wl = WordNetLemmatizer()

# Single-word lemmatization examples (no part-of-speech hint is passed here).
list1 = [
    'kites', 'babies', 'dogs', 'flying', 'smiling',
    'driving', 'died', 'tried', 'feet',
]
for word in list1:
    print(f"{word} ---> {wl.lemmatize(word)}")

# The lemmatizer is handed one whole string here instead of a single word.
print(wl.lemmatize('hands, birds cars'))

In [None]:
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    """Map a part-of-speech tag string to the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def lemmatizer(string):
    """Lemmatize every token of ``string`` and return them joined by single spaces.

    The text is tokenized and POS-tagged first, so that each token is
    lemmatized with the WordNet part of speech derived from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = (
        wl.lemmatize(token, get_wordnet_pos(pos_tag))
        for token, pos_tag in tagged_tokens
    )
    return " ".join(lemmas)

In [None]:
# Clean every tweet first, then lemmatize the cleaned text.
df['text'] = df['text'].apply(clean_text).apply(lemmatizer)

Next we can add the dummy columns.

In [None]:
# Append the location indicators, then the keyword indicators, as extra columns.
df = pd.concat((df, dummies, dummies1), axis=1)

df.head()

For later use we define the feature columns as all columns other than 'id', 'keyword', 'target', 'text', 'location'

In [None]:
# Identifier, label, raw text and the un-encoded categorical columns are not model inputs.
non_feature_columns = ['id', 'keyword', 'target', 'text', 'location']
features = df.columns.drop(non_feature_columns)

4.) Text Vectorization: This will be done using TfidfVectorizer 

In [None]:
# Custom stop-word list: only these three articles are ignored.
stpwrdlist = ['the', 'a', 'an']

vectorizer = TfidfVectorizer(
    stop_words=stpwrdlist,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=0.001,
    max_df=0.9,
)

Split df into train and test set:

In [None]:
# Rows with a target are training rows; rows without one belong to the test set.
has_target = df['target'].notna()
train = df[has_target]
test = df[~has_target]

# Separate the label from the training features.
y = train['target']
train = train.drop(columns='target')

Fit vectorizer and transform test set

In [None]:
# Fit the TF-IDF vectorizer on the training text and transform it in one step.
# NOTE(review): the fit uses every row of `train`, including the rows that
# train_test_split later holds out for validation, so the validation score is
# slightly optimistic.
X = vectorizer.fit_transform(train['text'])

# The test text is only transformed, using what was fitted on the training text.
X_test = vectorizer.transform(test['text'])

5.) Concatenate features from vectorizer with previous features

In [None]:
# Put the dense TF-IDF matrix and the hand-made feature columns side by side.
# The feature columns are cast to float explicitly: a frame that mixes bool
# indicator columns with numeric columns would otherwise convert to an object
# array, and the concatenated matrix could not be fed to the network.
X = np.concatenate(
    (X.toarray(), train[features].to_numpy(dtype=float)), axis=1
)

X_test = np.concatenate(
    (X_test.toarray(), test[features].to_numpy(dtype=float)), axis=1
)

Split train data into train and validation data

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.2, random_state=10)

**Build Neural Network**

In [None]:
from keras.models import Sequential
from keras.layers import Dense
#from keras.callbacks import EarlyStopping

In [None]:
# Shape of the full training matrix; its second entry is the number of input features of the network.
X.shape

In [None]:
# Size the input layer from the feature matrix instead of a hard-coded number,
# so the network keeps working when the vocabulary or the dummy columns change.
n_features = X.shape[1]

model = Sequential()
model.add(Dense(5, input_shape=(n_features,)))  # no activation given: a linear layer
model.add(Dense(32, activation='sigmoid'))
model.add(Dense(32, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))  # single output in (0, 1)

In [None]:
# Print the layers of the network with their output shapes and parameter counts.
model.summary()

In [None]:
# Adam with a learning rate of 0.001 (an SGD optimizer was tried as an alternative).
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# Binary classification: cross-entropy loss, accuracy reported during training.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 7 epochs with batches of 50 rows.
# The object returned by fit() is kept because its .history is plotted next.
h_callback = model.fit(X_t, y_t, epochs=7, batch_size=50)

In [None]:
# Accuracy per epoch. The validation curve is drawn only when the history
# contains it, i.e. when fit() was given validation data.
fig, ax = plt.subplots()
ax.plot(h_callback.history['accuracy'], label='train')
if 'val_accuracy' in h_callback.history:
    ax.plot(h_callback.history['val_accuracy'], label='validation')
ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Loss and accuracy of the trained model on the held-out validation split.
model.evaluate(X_v, y_v)

The model seems to perform reasonably well. To get a final prediction we will fit it again on the whole training data (training and validation rows together).

In [None]:
# Shape of the full training matrix that the final model is fitted on.
X.shape

In [None]:
# Final model: same architecture as `model`, to be fitted on all labelled rows.
# The input size is read from X instead of being hard-coded.
model1 = Sequential()
model1.add(Dense(5, input_shape=(X.shape[1],)))  # no activation given: a linear layer
model1.add(Dense(32, activation='sigmoid'))
model1.add(Dense(32, activation='sigmoid'))
model1.add(Dense(1, activation='sigmoid'))

# A fresh optimizer is created for this model. The instance `opt` has already
# been used to train `model`; an optimizer keeps state tied to the variables it
# trained, and reusing it for a different model can raise an error or carry
# over stale state, depending on the Keras version.
model1.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the final model on every row of X, with the same epochs and batch size as before.
# NOTE(review): labels are read from dtrain; this assumes the rows of X are in
# the same order as dtrain - confirm.
model1.fit(X, dtrain['target'], epochs=7, batch_size=50)

In [None]:
# Predict on the training rows and round the sigmoid outputs to hard labels.
pred1 = np.round(model1.predict(X))

In [None]:
# Display the rounded training-set predictions.
pred1

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

# F1 score and confusion matrix of the final model on the training rows.
true_labels = dtrain['target']
print(f1_score(true_labels, pred1))
print(confusion_matrix(true_labels, pred1))

On the training set the model reaches an accuracy of about 86% and an F1 score of about 0.83. However, it overfits slightly, as seen from the 0.79 accuracy on the validation set, which is closer to what we should expect on the test set.

**Predict and submit**

In [None]:
# Predict on the test rows, round the sigmoid outputs and store them as integers.
predictions = np.round(model1.predict(X_test)).astype(int)

In [None]:
# Build the submission table (one row per test id) and write it without the index column.
test_ids = dtest['id'].tolist()
sub = pd.DataFrame({'id': test_ids, 'target': predictions.ravel()})
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table that was written to submission.csv.
sub