In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Only run once

os.chdir('..')

basepath = os.getcwd()

data_train = pd.read_csv(basepath + '/data/train.csv')
data_test = pd.read_csv(basepath + '/data/test.csv')

print(f"Training set shape: {data_train.shape}\n")
print(f"Test set shape: {data_test.shape}\n")

Training set shape: (7613, 5)

Test set shape: (3263, 4)



In [3]:
display(data_train.head())
data_train.info()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
data_train.keyword.value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [12]:
data_key = data_train[data_train.keyword.isna() == False]

In [46]:
data_key.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7552 entries, 31 to 7582
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              7552 non-null   int64 
 1   keyword         7552 non-null   object
 2   location        5080 non-null   object
 3   text            7552 non-null   object
 4   target          7552 non-null   int64 
 5   text_clean      7552 non-null   object
 6   text_len        7552 non-null   int64 
 7   text_clean_len  7552 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 789.0+ KB


In [13]:
import nltk
import re
from unidecode import unidecode

In [16]:
regxcache={
# precompile regular expressions for faster preprocessing
"<.*?>" : re.compile("<.*?>"),
"&.{1,9};" : re.compile("&.{1,9};"),
"[^a-z]+" : re.compile("[^a-z]+"),
"\s[a-z]\s+" : re.compile("\s[a-z]\s+"),
"\s+" : re.compile("\s+"),
}

def preprocess(doc):
    """
    Normalize articles.

    1. lowercase
    2. only single whitespace
    3. unicode to ascii
    4. delete punctuation and numbers
    5. delete html tags <...>
    6. delete html character (like &nbsp;)
    7. delete stray single characters
    """
    # Lowercase
    doc = doc.lower()
    # expand negations
    doc = doc.replace("n't"," not")
    # Remove leading/trailing whitespace
    doc = doc.strip()
    # Convert Unicode into ASCII
    doc = unidecode(doc)
    # Remove HTML tags:
    doc = regxcache["<.*?>"].sub(" ", doc)
    # remove &nbsp; and other HTML codes up to 9 characters long
    doc = regxcache["&.{1,9};"].sub(" ", doc)
    # Remove punctuation and Numbers
    doc = regxcache["[^a-z]+"].sub( " ", doc)
    # Remove stray single characters
    doc = regxcache["\s[a-z]\s+"].sub( " ", doc)
    # Remove extra whitespace
    doc = regxcache["\s+"].sub( " ", doc)
    return doc

In [17]:
data_key["text_clean"] = data_key.text.apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_key["text_clean"] = data_key.text.apply(preprocess)


In [19]:
data_key.text_clean[33]

' africanbaze breaking news nigeria flag set ablaze in aba http co nndbgwyei'

In [21]:
data_key["text_len"] = data_key.text.apply(len)

data_key.groupby("keyword").text_len.sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_key["text_len"] = data_key.text.apply(len)


keyword
ablaze                 3409
accident               3503
aftershock             3441
airplane%20accident    3724
ambulance              3739
                       ... 
wounded                4109
wounds                 3411
wreck                  3276
wreckage               4768
wrecked                3009
Name: text_len, Length: 221, dtype: int64

In [23]:
data_key["text_clean_len"] = data_key.text_clean.apply(len)

data_key.groupby("keyword").text_clean_len.sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_key["text_clean_len"] = data_key.text_clean.apply(len)


keyword
ablaze                 3084
accident               3126
aftershock             3041
airplane%20accident    3385
ambulance              3414
                       ... 
wounded                3787
wounds                 3165
wreck                  3003
wreckage               4248
wrecked                2726
Name: text_clean_len, Length: 221, dtype: int64

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [32]:
tfidf = Pipeline([
    ("count", CountVectorizer(max_features = 5000, stop_words = "english")),
    ("tfidf", TfidfTransformer())
])

master = Pipeline([
    ("vect", tfidf),
    ("scaler", StandardScaler(with_mean = False)),
    ("clf", MultinomialNB())
])

In [51]:
X = data_key["text_clean"]
Y = data_key["target"].astype("bool")

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [57]:
X_train.shape

(6041,)

In [58]:
Y_train.shape

(6041,)

In [59]:
master.fit(X_train, Y_train)

Pipeline(steps=[('vect',
                 Pipeline(steps=[('count',
                                  CountVectorizer(max_features=5000,
                                                  stop_words='english')),
                                 ('tfidf', TfidfTransformer())])),
                ('scaler', StandardScaler(with_mean=False)),
                ('clf', MultinomialNB())])

In [60]:
Y_pred = master.predict(X_test)

In [61]:
from sklearn.metrics import accuracy_score, f1_score

In [62]:
print("Accuracy score:", accuracy_score(Y_test, Y_pred))
print("F1-score:", f1_score(Y_test, Y_pred))

Accuracy score: 0.7557908669755129
F1-score: 0.7176740627390973
