reference:

https://www.kaggle.com/vbmokin/nlp-eda-bag-of-words-tf-idf-glove-bert

## 0. Import libraries <a class="anchor" id="2"></a>

In [None]:
import pandas as pd
import numpy as np

import re
import string

import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import MinMaxScaler
from sklearn import decomposition
from sklearn.neighbors import KNeighborsClassifier

import seaborn as sns

## 1. Explore Data <a class="anchor" id="3"></a>

In [None]:
import pandas as pd
import numpy as np

# Read the competition's training and test CSV files into two frames.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# (rows, columns) of the raw training frame and of the raw test frame.
train_raw.shape, test_raw.shape

In [None]:
# Peek at the first three training rows.
train_raw.head(3)

In [None]:
# Column dtypes and non-null counts for the training frame.
train_raw.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test_raw.info()

What is the difference between target values 1 and 0?
Let's take a glimpse.

In [None]:
# Print up to ten tweets whose target is 0.
# The filter is evaluated once (not once per iteration) and head(10) simply
# yields fewer rows when fewer than ten match, instead of raising IndexError.
for tweet in train_raw.loc[train_raw['target'] == 0, 'text'].head(10):
    print(tweet)

In [None]:
# Print up to ten tweets whose target is 1.
# The filter is evaluated once (not once per iteration) and head(10) simply
# yields fewer rows when fewer than ten match, instead of raising IndexError.
for tweet in train_raw.loc[train_raw['target'] == 1, 'text'].head(10):
    print(tweet)

It looks like the texts are classified by particular keywords. For example, 'Target 1' texts contain words that have something to do with disasters, such as evacuation, earthquake or fire.

## 1.1. Clean Data

Before going any deeper, I'll clean the text first so that we only handle the relevant words.

In [None]:
# Stack the training and test frames row-wise so that every cleaning step
# below is applied to both at once. ignore_index=True renumbers the rows
# 0..N-1, which the later train/test slicing by position relies on.
df=pd.concat([train_raw,test_raw], sort=True, axis=0, ignore_index=True)
df.shape

In [None]:
import re

def remove_URL(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` link deleted.

    A link is taken to extend up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Strip links from every tweet; str(x) coerces any non-string cell first.
df['text']=df['text'].apply(lambda x : remove_URL(str(x)))

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each ``<`` pairs with the nearest ``>`` that
    follows it on the same line.
    """
    return re.sub(r'<.*?>', '', text)

# Strip HTML tags from every tweet; str(x) coerces any non-string cell first.
df['text']=df['text'].apply(lambda x : remove_html(str(x)))


In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists explicit emoji/symbol blocks only. A single
    catch-all range from U+24C2 to U+1F251 would also cover CJK ideographs,
    kana, Hangul and most other non-Latin scripts, silently deleting ordinary
    text, so that span is broken up into the blocks that actually hold symbols.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched characters.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U00002702-\U000027B0"  # dingbats
                           "\U000024C2"             # circled letter M
                           "\U00002600-\U000026FF"  # miscellaneous symbols
                           "\U00002B00-\U00002BFF"  # misc symbols and arrows
                           "\U0000FE00-\U0000FE0F"  # variation selectors
                           "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Strip emoji from every tweet; str(x) coerces any non-string cell first.
df['text']=df['text'].apply(lambda x : remove_emoji(str(x)))

In [None]:
import string

def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    punctuation = set(string.punctuation)
    return ''.join(char for char in text if char not in punctuation)

# Strip ASCII punctuation from every tweet.
df['text'] = df['text'].apply(remove_punct)

In [None]:
# Inspect the first seven rows after cleaning.
df[:7]

In [None]:
# The first len(train_raw) rows of df are the training tweets and the rest
# are the test tweets (.loc slicing is end-inclusive, hence the -1).
# .copy() detaches both frames from df so that columns added afterwards
# (such as train['length']) are written to an independent frame instead of
# a slice of df, which pandas may flag with SettingWithCopyWarning.
train = df.loc[:len(train_raw)-1, ['text', 'target']].copy()
test = df.loc[len(train_raw):, ['text', 'target']].copy()
train.shape, test.shape

Would there be any difference in text length?

In [None]:
# Number of whitespace-separated tokens in each tweet.
train['length'] = train['text'].str.split().str.len()
train.head(2)

In [None]:
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (20, 6)
bins = 200
# Overlay one word-count histogram per class; class 1 is drawn more opaque.
for label, opacity in ((0, 0.6), (1, 0.8)):
    lengths = train.loc[train['target'] == label, 'length']
    plt.hist(lengths, alpha=opacity, bins=bins, label=str(label))
plt.legend(loc='upper right')
plt.xlim(0, train['length'].max())
plt.show()

In general, the texts of target 1 are slightly shorter than those of target 0.

What are the top 10 words that appear most frequently in target 0 and target 1?

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# Collect the lower-cased, non-stop-word tokens of every training tweet,
# split by class: corpus_0 for target == 0, corpus_1 for everything else.
corpus_0 = []
corpus_1 = []
# Walk the two columns in lockstep instead of calling train.iloc[i][...]
# once per row and again for every word, which rebuilds a row Series each time.
for text, target in zip(train['text'], train['target']):
    bucket = corpus_0 if target == 0 else corpus_1
    for word in word_tokenize(text):
        word = word.lower()
        if (word not in stop_words) and (word not in string.punctuation):
            bucket.append(word)

print(len(corpus_0))
print(len(corpus_1))

In [None]:
from collections import Counter

# Word frequencies for target 0; most_common() with no argument returns
# every (word, count) pair ordered from most to least frequent.
counter_0=Counter(corpus_0)
most_0=counter_0.most_common()
# Show the ten most frequent words.
most_0[:10]

In [None]:
# Word frequencies for target 1; most_common() with no argument returns
# every (word, count) pair ordered from most to least frequent.
counter_1=Counter(corpus_1)
most_1=counter_1.most_common()
# Show the ten most frequent words.
most_1[:10]

Thanks to https://www.geeksforgeeks.org/generating-word-cloud-python/

In [None]:
# Flatten the target-0 corpus into one space-separated string for WordCloud.
# list() copies the corpus directly; no element-by-element index loop needed.
comment_words_list0 = list(corpus_0)
comment_words_0 = ' '.join(comment_words_list0)

In [None]:
from wordcloud import WordCloud

# Build an 800x800 word cloud on a white background from the target-0 words.
wordcloud_0 = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate(comment_words_0)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_0)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.title('Target 0', fontname = 'monospace')
plt.show()

In [None]:
# Flatten the target-1 corpus into one space-separated string for WordCloud.
# list() copies the corpus directly; no element-by-element index loop needed.
comment_words_list1 = list(corpus_1)
comment_words_1 = ' '.join(comment_words_list1)

# Build an 800x800 word cloud on a white background from the target-1 words.
wordcloud_1 = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate(comment_words_1)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_1)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.title('Target 1', fontname = 'monospace')
plt.show()

'Target 0' is more about the self and personal life, while 'Target 1' is more about outer-world issues or accidents.

## Vectorize features

With the cleaned version of the text,
we convert our features (each word in the text) to vectors.

What mapping method would be good?

1) CountVectorizer

CountVectorizer counts how many times each vocabulary word appears in the text (similar to one-hot encoding, but storing counts instead of 0/1).

In [None]:
 from sklearn.model_selection import train_test_split

train_X = train["text"].values
train_Y = train["target"].values
test_X = test["text"].values

random_state_split = 42
train_x, val_x, train_y, val_y = train_test_split(train_X, train_Y, 
                                                  test_size=0.2, shuffle=True,
                                                  random_state=random_state_split)
print(train_x.shape, train_y.shape, val_x.shape, val_y.shape)

In [None]:
# First three training tweets after the split.
train_x[:3]

Let's look at the feature matrix,
using a small sample.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorise only the first five training tweets so the resulting
# document-term matrix is small enough to display in full.
count_vectorizer_5 = CountVectorizer(min_df=0., max_df=1.0)
train_x_counts_5 = count_vectorizer_5.fit_transform(train_x[:5])
# toarray() densifies the sparse result; get_feature_names_out() replaces
# get_feature_names(), which is no longer available in recent scikit-learn.
sample = pd.DataFrame(train_x_counts_5.toarray(),
                      columns=count_vectorizer_5.get_feature_names_out())
sample

In [None]:
# Number of distinct tokens learned from the five-tweet sample.
len(count_vectorizer_5.vocabulary_)

After mapping all of train_x, the number of features ends up at 15470.

In [None]:
# Learn the vocabulary from the training tweets only, then reuse it to
# encode the validation tweets so both matrices share the same columns.
count_vectorizer = CountVectorizer(min_df=0., max_df=1.0)
train_x_counts = count_vectorizer.fit_transform(train_x)
val_x_counts = count_vectorizer.transform(val_x)

In [None]:
# Report the (documents, features) shape of both count matrices.
print(f'Input of Train feature vector matrix shape: {train_x_counts.shape}')
print(f'Input of Validation feature vector matrix shape: {val_x_counts.shape}')

The number of features (15470) is greater than the number of examples (6090).

If the matrix is too wide (far more columns than rows), learning is not easy.

We need to do dimensionality reduction like PCA or manifold.

Here, I'll directly select the important features using 'most_0' and 'most_1'.

If a keyword appears in the corpus fewer than three times, remove it.

A word that occurs three or more times is unlikely to be there by coincidence.

In [None]:
# word -> count mapping for target 1, still ordered from most to least frequent.
most_1_dict = dict(most_1)
print(len(most_1_dict))
# The last ten keys are therefore the rarest words.
list(most_1_dict.keys())[-10:]

The less frequent words do not seem very informative for the model to learn from.

In [None]:
# Keep only the words that occur at least three times in target-1 tweets.
most_1_dict = {word: count for word, count in most_1_dict.items() if count >= 3}
print(len(most_1_dict))
list(most_1_dict)[-20:]

After deleting those words, the remaining low-frequency words are more closely related to 'disaster'.

Our model might see these words useful.

In [None]:
# word -> count mapping for target 0, restricted to words seen at least three times.
most_0_dict = {word: count for word, count in most_0 if count >= 3}
print(len(most_0_dict))
list(most_0_dict)[-20:]

Feed the revised version of the data to the vectorizer.

In [None]:
# combine two dictionaries
most_dict = most_0_dict.copy()
most_dict.update(most_1_dict)
len(most_dict.keys())

In [None]:
# Rebuild each tweet from only the tokens found in the frequent-word
# vocabulary (most_dict), leaving the original `train` frame untouched.
# Membership is tested against the dict itself (a hash lookup); the previous
# code rebuilt list(most_dict.keys()) for every single word.
# NOTE(review): most_dict keys were lower-cased when the corpora were built,
# while this match is case-sensitive, so capitalised words are dropped as
# well — confirm that this is intended.
revised_train = train.copy()
revised_train['text'] = [
    ' '.join(word for word in str(text).split() if word in most_dict)
    for text in revised_train['text']
]

In [None]:
# Compare row 6879 before and after the vocabulary filter.
print('origianl       : ', train.loc[6879, 'text'])
print('revised version: ', revised_train.loc[6879, 'text'])

In [None]:
# Compare row 5781 before and after the vocabulary filter.
print('origianl       : ', train.loc[5781, 'text'])
print('revised version: ', revised_train.loc[5781, 'text'])

In [None]:
# Compare row 0 before and after the vocabulary filter.
print('origianl       : ', train.loc[0, 'text'])
print('revised version: ', revised_train.loc[0, 'text'])

After revising, only the relatively essential keywords are left.

In [None]:
train_X = revised_train["text"].values
train_Y = revised_train["target"].values

# Split with shuffle=True and the same seed and test size as the first
# train/validation split. With shuffle=False the random_state argument is
# ignored and the rows are cut in file order, which produces a different
# partition from the one that train_y / val_y belong to.
random_state_split = 42
train_x_revised, val_x_revised, train_y_revised, val_y_revised = train_test_split(
    train_X, train_Y,
    test_size=0.2, shuffle=True,
    random_state=random_state_split)

# Learn the vocabulary on the training part only and reuse it for validation.
count_vectorizer_new = CountVectorizer(min_df=0., max_df=1.0)
train_x_counts_revised = count_vectorizer_new.fit_transform(train_x_revised)
val_x_counts_revised = count_vectorizer_new.transform(val_x_revised)

train_x_counts_revised.shape, val_x_counts_revised.shape

In [None]:
# Number of distinct tokens left after the vocabulary filter.
len(count_vectorizer_new.vocabulary_)

In [None]:
# Dense document-term frame for the training tweets, one column per token.
# toarray() densifies the sparse matrix; get_feature_names_out() replaces
# get_feature_names(), which is no longer available in recent scikit-learn.
train_x_df = pd.DataFrame(train_x_counts_revised.toarray(),
                          columns=count_vectorizer_new.get_feature_names_out())
train_x_df

The feature count drops from 15470 to 2902.

In [None]:
# Dense document-term frame for the validation tweets, one column per token.
# toarray() densifies the sparse matrix; get_feature_names_out() replaces
# get_feature_names(), which is no longer available in recent scikit-learn.
val_x_df = pd.DataFrame(val_x_counts_revised.toarray(),
                        columns=count_vectorizer_new.get_feature_names_out())
val_x_df

We have a sparse matrix.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training matrix only, then rescale
# both the training and the validation matrix with those statistics.
scaler = MinMaxScaler().fit(train_x_df.values)
train_scaled, val_scaled = (
    scaler.transform(frame.values) for frame in (train_x_df, val_x_df)
)

In [None]:
from sklearn import decomposition
from sklearn.neighbors import KNeighborsClassifier

# Reduce dimensionality with truncated SVD (despite the *_pca variable names
# this is TruncatedSVD, not PCA). n_components is not passed, so the library
# default applies (NOTE(review): believed to be 2; confirm in the docs).
svd = decomposition.TruncatedSVD(algorithm='randomized', 
                                 n_iter=10, random_state=0, tol=0.0)
# Fit the projection on the training matrix and reuse it for validation.
train_pca = svd.fit_transform(train_scaled)
val_pca = svd.transform(val_scaled)

In [None]:
# k-nearest-neighbours on the SVD-reduced features.
# train_pca / val_pca are derived from train_x_revised / val_x_revised, so the
# labels must be the ones returned by that same split (train_y_revised /
# val_y_revised). Using train_y / val_y from the earlier split pairs rows
# with labels from a different partition.
neigh = KNeighborsClassifier(n_neighbors=290, algorithm='kd_tree')
neigh.fit(train_pca, train_y_revised)

# Mean accuracy on the held-out validation rows.
neigh.score(val_pca, val_y_revised)

The Accuracy is not good. 

Rather than doing further parameter tuning or searching for another model with this matrix,

in order to find a good classifier,

we need a better feature representation.

A pre-trained model, such as BERT, converts the given words to vectors quite well.

In [None]:
# Bert

## To be continued...

Thank you for reading :)

[Go to Top](#0)