In [None]:
import os
import gc
import re
import string
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import STOPWORDS
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import defaultdict
from collections import Counter

from wordcloud import WordCloud

import warnings
warnings.filterwarnings(action='ignore')
#plt.style.use('fivethirtyeight')\

# 1. Load data

In [None]:
# Read the competition's train and test CSV files from the relative Kaggle input directory.
train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Report the (rows, columns) dimensions of both raw frames; shape is unpacked straight into format().
print('There are {} rows and {} columns in train'.format(*train_df.shape))
print('There are {} rows and {} columns in test'.format(*test_df.shape))

* Check missing values

In [None]:
# Missing-value count per column in train, largest first.
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Missing-value count per column in test, largest first.
test_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Drop every row that has at least one missing value. Both names are rebound,
# so the frames as originally loaded are no longer reachable after this cell.
train_df = train_df.dropna()
test_df = test_df.dropna() # Applied for symmetry with train; the author notes that test has no missing values

In [None]:
# Re-check train after dropna(): the per-column missing counts are listed again.
train_df.isnull().sum().sort_values(ascending=False)

* Number of Labels

In [None]:
# Count tweets per sentiment class and draw the counts as an annotated bar chart.
sentiment_counts = train_df['sentiment'].value_counts()
fig, ax = plt.subplots()
# x/y are passed by keyword: seaborn 0.12 made the data arguments keyword-only,
# and the keyword form also works on older releases.
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
# Series.iteritems() was removed in pandas 2.0; enumerating the values gives
# the same (bar position, count) pairs on every pandas version.
for position, count in enumerate(sentiment_counts.values):
    ax.text(position, count, "{:,}".format(count), ha='center', va='bottom', fontsize=10, color='black', rotation=0)
ax.set_ylabel('tweets')

* Number of characters

In [None]:
# Three side-by-side histograms of tweet length in characters, one per sentiment.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))
hist_panels = (
    (ax1, 'neutral', 'red', 'Neutral tweets'),
    (ax2, 'positive', 'blue', 'Positive tweets'),
    (ax3, 'negative', 'green', 'Negative tweets'),
)
for panel, sentiment, colour, panel_title in hist_panels:
    # Character count of every tweet that carries this sentiment label.
    train_len = train_df.loc[train_df['sentiment'] == sentiment, 'text'].str.len()
    panel.hist(train_len, color=colour)
    panel.set_title(panel_title)
fig.suptitle('Characters in tweets')
plt.show()

In [None]:
# Overlay the kernel density estimate of tweet length (characters) for each sentiment.
fig, ax = plt.subplots(figsize=(10, 5))
for sentiment in ('neutral', 'positive', 'negative'):
    char_lengths = train_df.loc[train_df['sentiment'] == sentiment, 'text'].str.len()
    sns.kdeplot(char_lengths, ax=ax)
ax.set_title("Distribution of Tweets")
# Legend entries are matched to the curves by drawing order (neutral, positive, negative).
ax.legend(labels=["Neutral","Positive","Negative"])

* Number of words

In [None]:
# Three side-by-side histograms of tweet length in whitespace-separated words, one per sentiment.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))
hist_panels = (
    (ax1, 'neutral', 'red', 'Neutral tweets'),
    (ax2, 'positive', 'blue', 'Positive tweets'),
    (ax3, 'negative', 'green', 'Negative tweets'),
)
for panel, sentiment, colour, panel_title in hist_panels:
    # Split each tweet on whitespace and count the resulting tokens.
    train_len = train_df.loc[train_df['sentiment'] == sentiment, 'text'].str.split().map(len)
    panel.hist(train_len, color=colour)
    panel.set_title(panel_title)
fig.suptitle('Words in tweets')
plt.show()

In [None]:
# Overlay the kernel density estimate of tweet length (words) for each sentiment.
fig, ax = plt.subplots(figsize=(10, 5))
for sentiment in ('neutral', 'positive', 'negative'):
    word_lengths = train_df.loc[train_df['sentiment'] == sentiment, 'text'].str.split().map(len)
    sns.kdeplot(word_lengths, ax=ax)
ax.set_title("Distribution of Tweets")
# Legend entries are matched to the curves by drawing order (neutral, positive, negative).
ax.legend(labels=["Neutral","Positive","Negative"])

# 2. Meta data analysis
The dataset needs to be cleaned. To be able to judge the effect of the cleaning, we first analyse the raw data.<br>
[Reference: NLP with Disaster Tweets](https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert)

* Meta data

In [None]:
def _add_meta_features(frame):
    """Add the nine per-tweet meta-feature columns to ``frame`` in place.

    Each feature is derived from the 'text' column only; values are passed
    through ``str`` first, exactly as the per-column lambdas did before.
    """
    text = frame['text'].apply(str)

    # word_count: whitespace-separated tokens
    frame['word_count'] = text.apply(lambda t: len(t.split()))
    # unique_word_count: distinct tokens (case-sensitive)
    frame['unique_word_count'] = text.apply(lambda t: len(set(t.split())))
    # stop_word_count: lower-cased tokens found in wordcloud's STOPWORDS
    frame['stop_word_count'] = text.apply(lambda t: sum(1 for w in t.lower().split() if w in STOPWORDS))
    # url_count: tokens containing 'http' (this also covers 'https')
    frame['url_count'] = text.apply(lambda t: sum(1 for w in t.lower().split() if 'http' in w))
    # mean_word_length: NaN when the tweet has no tokens (mean of an empty list)
    frame['mean_word_length'] = text.apply(lambda t: np.mean([len(w) for w in t.split()]))
    # char_count
    frame['char_count'] = text.apply(len)
    # punctuation_count: characters listed in string.punctuation
    frame['punctuation_count'] = text.apply(lambda t: sum(1 for c in t if c in string.punctuation))
    # hashtag_count
    frame['hashtag_count'] = text.apply(lambda t: t.count('#'))
    # mention_count
    frame['mention_count'] = text.apply(lambda t: t.count('@'))


# Same feature set on both splits so their distributions can be compared.
for frame in (train_df, test_df):
    _add_meta_features(frame)

In [None]:
METAFEATURES = ['word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length',
                'char_count', 'punctuation_count', 'hashtag_count', 'mention_count']

# Boolean row masks over train_df, one per sentiment class.
neu = train_df['sentiment'] == 'neutral'
pos = train_df['sentiment'] == 'positive'
neg = train_df['sentiment'] == 'negative'

# One row of axes per meta feature: left = per-class comparison, right = train vs. test.
fig, axes = plt.subplots(ncols=2, nrows=len(METAFEATURES), figsize=(30, 50))

class_series = ((neu, 'Neutral', 'green'), (pos, 'Positive', 'blue'), (neg, 'Negative', 'red'))
split_series = ((train_df, 'Training', 'blue'), (test_df, 'Test', 'yellow'))

for axes_row, feature in zip(axes, METAFEATURES):
    left_ax, right_ax = axes_row

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here to preserve the output.
    for mask, series_label, colour in class_series:
        sns.distplot(train_df.loc[mask, feature], label=series_label, ax=left_ax, color=colour)

    for frame, series_label, colour in split_series:
        sns.distplot(frame[feature], label=series_label, ax=right_ax, color=colour)

    for panel in axes_row:
        panel.set_xlabel('')
        panel.tick_params(axis='x', labelsize=10)
        panel.tick_params(axis='y', labelsize=10)
        panel.legend()

    left_ax.set_title(f'{feature} Target Distribution in Training Set', fontsize=13)
    right_ax.set_title(f'{feature} Training & Test Set Distribution', fontsize=13)

plt.show()

* N-gram

In [None]:
def generate_ngrams(text, n_gram=1):
    """Return the n-grams of ``text`` as space-joined strings.

    The text is lower-cased and split on single spaces; empty pieces (from
    repeated spaces) and words in wordcloud's STOPWORDS are discarded before
    the n-grams are formed.

    Parameters
    ----------
    text : str
        Raw tweet text.
    n_gram : int, default 1
        Number of consecutive words per n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when fewer than ``n_gram``
        words remain.
    """
    words = []
    for piece in text.lower().split(' '):
        if piece == '' or piece in STOPWORDS:
            continue
        words.append(piece)
    # zip over the word list shifted by 0..n_gram-1 positions yields each window of n_gram words.
    shifted = [words[offset:] for offset in range(n_gram)]
    return [' '.join(window) for window in zip(*shifted)]

# Number of top-ranked n-grams drawn per panel by ngram_plot.
N = 30

In [None]:
def ngram_cal(df, n_gram):
    """Count n-gram frequencies separately for each sentiment class of ``df``.

    The rows of every class are selected from ``df['sentiment']`` itself, so
    the result depends only on the frame that is passed in and not on
    notebook-level mask variables computed for some other frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column and a 'sentiment' column whose values
        include 'neutral', 'positive' and 'negative'.
    n_gram : int
        Size of the n-grams, forwarded to ``generate_ngrams``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(neutral, positive, negative)``. Each frame has column ``0`` (the
        n-gram) and column ``1`` (its count), ordered by count descending.
        A class without any n-grams yields an empty frame that still has
        both columns.
    """
    ranked_frames = []
    for sentiment in ('neutral', 'positive', 'negative'):
        counts = defaultdict(int)
        for tweet in df.loc[df['sentiment'] == sentiment, 'text']:
            for gram in generate_ngrams(tweet, n_gram=n_gram):
                counts[gram] += 1
        # Ascending stable sort followed by a reversal keeps the original tie order.
        ranked = sorted(counts.items(), key=lambda item: item[1])[::-1]
        # Explicit column labels keep columns 0 and 1 present even when `ranked` is empty.
        ranked_frames.append(pd.DataFrame(ranked, columns=[0, 1]))

    return tuple(ranked_frames)

In [None]:
def ngram_plot(df_gram1, df_gram2, df_gram3, U):
    """Draw the top-N n-grams of three ranked frames as horizontal bar charts.

    Parameters
    ----------
    df_gram1, df_gram2, df_gram3 : pandas.DataFrame
        Ranked n-gram frames for neutral, positive and negative tweets;
        column ``0`` holds the n-gram and column ``1`` its count.
    U : int
        The n-gram size, used only in the panel titles.

    The number of bars per panel comes from the notebook-level constant ``N``.
    """
    fig, axes = plt.subplots(ncols=3, figsize=(30, 10))
    plt.tight_layout()

    panels = (
        (df_gram1, 'green', f'Top {N} most common {U}-grams in Neutral Tweets'),
        (df_gram2, 'blue', f'Top {N} most common {U}-grams in Positive Tweets'),
        (df_gram3, 'red', f'Top {N} most common {U}-grams in Negative Tweets'),
    )
    for panel, (ranked, colour, panel_title) in zip(axes, panels):
        sns.barplot(y=ranked[0].values[:N], x=ranked[1].values[:N], ax=panel, color=colour)
        panel.spines['right'].set_visible(False)
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.tick_params(axis='x', labelsize=10)
        panel.tick_params(axis='y', labelsize=10)
        panel.set_title(panel_title, fontsize=15)

    plt.show()

N-grams in Train dataset

In [None]:
# Uni-, bi- and tri-gram rankings of the raw training tweets.
for gram_size in (1, 2, 3):
    a, b, c = ngram_cal(train_df, gram_size)
    ngram_plot(a, b, c, gram_size)

N-grams in Test dataset

In [None]:
# Rebind the notebook-level sentiment masks so that they describe test_df.
neu = test_df['sentiment'] == 'neutral'
pos = test_df['sentiment'] == 'positive'
neg = test_df['sentiment'] == 'negative'

# Uni-, bi- and tri-gram rankings of the raw test tweets.
for gram_size in (1, 2, 3):
    a, b, c = ngram_cal(test_df, gram_size)
    ngram_plot(a, b, c, gram_size)

# 3. Data cleaning

* Combine Dataset <br>
Before cleaning, the train and test datasets are combined so that every cleaning step is applied to both at once.<br>
The label column of the train dataset is separated into <code>label</code>.

In [None]:
# Preview train before the label and helper columns are removed.
train_df.head()

In [None]:
# Seperate Label
label = train_df['selected_text'].values
train_df = train_df.drop(['selected_text'], axis=1)

# Drop useless data from ngrams
#METAFEATURES = ['word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length',
#                'char_count', 'punctuation_count', 'hashtag_count', 'mention_count']
train_df = train_df.drop(METAFEATURES, axis=1)
test_df = test_df.drop(METAFEATURES, axis=1)

# Drop the irrevelant parameter
train_df = train_df.drop(['textID'], axis=1)
test_df = test_df.drop(['textID'], axis=1)

In [None]:
# Preview train after the drops above.
train_df.head()

In [None]:
# Combine dataset
df = pd.concat([train_df, test_df], axis=0)
print('There are {} rows and {} columns in train'.format(train_df.shape[0],train_df.shape[1]))
print('There are {} rows and {} columns in test'.format(test_df.shape[0],test_df.shape[1]))
print('There are {} rows and {} columns in total'.format(df.shape[0],df.shape[1]))

# Save the size of train and test dataset
train_size = train_df.shape[0]
test_size = test_df.shape[0]

df.head(5)

* Remove Url / Html / Emoji / Punct

In [None]:
def remove_URL(text):
    """Return ``text`` with http://, https:// and www. links deleted.

    A link extends up to the next whitespace character; surrounding
    whitespace is left in place.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy and does not cross line breaks, so a '<' that has
    no closing '>' on the same line is kept.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

# Compiled once instead of on every call. Each range below is a dedicated
# symbol block; a single range from U+24C2 to U+1F251 would also span CJK,
# Hangul and most other non-Latin scripts and delete ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with characters from the listed emoji/symbol blocks deleted.

    Only the code-point ranges enumerated in ``_EMOJI_PATTERN`` are removed;
    letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = [char for char in text if char not in string.punctuation]
    return ''.join(kept_chars)

In [None]:
# Run the cleaners over the text column one after another. Order matters:
# links and <...> spans must go before punctuation is stripped, because the
# patterns rely on characters such as ':', '/', '.', '<' and '>'.
cleaning_steps = (remove_URL, remove_html, remove_emoji, remove_punct)
for clean_step in cleaning_steps:
    df['text'] = df['text'].apply(clean_step)

* Remove Tags

In [None]:
def remove_tags(text):
    """Return ``text`` with @mentions and #hashtags deleted.

    A tag is an '@' or '#' followed by at least one non-whitespace character;
    the marker and the whole run up to the next whitespace are removed.
    '|' is not a tag marker: inside a character class it is a literal
    character, not an alternation, so it must not be listed there.
    """
    return re.sub(r'[@#]\S+', '', text)

In [None]:
# NOTE(review): remove_punct has already been applied to this column and
# string.punctuation contains both '@' and '#', so no tag marker is left for
# this pass to match. Tag removal would have to run before remove_punct to
# have any effect.
df['text'] = df['text'].apply(remove_tags)

* Remove Stopwords

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
stemmer  = SnowballStemmer('english')
# A set gives constant-time membership tests; the list returned by
# stopwords.words() would be scanned once per token.
stopword = set(stopwords.words('english'))

def Remove_StopAndStem(text):
    """Drop English stop words from ``text`` and stem the remaining words.

    The text is split on whitespace. A word is dropped when its lower-cased
    form is in the NLTK English stop-word list, so capitalised stop words are
    removed as well. The surviving words are stemmed with the Snowball
    stemmer and joined with single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in text.split() if word.lower() not in stopword)

In [None]:
# Replace every tweet by its stop-word-free, stemmed form (overwrites the text column).
df['text'] = df['text'].apply(Remove_StopAndStem)

In [None]:
# Preview the combined frame after all cleaning steps.
df.head()

* Word Cloud

In [None]:
# Token frequencies over the cleaned text of the combined frame.
# Counter replaces the try / bare-except increment, which would also have
# hidden any unrelated error, and the column is addressed by name instead of
# by its position in the row tuple.
dict_of_words = Counter()
for tweet in df['text']:
    dict_of_words.update(tweet.split())

#Initializing  WordCloud
wordcloud = WordCloud(background_color = 'black', width=1000, height=500).generate_from_frequencies(dict_of_words)
fig = plt.figure(figsize=(10,5))
plt.imshow(wordcloud)
plt.tight_layout(pad=1)
plt.show()

# 4. Cleaning result

In [None]:
# Split the cleaned frame back by position: the first train_size rows came from train.
re_train = df.iloc[:train_size]
re_test = df.iloc[train_size:]

print('There are {} rows and {} columns in train'.format(*re_train.shape))
print('There are {} rows and {} columns in test'.format(*re_test.shape))
print('Original train dataset : {} \nOriginal test dataset: {}'.format(train_size, test_size))

* Train dataset

In [None]:
# Rebind the notebook-level sentiment masks so that they describe re_train.
neu = re_train['sentiment'] == 'neutral'
pos = re_train['sentiment'] == 'positive'
neg = re_train['sentiment'] == 'negative'

# Uni-, bi- and tri-gram rankings of the cleaned training tweets.
for gram_size in (1, 2, 3):
    a, b, c = ngram_cal(re_train, gram_size)
    ngram_plot(a, b, c, gram_size)

* Test dataset

In [None]:
# Rebind the notebook-level sentiment masks so that they describe re_test.
neu = re_test['sentiment'] == 'neutral'
pos = re_test['sentiment'] == 'positive'
neg = re_test['sentiment'] == 'negative'

# Uni-, bi- and tri-gram rankings of the cleaned test tweets.
for gram_size in (1, 2, 3):
    a, b, c = ngram_cal(re_test, gram_size)
    ngram_plot(a, b, c, gram_size)

* Split Train and Validation dataset

In [None]:
# Hold out 20% of the cleaned training rows, together with their labels, for
# validation; random_state is fixed so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(re_train, label, test_size=0.2, random_state=42)

In [None]:
# TF-IDF over whitespace tokens, using 1- to 4-grams. Terms that occur in
# fewer than 0.05% of the training documents are dropped and the vocabulary
# is capped at 100,000 features.
tfidf_options = dict(
    min_df=0.0005,
    max_features=100000,
    tokenizer=str.split,
    ngram_range=(1, 4),
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned on the training split only and then reused for validation.
# NOTE(review): X_train / X_valid are rebound from DataFrames to sparse
# matrices here, so this cell cannot be re-run without re-running the split above.
train_text, valid_text = X_train['text'], X_valid['text']
X_train = vectorizer.fit_transform(train_text)
X_valid = vectorizer.transform(valid_text)

In [None]:
# Read the dimensions from .shape: calling .toarray() only to take len()
# would materialise the whole sparse matrix as a dense array each time.
print("Training Points: ", X_train.shape[0],"| Training Features:" , X_train.shape[1])
print("Testing Points: ", X_valid.shape[0],"| Testing Features:" , X_valid.shape[1])
print()
print("Training Points: ", len(y_train))
print("Testing Points: ", len(y_valid))