This notebook was made as a light practice exercise for a **computational linguistics class**. Feel free to use it!
I'd also appreciate any comments, feedback, or questions :D

In [None]:
# Data preparation
import pandas as pd

# Read the raw SMS spam CSV (latin1-encoded) and peek at its last two rows
csv_path = '../input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(csv_path, encoding='latin1')
df.tail(2)

In [None]:
# Keep only the label and message columns, under descriptive names.
# Renaming first and selecting afterwards builds a new frame instead of
# editing a column subset in place, and keeps the cell safe to re-run:
# `rename` ignores labels that no longer exist, so a second run is a no-op.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})[['target', 'text']]
df

In [None]:
# Separate the message text (features) from the labels.
# Explicit copies are taken because `y` is modified later in the notebook;
# assigning into a bare slice of `df` would be chained assignment.
X = df[['text']].copy()
y = df[['target']].copy()
X.tail(5)

In [None]:
# Show the last five rows of the label frame
y.tail(5)

In [None]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

### EDA

In [None]:
import matplotlib.pyplot as plt

# Compare class frequencies to see how imbalanced the labels are
fig, ax = plt.subplots(figsize=(20, 10))
class_counts = df['target'].value_counts()
class_counts.plot.bar(ax=ax)
ax.set_title('Ham versus Spam ratio')
plt.show()

# Per-class row counts as a colour-shaded table
per_class = df.groupby('target').count()
per_class.style.background_gradient(cmap='Blues')

In [None]:
# stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a tokenizer on every message to build the full vocabulary
tk = Tokenizer()
tk.fit_on_texts(X.text)

# Vocabulary size, and how many of those words appear fewer than two times
total_count = len(tk.word_index)
rare_count = sum(1 for freq in tk.word_counts.values() if freq < 2)

print('total number of words : ', total_count, 
      'number of sparse words : ', rare_count)

In [None]:
# Re-fit with the vocabulary capped so that the rare words counted above
# are left out when texts are converted to index sequences
vocab_size = total_count - rare_count + 1
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(X.text)

# Integer-encode every message and peek at the first five tokens of the first one
X_data = tk.texts_to_sequences(X.text)
X_data[0][:5]

In [None]:
# Vocabulary: look up the words behind a handful of token indices

# Invert the word -> index mapping once, then read the indices directly
index_to_word = {idx: word for word, idx in tk.word_index.items()}
sent = [index_to_word.get(idx) for idx in (50, 469, 841, 751, 657)]
print(sent)

In [None]:
# Five most frequent words among the spam messages
X_spam = df[df['target'] == 'spam']['text']

# A separate tokenizer is fitted so the counts cover spam only.
# Only the word counts are needed, so the texts are not converted to sequences.
tk2 = Tokenizer()
tk2.fit_on_texts(X_spam)
rank5 = sorted(tk2.word_counts.items(), key=lambda item: item[1], reverse=True)[:5]

print('spam Top 5')
for rank, (word, freq) in enumerate(rank5, start=1):
    print('{}위 : '.format(rank) + word, freq)

In [None]:
# Five most frequent words among the ham messages
X_ham = df[df['target'] == 'ham']['text']

# A separate tokenizer is fitted so the counts cover ham only.
# Only the word counts are needed, so the texts are not converted to sequences.
tk2 = Tokenizer()
tk2.fit_on_texts(X_ham)
rank5 = sorted(tk2.word_counts.items(), key=lambda item: item[1], reverse=True)[:5]

print('ham Top 5')
for rank, (word, freq) in enumerate(rank5, start=1):
    print('{}위 : '.format(rank) + word, freq)

In [None]:
import numpy as np
import seaborn as sns

# visualization
# Number of tokens in each encoded message (many values repeat)
mail_length = [len(x) for x in X_data]

print('max : ', np.max(mail_length))
print('mean : ', np.mean(mail_length))
print('-' * 100)

plt.figure(figsize=(10, 6))
# `sns.distplot` is deprecated; a density-scaled `histplot` with a KDE overlay
# is the replacement seaborn documents for it.
sns.histplot(mail_length, bins=50, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Distribution')
plt.xlabel('Word count')
plt.show()

In [None]:
# last preparation
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Pad/truncate every encoded message to a fixed length of 183 tokens.
# NOTE(review): 183 is presumably the longest message length printed above - confirm.
X = pd.DataFrame(pad_sequences(X_data, maxlen=183))

# Encode the labels explicitly as ham -> 0, spam -> 1, which is the convention
# the prediction code relies on (probability >= 0.5 means spam).
# Building the frame from `df` avoids assigning into a slice, does not depend
# on which label happens to appear first, and gives the same result on re-run.
y = (df[['target']] == 'spam').astype(int)
y

### Machine Learning

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(X, y, test_size=0.25, random_state=0)
train_X, test_X, train_y, test_y = split

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense


# Embedding -> SimpleRNN -> single sigmoid unit for binary classification
model = Sequential([
    Embedding(total_count, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Hand-computed parameter counts per layer, for comparison with model.summary().
# The embedding table has one 32-wide row per index, and the model was built
# with `total_count` indices, so that value is used instead of a literal.
print('embedding : ', total_count * 32)
# input kernel (32x32) + recurrent kernel (32x32) + bias (32)
print('simple_rnn : ', (32 * 32) + (32 * 32) + 32)
# 32 weights + 1 bias
print('dense : ', 32 * 1 + 1)

In [None]:
# Train silently for 10 epochs, holding back 20% of the training rows for validation
epochs = 10
history = model.fit(
    train_X,
    train_y,
    batch_size=64,
    epochs=epochs,
    validation_split=0.2,
    verbose=0,
)

In [None]:
def plot_curve(hist=None):
    """Plot training and validation loss/accuracy curves side by side.

    Args:
        hist: Object with a ``history`` dict containing the keys ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` (for example the
            value returned by ``model.fit``). When omitted, the notebook-level
            ``history`` variable is read at call time, as before.
    """
    if hist is None:
        # Backward-compatible default: use whatever fit result is currently global
        hist = history
    curves = hist.history

    f, axs = plt.subplots(1, 2, figsize=(10, 5))
    panels = (('loss', 'Loss Curve'), ('accuracy', 'Accuracy Curve'))
    for ax, (metric, title) in zip(axs, panels):
        ax.plot(curves[metric], label=metric)
        ax.plot(curves['val_' + metric], label='val_' + metric)
        ax.legend()
        ax.set_title(title)
    plt.show()
plot_curve()

In [None]:
# Start again from an untrained network with the same architecture
model = Sequential()
for layer in (
    Embedding(total_count, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train for only three epochs this time
epochs = 3
history = model.fit(
    train_X,
    train_y,
    batch_size=64,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predicted spam probabilities for the held-out set
mus = model.predict(test_X)
# Threshold at 0.5 in one vectorized step and flatten to a 1-D label vector,
# rather than building a list of one-element arrays row by row
y_pred = (mus >= 0.5).astype(int).ravel()
print(classification_report(test_y, y_pred))
print('-' * 100)
print('정확도 : ', accuracy_score(test_y, y_pred))

### Imbalanced data

In [None]:
# Shapes of the training features and training labels
train_X.shape, train_y.shape

In [None]:
from sklearn.decomposition import PCA 

# Project the training features onto two principal components for plotting
pca = PCA(n_components=2)
X_2d = pca.fit_transform(train_X)
X_2d = pd.DataFrame(X_2d, columns=['x', 'y'])

# Reset the label index so the rows line up with the fresh 0..n-1 index of X_2d
df_2d = pd.concat([X_2d, train_y.reset_index(drop=True)], axis=1)

plt.figure(figsize=(10, 5))
sns.scatterplot(x='x', y='y', data=df_2d, hue='target', s=12)
plt.title('ham + spam')
plt.show()

In [None]:
# Bar chart of how many training rows fall in each class
fig, ax = plt.subplots(figsize=(10, 5))
rows_per_class = df_2d.groupby('target').size()
rows_per_class.plot.bar(ax=ax)
plt.show()

In [None]:
# Import only what is used instead of a wildcard, so the origin of SMOTE is explicit
from imblearn.over_sampling import SMOTE
from tensorflow.keras.regularizers import l2

# try oversampling
# NOTE(review): SMOTE synthesizes new rows from existing feature vectors; the
# features here are padded token indices, so it is worth confirming that the
# synthetic rows are meaningful inputs for the embedding layer.
blcd_X, blcd_y = SMOTE(random_state=0).fit_resample(train_X, train_y)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense

# Same architecture as before, trained on the resampled data
model = Sequential()
model.add(Embedding(total_count, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

epochs = 10
history = model.fit(blcd_X, blcd_y, epochs=epochs, validation_split=0.2, batch_size=64)

plot_curve()

In [None]:
from sklearn.decomposition import PCA

# Repeat the 2-D PCA projection, this time on the resampled training set
pca = PCA(n_components=2)
projected = pca.fit_transform(blcd_X)
X_2d = pd.DataFrame(projected, columns=['x', 'y'])

# Attach the labels, re-indexed to line up with the projected rows
labels = blcd_y.reset_index(drop=True)
df_2d = pd.concat([X_2d, labels], axis=1)

fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='x', y='y', data=df_2d, hue='target', alpha=0.8, ax=ax)
ax.set_title('ham + spam')
plt.show()

In [None]:
# Bar chart of rows per class after resampling
fig, ax = plt.subplots(figsize=(10, 5))
rows_per_class = df_2d.groupby('target').size()
rows_per_class.plot.bar(ax=ax)
plt.show()

In [None]:
# Predicted spam probabilities for the held-out set
mus = model.predict(test_X)
# Threshold at 0.5 in one vectorized step and flatten to a 1-D label vector,
# rather than building a list of one-element arrays row by row
y_pred = (mus >= 0.5).astype(int).ravel()
print(classification_report(test_y, y_pred))
# Check recall rate

In [None]:
text = '''
How are you doing today? I am Mr. Fong pau teck a staff of a reputable financial institution here in Malaysia.
An investment was placed under my management. I need your assistance in investing the fund in your country into a good business.
If you are interested reply back, so I can forward you with more details.
'''

def predict_spam(text):
    """Classify a single message with the trained model and print the verdict.

    Args:
        text: The raw message as one string.

    Prints one line of the form ``Belief <percent>% - <spam|ham>.`` where the
    percentage is the model's confidence in the class it reports.
    """
    # `texts_to_sequences` takes a list of texts. A bare string would be
    # iterated character by character, so the message is wrapped in a list.
    # Padding goes through `pad_sequences` with the same length as training.
    seq = pad_sequences(tk.texts_to_sequences([text]), maxlen=183)

    # The model returns a (1, 1) array; pull the scalar out before converting
    prob = float(model.predict(seq)[0][0])
    (result, belief) = ('spam', prob) if prob >= 0.5 else ('ham', 1 - prob)

    # Round to a whole percent, and format with one decimal so floating-point
    # artifacts such as 56.99999999999999 never reach the output
    print('Belief {:.1f}% - {}.'.format(round(belief, 2) * 100, result))

In [None]:
# Classify the sample message defined above and print the verdict
predict_spam(text)

In [None]:
# A second sample message, run through the same classifier
text = '''
Your business should be accepting Credit Cards from your customers!
Increase your sales by 30, 40 even 50 percent by accepting Credit Cards
'''

predict_spam(text)