In [None]:
import os
import re
import nltk
import string
import spacy
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from numpy.random import seed
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout 
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.random import set_seed

# Fetch the VADER lexicon that SentimentIntensityAnalyzer (used in the
# "Sentiment Analysis" section) reads.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Do not truncate columns when wide DataFrames are displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Download the spaCy vectors package and register it under the shortcut name
# that spacy.load() uses in the next cell.
# NOTE(review): `spacy link` and `en_vectors_web_lg` look specific to spaCy 2.x —
# confirm the spaCy version this notebook is pinned to.
!python -m spacy download en_vectors_web_lg
!python -m spacy link en_vectors_web_lg en_vectors_web_lg_link

In [None]:
# Shared spaCy pipeline: used by text_preparation() and for the document vectors.
nlp = spacy.load('en_vectors_web_lg_link') # 'en_core_web_sm' appears to be an earlier alternative

In [None]:
# Keep only the label and message columns of the SMS spam CSV.
# NOTE(review): the file appears to be Western-European (Latin-1) text — English
# SMS containing characters such as the pound sign. Decoding it with the Cyrillic
# code page windows-1251 maps those bytes to Cyrillic letters, so latin-1 is used
# here; confirm against the raw file.
data = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)[['v1', 'v2']]
data.columns = ['label', 'msg']
data.head()

# Splits

In [None]:
# Hold out 25% of the full dataset as `validation_df`; the remaining 75%
# (`df_else`) is split again in the next cell. The seed is fixed for repeatability.
df_else, validation_df = train_test_split(data, test_size=0.25, random_state=101)

In [None]:
# Split the non-validation part 75% / 25% into train and test, same fixed seed.
df_train, df_test = train_test_split(df_else, test_size=0.25, random_state=101)

In [None]:
# Inspect the raw training split.
df_train

# Data preparation 

In [None]:
def del_punct(text):
    """Return `text` with every ASCII punctuation character replaced by a space.

    Characters listed in ``string.punctuation`` become a single space each;
    all other characters are kept unchanged, so the length is preserved.
    """
    return ''.join(
        ' ' if symbol in string.punctuation else symbol
        for symbol in text
    )

def text_preparation(text: str) -> str:
    """Normalise one message for the bag-of-words features.

    Steps, in order: lower-case, replace punctuation with spaces, run the text
    through the shared `nlp` pipeline, keep the `lemma_` of every token whose
    text is not in the pipeline's default stop-word set, replace each run of
    digits with the placeholder word ``somenumbers`` and collapse whitespace
    runs to a single space.

    Args:
        text: raw message text.

    Returns:
        The cleaned message. Leading/trailing single spaces are not stripped.
    """
    cleaned = del_punct(text.lower())
    doc = nlp(cleaned)
    stop_words = nlp.Defaults.stop_words

    kept_lemmas = [token.lemma_ for token in doc if token.text not in stop_words]
    joined = ' '.join(kept_lemmas)

    # Digits are replaced after lemmatisation, so the placeholder itself is
    # never passed through the pipeline.
    with_placeholder = re.sub(r'\d+', ' somenumbers ', joined)
    return re.sub(r'\s+', ' ', with_placeholder)
     
def processing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with a length feature and cleaned messages.

    Args:
        df: frame with a string column ``msg``.

    Returns:
        A new frame in which ``msg`` holds the output of text_preparation()
        and ``msg_len`` holds the character count of the raw message,
        measured before any cleaning. The input frame is not modified.
    """
    raw_messages = df['msg']
    return df.assign(
        msg_len=raw_messages.apply(len),
        msg=raw_messages.apply(text_preparation),
    )

# Clean the train and test splits; the raw frames stay untouched for later use.
df_train_p, df_test_p = (processing(split) for split in (df_train, df_test))
df_train_p.head()

# EDA

In [None]:
# Class balance of every split, to check that spam/ham proportions are comparable.
datasets = {
    'train': df_train,
    'test': df_test,
    'train + test': df_else,
    'validation': validation_df,
}

for dataset_name, dataset in datasets.items():
    print(f'\n{dataset_name}:')
    label_counts = dataset['label'].value_counts()
    display(label_counts.to_frame())

In [None]:
# Kernel density of the raw message length, one curve per label.
g = sns.FacetGrid(df_train_p, hue='label', height=7, aspect=2).map(sns.kdeplot, 'msg_len')
plt.legend()
plt.show()

Spam messages tend to be longer than ham messages.

In [None]:
def text_for_cloud(label):
    """Join all cleaned training messages of one class into a single string.

    The ``somenumbers`` placeholder inserted by text_preparation() is removed
    so that it does not show up in the word cloud.

    Args:
        label: value of the ``label`` column to select.

    Returns:
        Space-joined messages of that class from `df_train_p`.
    """
    has_label = df_train_p['label'] == label
    messages = df_train_p.loc[has_label, 'msg'].to_list()
    return ' '.join(messages).replace('somenumbers', '')

# One word cloud per class, built from the cleaned training messages.
for label in ['spam', 'ham']:
    cloud_text = text_for_cloud(label)
    wordcloud = WordCloud(
        background_color="white",
        max_words=100,
        max_font_size=500,
    ).generate(cloud_text)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(f'{label.capitalize()} messages wordcloud', fontsize=20)
    plt.show()

# Creating Bag of Words (lemmatized)

In [None]:
# Learn a vocabulary capped at 1500 features from the training messages only,
# and count-encode the training messages in the same pass.
bow_transformer = CountVectorizer(max_features=1500)
bow_train = bow_transformer.fit_transform(df_train_p['msg'])
bow_train.shape

In [None]:
# The TF-IDF weights are learned on the training counts and then reused as-is
# for the test counts.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(bow_train)

bow_test = bow_transformer.transform(df_test_p['msg'])
test_tfidf = tfidf_transformer.transform(bow_test)

In [None]:
# Resolve the vocabulary once. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# fall back to the old method on releases that predate the new one.
if hasattr(bow_transformer, 'get_feature_names_out'):
    feature_names = list(bow_transformer.get_feature_names_out())
else:
    feature_names = bow_transformer.get_feature_names()

# TF-IDF columns are labelled with their vocabulary term; the raw message
# length is appended as one extra feature. `.values` drops the shuffled
# original index so the assignment is positional.
X_train = pd.DataFrame.sparse.from_spmatrix(train_tfidf, columns=feature_names)
X_train['msg_len'] = df_train_p['msg_len'].values

X_test = pd.DataFrame.sparse.from_spmatrix(test_tfidf, columns=feature_names)
X_test['msg_len'] = df_test_p['msg_len'].values

y_train = df_train_p['label']
y_test = df_test_p['label']

# Scaling

In [None]:
# Fit and transform on the same input type (dense arrays). Fitting on the
# DataFrame but transforming `.values` makes newer scikit-learn releases warn
# that the transformed input lacks the feature names seen during fit.
# The scaling ranges are learned from the training features only.
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train.values)
X_test_sc = scaler.transform(X_test.values)

# Dimensionality reduction using autoencoder

In [None]:
# Number of input features of the autoencoder (TF-IDF terms plus msg_len).
input_width = X_train.shape[1]
input_width

In [None]:
def dim_red_analysis(n_epochs):
    """Train a 2-unit-bottleneck autoencoder and plot the encoded training set.

    Builds an encoder (input_width -> 256 -> dropout 0.2 -> 16 -> 2) and a
    mirrored decoder (2 -> 16 -> 256 -> input_width), trains them end-to-end
    to reconstruct the scaled training features, and scatter-plots the 2-D
    encoding of the training set coloured by label.

    Reads the notebook globals `input_width`, `X_train_sc`, `X_test_sc` and
    `df_train`.

    Args:
        n_epochs: number of training epochs. With 0 the plot shows the
            encoding of the untrained network. Loss curves are drawn only
            when more than one epoch was run.

    Returns:
        None. The function prints the model summary and shows plots.
    """
    # Re-seed NumPy and TensorFlow on every call so that runs with different
    # epoch counts start from the same random state.
    seed(101)
    set_seed(101)

    encoder = Sequential([
        Dense(units=256, activation='relu', input_shape=[input_width]),
        Dropout(0.2),
        Dense(units=16, activation='relu'),
        Dense(units=2, activation='relu'),
    ])

    decoder = Sequential([
        Dense(units=16, activation='relu', input_shape=[2]),
        Dense(units=256, activation='relu'),
        Dense(units=input_width, activation='relu'),
    ])

    autoencoder = Sequential([encoder, decoder])

    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    # NOTE(review): a learning rate of 10 is unusually large for SGD; the value
    # is kept from the original — confirm it is intended.
    autoencoder.compile(loss='mse', optimizer=SGD(learning_rate=10))

    autoencoder.summary()

    # Validate on the *scaled* test features: the network is trained on scaled
    # inputs, so a val_loss computed on the unscaled frame is not comparable
    # with the training loss.
    history = autoencoder.fit(
        X_train_sc,
        X_train_sc,
        epochs=n_epochs,
        validation_data=(X_test_sc, X_test_sc),
    )

    if n_epochs > 1:
        histo = pd.DataFrame(history.history)
        for metric in ['loss', 'val_loss']:
            plt.title(metric)
            histo[metric].plot()
            plt.show()

    # Only the encoder half is used to project the training set to 2-D.
    encoded_2dim = pd.DataFrame(encoder.predict(X_train_sc))
    encoded_2dim['y'] = df_train['label'].values

    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=encoded_2dim, x=0, y=1, hue='y', palette='magma')
    plt.show()

In [None]:
# Zero epochs: shows the 2-D encoding produced by the untrained (seeded) encoder.
dim_red_analysis(0)

In [None]:
# One epoch; loss curves are skipped because they are only drawn when n_epochs > 1.
dim_red_analysis(1)

In [None]:
# Three epochs.
dim_red_analysis(3)

In [None]:
# 83 epochs.
# NOTE(review): the reason for this particular epoch count is not documented — confirm.
dim_red_analysis(83)

We can see that the classes are clearly separable, with only a small overlap.

# Modelling with TF-IDF

In [None]:
def eval_result(model, X_test, y_test):
    """Print and display classification metrics for a fitted model.

    Predicts on `X_test`, prints the classification report, displays the
    confusion matrix and prints the accuracy rounded to 5 decimals. When the
    model exposes ``feature_importances_``, the 20 most important features are
    displayed as well. All warnings raised while doing so are suppressed.

    Args:
        model: fitted estimator with a ``predict`` method.
        X_test: feature DataFrame; its ``columns`` label the importances.
        y_test: true labels. The fixed 'Ham'/'Spam' display names assume the
            two classes of this notebook ('ham', 'spam').

    Returns:
        None.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pred = model.predict(X_test)

        print(classification_report(y_test, pred, target_names = ['Ham', 'Spam']))

        confusion_table = pd.DataFrame(
            confusion_matrix(y_test, pred),
            index = ['Ham', 'Spam'],
            columns = ['Predicted Ham', 'Predicted Spam'],
        )
        display(confusion_table)

        accuracy = round(accuracy_score(y_test, pred), 5)
        print(f'Accuracy: {accuracy}')

        # Only tree-based models expose importances; nothing more to show otherwise.
        if not hasattr(model, 'feature_importances_'):
            return

        importance_table = pd.DataFrame({
            'Variable'  : X_test.columns,
            'Importance': model.feature_importances_,
        }).sort_values('Importance', ascending=False)
        display(importance_table.head(20))

In [None]:
# Multinomial naive Bayes on the TF-IDF + message-length features.
nb_model = MultinomialNB().fit(X_train, y_train)
eval_result(nb_model, X_test, y_test)

In [None]:
# Single decision tree on the TF-IDF + message-length features (fixed seed).
dtc = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
eval_result(dtc, X_test, y_test)

In [None]:
# Random forest on the TF-IDF + message-length features, using all CPU cores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1).fit(X_train, y_train)
eval_result(rfc, X_test, y_test)

In [None]:
# Gradient boosting on the TF-IDF + message-length features (fixed seed).
gbc = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
eval_result(gbc, X_test, y_test)

# Word2Vec

In [None]:
# One document vector per *raw* (uncleaned) message, taken from the spaCy pipeline.
X_train_v, X_test_v = (
    pd.DataFrame([nlp(msg).vector for msg in frame['msg']])
    for frame in (df_train, df_test)
)

In [None]:
# (number of training messages, vector dimensionality)
X_train_v.shape

In [None]:
# Random forest on the document vectors only.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1).fit(X_train_v, y_train)
eval_result(rfc, X_test_v, y_test)

In [None]:
# Gradient boosting on the document vectors only.
gbc = GradientBoostingClassifier(random_state=1).fit(X_train_v, y_train)
eval_result(gbc, X_test_v, y_test)

# Sentiment Analysis

In [None]:
# VADER analyzer; relies on the 'vader_lexicon' resource downloaded in the first cell.
sid = SentimentIntensityAnalyzer()

In [None]:
# One row of VADER polarity scores per raw message; the score names returned by
# polarity_scores() become the columns.
X_train_sa, X_test_sa = (
    pd.DataFrame([sid.polarity_scores(msg) for msg in frame['msg']])
    for frame in (df_train, df_test)
)
X_train_sa

In [None]:
# Random forest on the sentiment scores only.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1).fit(X_train_sa, y_train)
eval_result(rfc, X_test_sa, y_test)

In [None]:
# Gradient boosting on the sentiment scores only.
gbc = GradientBoostingClassifier(random_state=1).fit(X_train_sa, y_train)
eval_result(gbc, X_test_sa, y_test)

# Topic modelling

In [None]:
# 12-topic LDA fitted on the raw term counts of the training messages.
LDA = LatentDirichletAllocation(n_components=12, random_state=1).fit(bow_train)
LDA

In [None]:
# Resolve the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older releases.
if hasattr(bow_transformer, 'get_feature_names_out'):
    vocabulary = bow_transformer.get_feature_names_out()
else:
    vocabulary = bow_transformer.get_feature_names()

for index,topic in enumerate(LDA.components_):
    print(f'The top 10 words for topic #{index}')
    # argsort() is ascending, so the last ten indices are the highest-weight
    # terms, listed from the weakest to the strongest of them.
    print([str(vocabulary[i]) for i in topic.argsort()[-10:]])
    print('\n')

In [None]:
# Per-message topic distribution, used as a 12-column feature set.
topic_results_train, topic_results_test = (
    pd.DataFrame(LDA.transform(counts)) for counts in (bow_train, bow_test)
)
print(f'train shape: {topic_results_train.shape}, test shape: {topic_results_test.shape}')

In [None]:
# Random forest on the topic distributions only.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1).fit(topic_results_train, y_train)
eval_result(rfc, topic_results_test, y_test)

In [None]:
# Gradient boosting on the topic distributions only.
gbc = GradientBoostingClassifier(random_state=1).fit(topic_results_train, y_train)
eval_result(gbc, topic_results_test, y_test)

# Word2Vec + TF-IDF + Sentiment data

In [None]:
# Combine TF-IDF (+ msg_len), document-vector and sentiment features column-wise.
# All three frames carry a fresh 0..n-1 index, so the rows line up positionally.
# The vector frames have integer column labels; they are prefixed so that the
# combined frame has string labels only — scikit-learn >= 1.2 rejects inputs
# whose column names mix str and int — and so that vector features are
# recognisable in the feature-importance table.
X_train_full = pd.concat([X_train, X_train_v.add_prefix('vec_'), X_train_sa], axis=1)
X_test_full = pd.concat([X_test, X_test_v.add_prefix('vec_'), X_test_sa], axis=1)

In [None]:
# (number of training messages, total number of combined features)
X_train_full.shape

In [None]:
# Random forest on the combined feature set.
rfc = RandomForestClassifier(n_jobs=-1, random_state=1).fit(X_train_full, y_train)
eval_result(rfc, X_test_full, y_test)

In [None]:
# Gradient boosting on the combined feature set. When the cells are run top to
# bottom this is the last model bound to `gbc`, i.e. the one evaluated in the
# "Validation" section.
gbc = GradientBoostingClassifier(random_state=1).fit(X_train_full, y_train)
eval_result(gbc, X_test_full, y_test)

The topic-modelling features were left out of the combined feature set because they worsened the results. The model of choice is the gradient boosting classifier (GBC).

# Validation

In [None]:
# Build the validation features with the transformers fitted on the training
# split; nothing is re-fitted here.
df_val_p = processing(validation_df)

bow_val = bow_transformer.transform(df_val_p['msg'])
val_tfidf = tfidf_transformer.transform(bow_val)

X_val = pd.DataFrame.sparse.from_spmatrix(val_tfidf)
X_val['msg_len'] = df_val_p['msg_len'].values
# Label the columns like the training matrix (vocabulary terms + msg_len).
# Without this the TF-IDF columns keep integer labels, unlike X_train / X_test.
X_val.columns = X_train.columns

y_val = df_val_p['label']

# Vector and sentiment features are computed from the raw messages, as for the
# train and test splits.
X_val_v = pd.DataFrame([nlp(msg).vector for msg in validation_df['msg']])
X_val_sa = pd.DataFrame([sid.polarity_scores(msg) for msg in validation_df['msg']])
X_val_full = pd.concat([X_val, X_val_v, X_val_sa], axis=1)

# The blocks are concatenated in the same order as for X_train_full, so the
# columns correspond one-to-one; reuse the training labels so that the model
# sees the same feature names it was fitted with and the feature-importance
# table is labelled correctly.
assert X_val_full.shape[1] == X_train_full.shape[1], 'validation/train feature count mismatch'
X_val_full.columns = X_train_full.columns
X_val_full.shape

In [None]:
# Final check on the held-out validation split. `gbc` is whichever model was
# last assigned to that name — the combined-features GBC when cells run in order.
eval_result(gbc, X_val_full, y_val)