In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures drawn by the cells below.
sns.set(style="whitegrid")

In [None]:
# Load the SMS spam collection and keep only the label and message columns
# under readable names.
# NOTE(review): public copies of this CSV are often reported to need
# encoding="latin-1" — confirm against the actual input file.
data = (
    pd.read_csv("../input/sms-spam-collection-dataset/spam.csv")
    [['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
data.head(10)

In [None]:
# Class balance: number of messages per label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='label', ax=ax)

# Create feature for text message length

In [None]:
# Message length measured in non-space characters.
data['length'] = data['text'].map(lambda message: len(message.replace(" ", "")))
data.head()

In [None]:
# sns.displot(data=data, x='length', hue='label', bins=np.linspace(0, 200, 20), fill=True, aspect=1.5, alpha=0.5)

# Overlaid histograms of message length, one per class, on shared bins.
plt.figure(figsize=(10, 5))
bins = np.linspace(0, 200, 40)
for class_name in ('ham', 'spam'):
    class_lengths = data.loc[data['label'] == class_name, 'length']
    plt.hist(class_lengths, bins, alpha=0.5, label=class_name)
plt.legend(loc='upper left')

# Create feature for % of text that is punctuation

In [None]:
import string

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Punctuation means membership in ``string.punctuation``. The ratio is
    rounded to three decimals before being scaled to a percentage, so the
    result has one decimal place (e.g. ``33.3``).

    Returns ``0.0`` for an empty or all-space message instead of raising
    ``ZeroDivisionError``.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Nothing to measure: avoid dividing by zero.
        return 0.0

    count = sum(1 for char in text if char in string.punctuation)

    # The outer round() removes float noise such as 33.300000000000004.
    return round(round(count / non_space_chars, 3) * 100, 1)

# Punctuation share (in percent) for every message.
data['punct%'] = data['text'].apply(count_punct)

data.head()

In [None]:
# Overlaid histograms of punctuation percentage, one per class, on shared bins.
plt.figure(figsize=(10, 5))
bins = np.linspace(0, 50, 40)
for class_name in ('ham', 'spam'):
    class_punct = data.loc[data['label'] == class_name, 'punct%']
    plt.hist(class_punct, bins, alpha=0.5, label=class_name)
plt.legend(loc='upper right')

# Create feature for % of uppercase letters

In [None]:
def count_uppercase(text):
    """Return the percentage of non-space characters in ``text`` that are uppercase.

    A character counts when ``str.isupper()`` is true for it. The ratio is
    rounded to three decimals before being scaled to a percentage, so the
    result has one decimal place.

    Returns ``0.0`` for an empty or all-space message instead of raising
    ``ZeroDivisionError``.
    """
    non_space_chars = len(text) - text.count(" ")
    if non_space_chars == 0:
        # Nothing to measure: avoid dividing by zero.
        return 0.0

    count = sum(1 for char in text if char.isupper())

    # The outer round() removes float noise left by the multiplication.
    return round(round(count / non_space_chars, 3) * 100, 1)

# Uppercase share (in percent) for every message.
data['upper%'] = data['text'].apply(count_uppercase)

data.head()

# Create feature for the number of exclamation marks

In [None]:
# Number of "!" characters in every message.
data['exclamation_marks'] = data['text'].str.count("!")

data.head(10)

In [None]:
# data[data['Unnamed: 3'].notnull()]
# data[data['Unnamed: 4'].notnull()]

# Sentence Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

# Split every message into a list of sentences.
data['text_clean'] = data['text'].apply(sent_tokenize)

data.head()

# Remove punctuation

In [None]:
import string
string.punctuation  # show the characters that the punctuation-removal step treats as punctuation

In [None]:
def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped."""
    return "".join(char for char in text if char not in string.punctuation)

# Lower-case each message, then strip its punctuation.
data['text_clean'] = data['text'].str.lower().apply(remove_punct)

data.head()

# Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Turn each cleaned message into a list of word tokens.
data['text_clean'] = data['text_clean'].apply(word_tokenize)

data.head()

# Stopwords

In [None]:
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stop_words``, order preserved."""
    return [token for token in text if token not in stop_words]

# Drop stop words from every token list.
data['text_clean'] = data['text_clean'].apply(remove_stopwords)

data.head()

# Lexicon Normalization

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
# Shared WordNet lemmatizer instance.
wnl = WordNetLemmatizer()


def lemmatizing(text):
    """Return a list with each token of ``text`` lemmatized using ``wnl``'s default POS."""
    return [wnl.lemmatize(token) for token in text]

# Lemmatize every token list.
data['text_clean'] = data['text_clean'].apply(lemmatizing)

data.head()

# Preparing the text

In [None]:
!pip install contractions

In [None]:
import contractions
import string
from nltk.corpus import wordnet
from nltk.tag import pos_tag

from nltk.corpus import stopwords
# Same stop-word set as the earlier "Stopwords" cell, rebuilt here so the
# functions below find it even if that cell was not run.
stop_words=set(stopwords.words("english"))

from nltk.stem.wordnet import WordNetLemmatizer
# Lemmatizer instance used by lemmatizing() below.
wnl = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map the POS tag NLTK assigns to ``word`` onto a WordNet POS constant.

    The word is tagged on its own (as a one-element list) and only the first
    letter of the tag is used: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    treebank_tag = pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN

def remove_punct(text):
    """Filter a token list: drop stop words, strip punctuation, discard empties.

    ``text`` is an iterable of tokens. Tokens found in ``stop_words`` are
    skipped. The rest lose every character in ``string.punctuation``, and a
    token that ends up empty is left out. Returns the surviving tokens as a list.
    """
    kept_tokens = []
    for token in text:
        if token in stop_words:
            continue

        stripped = "".join(char for char in token if char not in string.punctuation)
        if stripped:
            kept_tokens.append(stripped)

    return kept_tokens

def lemmatizing(text):
    """Normalize one message and return it as a single space-joined string.

    Steps, in order: replace "/" with a space, expand contractions, tokenize
    into words, lemmatize each token with the POS from ``get_wordnet_pos``,
    then pass the lemmas through ``remove_punct``.
    """
    expanded = contractions.fix(text.replace("/", " "))
    lemmas = [
        wnl.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(expanded)
    ]
    return " ".join(remove_punct(lemmas))

# Run the full cleaning pipeline on the lower-cased raw messages.
data['text_clean'] = data['text'].str.lower().apply(lemmatizing)

data.head(20)

# CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: fit the vectorizer on the cleaned messages and transform them
# in one call; the printed shape is that of the resulting matrix.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(data['text_clean'])
print(X_counts.shape)
# To inspect the learned vocabulary: print(count_vect.get_feature_names_out())

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back only on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    count_feature_names = count_vect.get_feature_names_out()
else:
    count_feature_names = count_vect.get_feature_names()

# Dense view of the count matrix with one column per vocabulary term.
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_feature_names)
X_counts_df

# CountVectorizer (w/ N-Grams)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Same as the unigram cell, but ngram_range=(2,2) restricts the features to
# bigrams. Note that this overwrites the X_counts produced by the unigram cell.
ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data['text_clean'])
print(X_counts.shape)
# To inspect the learned bigrams: print(ngram_vect.get_feature_names_out())

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back only on older releases.
if hasattr(ngram_vect, "get_feature_names_out"):
    ngram_feature_names = ngram_vect.get_feature_names_out()
else:
    ngram_feature_names = ngram_vect.get_feature_names()

# Dense view of the bigram count matrix with one column per bigram.
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=ngram_feature_names)
X_counts_df

# TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting: fit on the cleaned messages and transform them in one
# call; the printed shape is that of the resulting matrix.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(data['text_clean'])
print(X_tfidf.shape)
# To inspect the learned vocabulary: print(tfidf_vect.get_feature_names_out())

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vect, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vect.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

# Dense view of the TF-IDF matrix with one column per vocabulary term.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_feature_names)
X_tfidf_df

In [None]:
# Combine the hand-crafted features with the TF-IDF matrix. The TF-IDF frame
# reuses data's index so the column-wise concat lines up row for row.
tfidf_features = pd.DataFrame(X_tfidf.toarray(), index=data.index)
X_features = pd.concat(
    [data[['length', 'punct%', 'upper%', 'exclamation_marks']], tfidf_features],
    axis=1,
)
# The TF-IDF columns are labelled with ints while the engineered ones are
# strings; scikit-learn 1.2+ raises TypeError on such mixed labels, so make
# every label a string.
X_features.columns = X_features.columns.astype(str)
X_features.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# 5-fold cross-validated accuracy of a random forest on all features.
# Seeds are fixed so the score is reproducible; shuffling keeps the folds
# from depending on the row order of the input file.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_score(rf, X_features, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1).mean()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. The seed makes the split (and the
# metrics computed from it) reproducible; stratify keeps the ham/spam ratio
# identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, data['label'], test_size=0.2, random_state=42, stratify=data['label']
)

In [None]:
# Fit the random forest on the training split; the fixed seed makes the
# trained model (and everything derived from it) reproducible across runs.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train, y_train)

In [None]:
# Score the held-out split, treating 'spam' as the positive class.
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

# Accuracy is the fraction of test messages whose prediction matches the label.
n_correct = (y_pred == y_test).sum()
accuracy = n_correct / len(y_pred)

report = 'Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round(accuracy, 3),
)
print(report)

In [None]:
# sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

# Plot the ten most important features. Slicing feature_importances_[:10]
# would only look at the first ten columns, so rank all features first.
importances = rf_model.feature_importances_
# argsort is ascending: the last ten indices are the ten largest importances,
# and barh then draws the strongest feature at the top.
sorted_idx = np.argsort(importances)[-10:]
feature_importance = importances[sorted_idx]
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(17, 6))
plt.barh(pos, feature_importance, align='center')
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')