## **Context**

The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load only the label and message columns, then give them readable names.
data = (
    pd.read_csv(
        '../input/sms-spam-collection-dataset/spam.csv',
        encoding='ISO-8859-1',
        usecols=['v1', 'v2'],
    )
    .rename(columns={'v1': 'labels', 'v2': 'content'})
)

print("data shape: ", data.shape)
display(data.head())

### EDA

In [None]:
# Distribution of the labels, to check for class imbalance.
# The vector is passed as `x=`: newer seaborn releases no longer accept the
# plotted variable as a bare positional argument.
sns.countplot(x=data['labels']);
data['labels'].value_counts(normalize=True)

We have an imbalanced dataset.

In [None]:
# Example of a ham message.
ham_indices = data[data.labels == 'ham'].index
random_ham = np.random.choice(ham_indices)

# `ham_indices` holds index *labels*, so look the row up by label (`.loc`);
# positional `.iloc` only gives the same row while the index is 0..n-1.
data.loc[random_ham, 'content']

In [None]:
# Example of a spam message.
spam_indices = data[data.labels == 'spam'].index
random_spam = np.random.choice(spam_indices)

# `spam_indices` holds index *labels*, so look the row up by label (`.loc`);
# positional `.iloc` only gives the same row while the index is 0..n-1.
data.loc[random_spam, 'content']

**Word cloud plots**

In [None]:
# Word cloud (at most 100 words) built from all ham messages joined together.
f = plt.figure(figsize=(8,12))
ham_messages = data.loc[data['labels'] == 'ham', 'content'].to_list()
wordcloud = WordCloud(max_words=100).generate(' '.join(ham_messages))
plt.imshow(wordcloud)
plt.title('ham words');

In [None]:
# Word cloud (at most 100 words) built from all spam messages joined together.
f = plt.figure(figsize=(8,12))
spam_messages = data.loc[data['labels'] == 'spam', 'content'].to_list()
wordcloud = WordCloud(max_words=100).generate(' '.join(spam_messages))
plt.imshow(wordcloud)
plt.title('spam words');

**top 30 words for spam**

In [None]:
from collections import Counter

# Build the stop-word set once. The original called stopwords.words('english')
# for every token and tested membership against a list, which is needlessly slow.
english_stopwords = set(stopwords.words('english'))

# `spam_indices` are index labels, so select by label.
spam_text = ' '.join(data.loc[spam_indices, 'content'])
spam_list = spam_text.lower().split()

# Count every lower-cased, whitespace-separated token that is not a stop word.
cnt = Counter(word for word in spam_list if word not in english_stopwords)

cnt.most_common(30)

**Top 30 words for ham**

In [None]:
# Build the stop-word set once. The original called stopwords.words('english')
# for every token and tested membership against a list, which is needlessly slow.
english_stopwords = set(stopwords.words('english'))

# `ham_indices` are index labels, so select by label.
ham_text = ' '.join(data.loc[ham_indices, 'content'])
ham_list = ham_text.lower().split()

# Count every lower-cased, whitespace-separated token that is not a stop word.
cnt = Counter(word for word in ham_list if word not in english_stopwords)

cnt.most_common(30)

### Preprocessing

Removing messages that are too short or too long

In [None]:
def _n_tokens(message):
    """Return the number of whitespace-separated tokens in `message`."""
    return len(message.split())

# Per-message token count and its distribution.
word_count = data['content'].apply(_n_tokens)
word_count.plot.hist(bins=100);

In [None]:
# Histogram of the token counts strictly between the 2nd and 98th percentiles.
lower_cut = word_count.quantile(0.02)
upper_cut = word_count.quantile(0.98)
word_count[(word_count > lower_cut) & (word_count < upper_cut)].plot.hist(bins=100);

In [None]:
# Keep only messages whose token count lies strictly between the 2nd and 98th percentiles.
lower_cut = word_count.quantile(0.02)
upper_cut = word_count.quantile(0.98)
is_typical_length = (word_count > lower_cut) & (word_count < upper_cut)
data = data[is_typical_length]

In [None]:
# Shape of the frame after the length filter (rows, columns).
print(data.shape)

Removed a bit over 200 sentences


**TFIDF**

A method that will help us convert text data to numerical data. This method assigns weights for each document term, taking into consideration the frequency of a term in a document and the frequency of a term across all documents.

In [None]:
# Turn each message into a TF-IDF vector, keeping at most 5000 terms and
# dropping scikit-learn's built-in English stop words.
# NOTE(review): the vectorizer is fitted on every row of `data`, before the
# train/test split made further down, so the vocabulary and IDF weights also
# see the test messages. The Pipeline section avoids this by fitting inside the pipeline.
cv = TfidfVectorizer(max_features=5000, stop_words='english')
sparse_mat = cv.fit_transform(data['content'])

In [None]:
# Convert the sparse TF-IDF matrix into a dense array and show its shape.
X = sparse_mat.toarray()
print(X.shape)

In [None]:
# Print a few of the learned vocabulary terms.
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# replaces it and returns an array, converted here so the output is still a plain list.
print(cv.get_feature_names_out()[1000:1005].tolist())

**splitting data**

In [None]:
# Split into train/test sets (80/20), stratified on the label so both parts
# keep the same ham/spam ratio.
from sklearn.model_selection import train_test_split
y = data['labels']

# Fixed seed so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

validation

In [None]:
# Sanity check: shapes of the four pieces produced by the split.
for split in (X_train, y_train, y_test, X_test):
    print(split.shape)

### Boosting

A procedure that combines the outputs of many "weak classifiers" (here, tree stumps) to produce a powerful committee.

In [None]:
# Fit AdaBoost with 100 boosting rounds, then predict on the test set.
# A fixed seed keeps the fitted ensemble, and the metrics and plots derived
# from it, the same across runs.
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_model.fit(X_train, y_train)

ada_preds = ada_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the AdaBoost predictions against the true test labels.
ada_accuracy = accuracy_score(y_test, ada_preds)
ada_confusion = confusion_matrix(y_test, ada_preds)
ada_report = classification_report(y_test, ada_preds)

print("Accuracy score: ", ada_accuracy)
print("Confusion matrix: \n", ada_confusion)
print("Classification report: \n", ada_report)

**top 10 important word/features in our Adaboost model**

In [None]:
# Positions of the 10 largest feature importances (argsort is ascending, so
# the most important word is printed last).
feat_importance_index = np.argsort(ada_model.feature_importances_)[-10:]

# `get_feature_names()` was removed from scikit-learn; use `get_feature_names_out()`.
print("10 top important words: \n", pd.Series(cv.get_feature_names_out())[feat_importance_index])

**Plotting top 3 tree stumps(weak classifiers)**

In [None]:
# Indices of the three estimators with the largest boosting weights.
# argsort is ascending, so top_3[2] is the highest-weighted estimator.
top_3 = np.argsort(ada_model.estimator_weights_)[-3:]

In [None]:
# Draw the stump at top_3[0].
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# replaces it (converted to a plain list of strings for plot_tree).
plot_tree(ada_model.estimators_[top_3[0]], class_names=['ham', 'spam'], proportion=True,
               rounded=True, filled=True, feature_names=cv.get_feature_names_out().tolist());

In [None]:
# Draw the stump at top_3[1].
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# replaces it (converted to a plain list of strings for plot_tree).
plot_tree(ada_model.estimators_[top_3[1]], class_names=['ham', 'spam'], proportion=True,
               rounded=True, filled=True, feature_names=cv.get_feature_names_out().tolist());

In [None]:
# Draw the stump at top_3[2].
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# replaces it (converted to a plain list of strings for plot_tree).
plot_tree(ada_model.estimators_[top_3[2]], class_names=['ham', 'spam'], proportion=True,
               rounded=True, filled=True, feature_names=cv.get_feature_names_out().tolist());

### Bagging

In [None]:
# Fit a 100-tree random forest, then predict on the test set.
# Random forests draw bootstrap samples and random feature subsets, so a fixed
# seed is needed for the metrics and plotted tree to be reproducible.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf = model_rf.fit(X_train, y_train)
# Predicting
y_rf_pred = model_rf.predict(X_test)

In [None]:
# Score the random-forest predictions against the true test labels.
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_confusion = confusion_matrix(y_test, y_rf_pred)
rf_report = classification_report(y_test, y_rf_pred)

print('accuracy_score: ', rf_accuracy)
print('confusion matrix: \n', rf_confusion)
print('report: \n', rf_report)

**top 10 important word/features in our Random Forest model**

In [None]:
# Positions of the 10 largest feature importances (argsort is ascending, so
# the most important word is printed last).
feat_importance_index = np.argsort(model_rf.feature_importances_)[-10:]

# `get_feature_names()` was removed from scikit-learn; use `get_feature_names_out()`.
print("10 top important words: \n", pd.Series(cv.get_feature_names_out())[feat_importance_index])

**Plotting the first decision tree in the RF model.**

In [None]:
# Draw the first tree of the forest, truncated to depth 5 to stay readable.
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# replaces it (converted to a plain list of strings for plot_tree).
f = plt.figure(figsize=(35,15))
plot_tree(model_rf.estimators_[0], class_names=['ham', 'spam'], proportion=True,
               rounded=True, filled=True, max_depth=5,
               feature_names=cv.get_feature_names_out().tolist());

### Pipelines

The advanced way

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Re-split on the raw text so each pipeline fits its own TF-IDF on training rows only.
# No test_size is given, so scikit-learn's default of 25% applies here.
# Fixed seed so every pipeline below is evaluated on the same reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    data['content'], data['labels'], stratify=data['labels'], random_state=42)

Adaboost

In [None]:
# TF-IDF -> AdaBoost, fitted on the training text only and scored on the test text.
# The classifier is seeded so the printed report is reproducible.
pipeline = Pipeline(steps=[
                ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
                ('model', AdaBoostClassifier(n_estimators=100, random_state=42))])
_ = pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print(classification_report(y_test, preds))

Random Forest

In [None]:
# TF-IDF -> random forest, fitted on the training text only and scored on the test text.
# The classifier is seeded so the printed report is reproducible.
pipeline = Pipeline(steps=[
                ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
                ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
_ = pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print(classification_report(y_test, preds))

**I will only continue with Random Forest.** 

1) I will use TruncatedSVD to reduce the dimensions. TSVD works well on sparse data, whereas PCA is better suited to dense data.

2) I will try oversampling methods since our data is imbalanced. These methods may improve the f1-score and the recall. I will use SMOTE and RandomOverSampler

Part 1 - 

In [None]:
from sklearn.decomposition import TruncatedSVD

# TF-IDF -> 100-component truncated SVD -> random forest.
# Both the SVD solver and the forest are randomized, so both are seeded to
# make the printed report reproducible.
pipeline = Pipeline(steps=[
                ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
                ('dim_reduction', TruncatedSVD(n_components=100, random_state=42)),
                ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
_ = pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print(classification_report(y_test, preds))

Part 2- 

Over-sampling is simply the process of repeating some samples of the minority class in order to balance the number of samples between the classes in the dataset.

In [None]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

In [None]:
# TF-IDF -> random over-sampling of the minority class -> random forest.
# The sampler and the forest are both seeded so the comparison with the
# earlier pipelines is reproducible.
pipeline = make_pipeline(
                    TfidfVectorizer(max_features=5000, stop_words='english'),
                    RandomOverSampler(random_state=42),
                    RandomForestClassifier(n_estimators=100, random_state=42))

_ = pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

We can see that over-sampling improved both the recall and the f1-score for our minority class (spam).

**Accuracy is not the right metric when the data is imbalanced.**