In [1]:
# PRELIMINARIES

In [2]:
'''

link: https://www.kaggle.com/uciml/sms-spam-collection-dataset/home

dataset description: The SMS Spam Collection is a set of SMS 
tagged messages that have been collected for SMS Spam research.
It contains one set of SMS messages in English of 5,574 messages,
tagged acording being ham (legitimate) or spam.

The files contain one message per line. 
Each line is composed by two columns: 
v1 contains the label (ham or spam) 
and v2 contains the raw text.

'''

'\n\nlink: https://www.kaggle.com/uciml/sms-spam-collection-dataset/home\n\ndataset description: The SMS Spam Collection is a set of SMS \ntagged messages that have been collected for SMS Spam research.\nIt contains one set of SMS messages in English of 5,574 messages,\ntagged acording being ham (legitimate) or spam.\n\nThe files contain one message per line. \nEach line is composed by two columns: \nv1 contains the label (ham or spam) \nand v2 contains the raw text.\n\n'

In [3]:
import numpy as np, pandas as pd
raw_data = pd.read_csv('../input/spam.csv', encoding = "ISO-8859-1")

In [4]:
raw_data['response'] = 0
raw_data.loc[raw_data['v1'] == 'spam', 'response'] = 1
data = raw_data[['response', 'v2']]
data.columns = ['response', 'text']             

In [5]:
# EXPLORATORY DATA ANALYSIS

In [6]:
pd.set_option('display.max_colwidth', -1)
data.head()

Unnamed: 0,response,text
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [7]:
# Event Rate - how many spam and how many ham? 
data.response.value_counts()

0    4825
1    747 
Name: response, dtype: int64

In [8]:
# Check for Nulls
data.isnull().sum()

response    0
text        0
dtype: int64

In [9]:
# Check Data Types

In [10]:
data['text'].astype('str')
data.dtypes

response    int64 
text        object
dtype: object

In [11]:
# FEATURE ENGINEERING

In [12]:
# length of text
data['text_len'] = data['text'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
# number of words
data['text_tokens'] = data['text'].apply(lambda x: len(str(x).split(" ")))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
# average word length
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

data['text_avg_word_len'] = data['text'].apply(lambda x: avg_word(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
# number of stop words/fillers (a, an, the...)
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop = stopwords.words('english')
data['text_stop_words'] = data['text'].apply(lambda x: len([x for x in str(x).split() if str(x) in stop]))

In [16]:
# number of 'spamy'/suspicious words
data['text_keywords'] = data['text'].apply(lambda x: len([x for x in x.split() if x.lower() in ('free', 'win', 'won', 'exclusive', 'enroll', 'discount', 'prize', 'million')]))

In [17]:
# number of numeric characters
data['text_numerics'] = data['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [18]:
# number of titled words
data['text_titles'] = data['text'].apply(lambda x: len([x for x in x.split() if x.istitle()]))

In [19]:
# TRAIN-TEST DATA SPLIT

In [20]:
data_clean = data.drop(['text'], axis = 1)
data_clean = data_clean.dropna()
print(data_clean.dtypes)
y = data_clean['response'].astype('int')
X = data_clean.drop('response', axis = 1)

response             int64  
text_len             int64  
text_tokens          int64  
text_avg_word_len    float64
text_stop_words      int64  
text_keywords        int64  
text_numerics        int64  
text_titles          int64  
dtype: object


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=19)



In [22]:
from catboost import Pool, CatBoostClassifier
cat_feature_index = np.where(X.dtypes == 'object')[0]
train_pool = Pool(X_train, y_train, cat_features = cat_feature_index)
test_pool = Pool(X_test, y_test, cat_features = cat_feature_index)

In [23]:
from catboost import Pool, CatBoostClassifier

cat_model = CatBoostClassifier(
    depth = 6,
    random_seed = 3, 
    learning_rate = 0.1, 
    eval_metric = 'AUC',
    #iterations = 500,
    verbose = True,
    loss_function= 'Logloss',
    od_type='Iter', # overfitting detector - by iterations
    od_wait=50 # prevent overfitting by ending training after 1 rounds without best value
     )

cat_model.fit(
    train_pool,
    eval_set = test_pool, 
    use_best_model = True
    )

0:	test: 0.9676975	best: 0.9676975 (0)	total: 65.7ms	remaining: 1m 5s
1:	test: 0.9670406	best: 0.9676975 (0)	total: 81.1ms	remaining: 40.5s
2:	test: 0.9662542	best: 0.9676975 (0)	total: 97.3ms	remaining: 32.4s
3:	test: 0.9672648	best: 0.9676975 (0)	total: 113ms	remaining: 28.1s
4:	test: 0.9723812	best: 0.9723812 (4)	total: 128ms	remaining: 25.4s
5:	test: 0.9727191	best: 0.9727191 (5)	total: 142ms	remaining: 23.6s
6:	test: 0.9766732	best: 0.9766732 (6)	total: 157ms	remaining: 22.3s
7:	test: 0.9787419	best: 0.9787419 (7)	total: 172ms	remaining: 21.3s
8:	test: 0.9797462	best: 0.9797462 (8)	total: 187ms	remaining: 20.6s
9:	test: 0.9794809	best: 0.9797462 (8)	total: 200ms	remaining: 19.8s
10:	test: 0.9798315	best: 0.9798315 (10)	total: 215ms	remaining: 19.4s
11:	test: 0.9798662	best: 0.9798662 (11)	total: 231ms	remaining: 19s
12:	test: 0.9801726	best: 0.9801726 (12)	total: 245ms	remaining: 18.6s
13:	test: 0.9812179	best: 0.9812179 (13)	total: 260ms	remaining: 18.3s
14:	test: 0.9811137	best:

<catboost.core.CatBoostClassifier at 0x7fe01e0b55f8>

In [24]:
#CAT FEATURE IMPORTANCE

feature_importance = cat_model.get_feature_importance(train_pool)
feature_names = X_train.columns
feature_imp = pd.DataFrame([feature_names, feature_importance])
final = feature_imp.transpose()
final.sort_values(by = 1, ascending = False, inplace = True)
pd.set_option('display.max_colwidth', -1)
final.head(10)

Unnamed: 0,0,1
2,text_avg_word_len,34.6084
0,text_len,19.3112
5,text_numerics,16.4251
1,text_tokens,12.9016
4,text_keywords,7.45372
3,text_stop_words,5.48134
6,text_titles,3.81869


In [25]:
# CAT MODEL EVALUATION

# CAT PREDICTIONS
cat_predictions_probs = cat_model.predict_proba(test_pool)
cat_predictions = np.where(cat_predictions_probs[:,1] > 0.5, 1, 0)
print(cat_predictions[:5]) # predicted class
print(cat_predictions_probs[:5]) # probability scores

print('CAT MODEL EVALUATION')
print(y.describe())
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
print('\nAccuracy: ', str(accuracy_score(y_test, cat_predictions)))
print('Precision: ', str(precision_score(y_test, cat_predictions)))
print('Recall: ', str(recall_score(y_test, cat_predictions)))
print('F1: ', str(f1_score(y_test, cat_predictions)))
print('Area under ROC Curve: ', str(roc_auc_score(y_test, cat_predictions_probs[:,1])))
print('GINI: ', str(-1 + 2*roc_auc_score(y_test, cat_predictions_probs[:,1])))

tn, fp, fn, tp = confusion_matrix(y_test, cat_predictions).ravel()

print('True Negatives: ', str(tn))
print('True Positives: ', str(tp))
print('False Negatives: ', str(fn))
print('False Positives: ', str(fp))

print('\nTotal SMS: ', str(tn+fp+fn+tp))
print('No. of SMS the Model Declares as Spam: ', str(fp+tp))
print('No. of SMS that were actually SPAM: ', str(tp+fn))
print('No. of Spam SMS caught by Model: ', str(tp))

print('\nProportion of SMS Declared as Spam: ', str((fp+tp)/(tn+fp+fn+tp)))
print('Proportion of Spam SMS Caught by Model: ', str(tp/(tp+fn)))

[0 0 0 1 0]
[[0.91190116 0.08809884]
 [0.95829433 0.04170567]
 [0.99117778 0.00882222]
 [0.00241838 0.99758162]
 [0.99780588 0.00219412]]
CAT MODEL EVALUATION
count    5572.000000
mean     0.134063   
std      0.340751   
min      0.000000   
25%      0.000000   
50%      0.000000   
75%      0.000000   
max      1.000000   
Name: response, dtype: float64

Accuracy:  0.9623318385650225
Precision:  0.9562043795620438
Recall:  0.7844311377245509
F1:  0.8618421052631579
Area under ROC Curve:  0.9844993557189419
GINI:  0.9689987114378837
True Negatives:  942
True Positives:  131
False Negatives:  36
False Positives:  6

Total SMS:  1115
No. of SMS the Model Declares as Spam:  137
No. of SMS that were actually SPAM:  167
No. of Spam SMS caught by Model:  131

Proportion of SMS Declared as Spam:  0.12286995515695068
Proportion of Spam SMS Caught by Model:  0.7844311377245509
