In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('datasets/05/spam.csv', encoding='ISO-8859-1')

In [3]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# 2 Feature engineering

In [4]:
df = df.fillna('')
df['text'] = df['v2'] + df['Unnamed: 2'] + df['Unnamed: 3'] + df['Unnamed: 4']
df['label'] = df['v1'] == 'spam'
df = df.drop(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [5]:
df

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,U dun say so early hor... U c already then say...,False
4,"Nah I don't think he goes to usf, he lives aro...",False
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,True
5568,Will Ì_ b going to esplanade fr home?,False
5569,"Pity, * was in mood for that. So...any other s...",False
5570,The guy did some bitching but I acted like i'd...,False


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer


In [13]:
X = df['text']
y = df['label']

In [16]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [9]:
text_processor = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

In [18]:
def get_text_length(x):
    return np.array([len(t) for t in x]).reshape(-1, 1)

In [26]:
full_processor = Pipeline([
    ('features', FeatureUnion([
        ('text', text_processor),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ]))
])

In [27]:
X_processed = full_processor.fit_transform(X)

In [28]:
X_processed

<5572x8765 sparse matrix of type '<class 'numpy.float64'>'
	with 79907 stored elements in Compressed Sparse Row format>

<hr>

# 3 Modeling

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, stratify=y, random_state=1)

In [29]:
def run_exps(X_train: pd.DataFrame , y_train: pd.DataFrame, models: list):
    best_f1 = 0
    best_f1_model = ''
    best_acc = 0
    for name, model in models:
        print("-----")
        print(f"Model {name}")
        f1 = cross_val_score(model, X_train, y_train, cv=5, scoring='f1').mean()*100
        acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()*100
        if f1 > best_f1:
            best_f1 = f1
            best_f1_model = name
        if acc > best_acc:
            best_acc = acc
            best_acc_model = name
        print(f"F1: {f1:.1f}")
        print(f"Accuracy: {acc:.1f}")
    
    print('-----')
    print("Best models:")
    print("By F1 score:", best_f1_model, '{:.1f}'.format(best_f1))
    print("By accuracy: ", best_acc_model, '{:.1f}'.format(best_acc))

In [30]:
models = [
    ('LogReg', LogisticRegression()), 
    ('RF', RandomForestClassifier()),
    ('SVM', SVC(probability=True)), 
    ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
]

In [31]:
run_exps(X_train, y_train, models)

-----
Model LogReg


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

F1: 84.0
Accuracy: 96.2
-----
Model RF
F1: 89.6
Accuracy: 97.5
-----
Model SVM
F1: 0.0
Accuracy: 86.6
-----
Model XGB
F1: 91.2
Accuracy: 97.8
-----
Best models:
By F1 score: XGB 91.2
By accuracy:  XGB 97.8


Nejlepsi model je XGB, proto dale pokracujeme s nim.

<hr>

# 4 Model tuning

In [32]:
from sklearn.model_selection import GridSearchCV

In [39]:
pipe = Pipeline([
    ('features', FeatureUnion([
        ('text', text_processor),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

In [47]:
text_processor

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('tfidf', TfidfTransformer())])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

## 4.1 Parametry transformeru

In [48]:
# sorted(pipe.get_params().keys())

In [44]:
parameters = {
    'features__text__vectorizer__stop_words':['english', None],
    'features__text__vectorizer__ngram_range':[(1,1), (1,3)],
    'features__text__tfidf__use_idf':[True, False]
},

grid = GridSearchCV(pipe, parameters, cv=5, scoring='f1', return_train_score=True, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text',
                                                                        Pipeline(steps=[('vectorizer',
                                                                                         CountVectorizer()),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())])),
                                                                       ('length',
                                                                        Pipeline(steps=[('count',
                                                                                         FunctionTransformer(func=<function get_text_length at 0x00000128213FECA0>))]))])),
                                       ('clf',
                           

In [45]:
grid.best_params_

{'features__text__tfidf__use_idf': False,
 'features__text__vectorizer__ngram_range': (1, 1),
 'features__text__vectorizer__stop_words': None}

In [55]:
print('{:.1f}'.format(grid.best_score_*100))

91.3


Tuningem parametru transformeru jsme dosahli zlepseni pouze o 0.1 procentniho bodu

## 4.2 Hyperparametry modelu

In [57]:
parameters = {
    'features__text__tfidf__use_idf': [False],
    'features__text__vectorizer__ngram_range': [(1, 1)],
    'features__text__vectorizer__stop_words': [None],
    
    'clf__n_estimators':[100, 300],
    'clf__booster':['gbtree', 'gblinear']
},

grid = GridSearchCV(pipe, parameters, cv=5, scoring='f1', return_train_score=True, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text',
                                                                        Pipeline(steps=[('vectorizer',
                                                                                         CountVectorizer()),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())])),
                                                                       ('length',
                                                                        Pipeline(steps=[('count',
                                                                                         FunctionTransformer(func=<function get_text_length at 0x00000128213FECA0>))]))])),
                                       ('clf',
                           

In [59]:
grid.best_params_

{'clf__booster': 'gblinear',
 'clf__n_estimators': 100,
 'features__text__tfidf__use_idf': False,
 'features__text__vectorizer__ngram_range': (1, 1),
 'features__text__vectorizer__stop_words': None}

In [60]:
print('{:.1f}'.format(grid.best_score_*100))

92.9


Tuningem hyperparametru modelu jsme dosahli zlepseni o dalsich 0.6 procentnich bodu