In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# List every file shipped with the attached Kaggle dataset(s).
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for path in input_paths:
    print(path)

# Only CSV files can be parsed by pd.read_csv; skip anything else instead of
# crashing on (or mis-parsing) an unrelated file.
csv_paths = [path for path in input_paths if path.lower().endswith('.csv')]
if not csv_paths:
    # Fail here with a clear message rather than with a NameError on `data`
    # in a later cell.
    raise FileNotFoundError("no CSV file found under /kaggle/input")

# When several CSV files are present the last one visited becomes `data`
# (same outcome as parsing each file in turn), but the earlier files are no
# longer parsed only to be thrown away.
data = pd.read_csv(csv_paths[-1])

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Note: this notebook follows the scikit-learn tutorial "Working With Text Data": https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

Split the data into input and label, and into training and validation sets.

In [None]:
from sklearn.model_selection import train_test_split

# Keep only the label column ("class") and the raw text column ("message").
data = data[["class","message"]]
X = data["message"]
# Labels are used as they appear in the "class" column (no numeric encoding).
Y = data["class"]

# Fixed seed so the train/test split is reproducible across runs.
# NOTE(review): the markdown below reports a class imbalance; consider
# passing stratify=Y to keep the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

# A notebook cell only renders its LAST expression, so two separate
# `.head()` lines would silently drop the first preview. Show the first
# training messages and their labels side by side instead (both heads share
# the same index, so the rows line up).
pd.concat([X_train.head(5), y_train.head(5)], axis=1)


Process the input messages.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Feature-extraction pipeline (no classifier yet):
#   "cntvec"    - tokenise each message and count token occurrences
#   "TfidTrans" - rescale the counts to term frequencies; use_idf=False means
#                 no inverse-document-frequency weighting is applied
X_pipe = Pipeline(steps=[
    ("cntvec", CountVectorizer()),
    ("TfidTrans", TfidfTransformer(use_idf=False)),
])

# Fit on the training messages only, then reuse the fitted pipeline to
# transform the held-out messages.
X_train_proc = X_pipe.fit_transform(X_train, y_train)
X_test_proc = X_pipe.transform(X_test)



Test different classifiers.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import metrics

# Candidate models as (display name, estimator) pairs. Each one is trained on
# the already-vectorised training messages and scored on the held-out set.
clasifiers = [
    ("naive bayes", MultinomialNB()),
    (
        "clf",
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
    ("decision tree", DecisionTreeClassifier(max_depth=5)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
]

for label, model in clasifiers:
    # fit() returns the estimator itself, so training and prediction chain.
    y_test_pred = model.fit(X_train_proc, y_train).predict(X_test_proc)
    report = metrics.classification_report(y_test, y_test_pred)

    # Name, per-class report, then a separator line between models.
    print(
        label,
        report,
        "---------------------------------------------------------------------",
        sep="\n",
    )



Most classifiers correctly label all the "ham" messages (note that our dataset has around 6 times more "ham" than "spam"), but the clf and the RBF SVM have better precision at labeling "spam". 
Since one of my sources recommends clf for text processing, we continue with that.

Now find the optimal parameters.

In [None]:
from sklearn.model_selection import GridSearchCV

# Full text -> prediction pipeline: bigram-aware counts, term-frequency
# rescaling, then the RBF-kernel SVM as the final classifier step.
# (An SGDClassifier step named "clf" was tried in this slot earlier.)
X_pipe = Pipeline(steps=[
    ("cntvec", CountVectorizer(ngram_range=(1, 2))),
    ("TfidTrans", TfidfTransformer(use_idf=False)),
    ("RBF_SVM", SVC(gamma=2, C=1)),
])

# Search grid. Keys follow the "<pipeline step name>__<parameter name>" form.
# Earlier experiments, kept here for the record:
#   cntvec__ngram_range: [(1, 1), (1, 2)]
#   clf__alpha:          (1e-2, 1e-3)
#   RBF_SVM__gamma:      (1e-1, 1e+1)   (first try)
#   RBF_SVM__C:          (1e-2, 1e+2)   (first try)
parameters = dict(
    TfidTrans__use_idf=(True, False),
    RBF_SVM__gamma=(1, 5, 1e+1),
    RBF_SVM__C=(1e-2, 1e-1, 1),
)

# Exhaustive 5-fold cross-validated search over the grid, single process.
gs_clf = GridSearchCV(estimator=X_pipe, param_grid=parameters, cv=5, n_jobs=1)
gs_clf.fit(X_train, y_train)

# Report the winning value for every searched parameter, in name order.
for key in sorted(parameters):
    print(f"{key}: {gs_clf.best_params_[key]!r}")

# Score the best pipeline found by the search on the held-out messages.
y_test_pred = gs_clf.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))


For clf
The parameters:
* use_idf = False
* alpha = 0.001
* and ngram_range = (1,2)

give the best results.
It labels only 2% more spam messages than our initial guess (81%).

for RBF SVM the following parameters were the best:
* C = 100
* gamma = 0.1
* use_idf = True
* ngram_range = (1,2)

The labeling of "spam" has improved by 15% (to 93%), outperforming the clf. However, it introduces some (although very few) false spam labels. 

Zooming in further on the parameter grid might improve the results even more.

A second attempt for the SVM (with a different parameter grid) gives the following as best:
* C = 1
* gamma = 1
* use_idf = False

This is much closer to the original parameters used when testing the different classifiers, and the results are also much closer: a 7% increase in "spam" labels and no false "spam". Clearly, the number of false "spam" labels has a bigger effect on the accuracy than the number of false "ham" labels.
