In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from IPython.display import Image
import warnings
from collections import namedtuple
import gzip
import logging
from importlib import reload
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')
import base64
warnings.filterwarnings("ignore")
%matplotlib inline  

In [30]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'words'])

def trace(items_num, trace_num=1000):
    if items_num % trace_num == 0: logging.info("Complete items %05d" % items_num)

def load_csv(input_file_name):    
    
    with gzip.open(input_file_name) if input_file_name.endswith('gz') else open(input_file_name)  as input_file:            
        headers = input_file.readline()
        
        for i, line in enumerate(input_file):
            trace(i)
            line = line.decode()
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            mark = bool(int(parts[1]))                    
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64).decode("utf-8", errors="ignore")
            yield DocItem(url_id, mark, url, html_data)            
                
        trace(i, 1)

In [31]:
TRAIN_DATA_FILE = 'kaggle_train_data_tab_new.csv.gz'
train_docs = list(load_csv(TRAIN_DATA_FILE))

20:00:30 INFO:Complete items 00000
20:00:32 INFO:Complete items 01000
20:00:33 INFO:Complete items 02000
20:00:34 INFO:Complete items 03000
20:00:35 INFO:Complete items 04000
20:00:36 INFO:Complete items 05000
20:00:36 INFO:Complete items 06000
20:00:38 INFO:Complete items 07000
20:00:38 INFO:Complete items 07043


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [82]:
train_data = pd.DataFrame(train_docs)
train_data.head()
X_train = train_data['words']
y_train = train_data['is_spam'].replace({True: 1, False: 0})

In [83]:
tf = TfidfVectorizer(ngram_range=(2,2), min_df=0.01, sublinear_tf=True)
X_train = tf.fit_transform(X_train)
X_train.shape, y_train.shape

((7044, 13692), (7044,))

In [85]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

# m = SGDClassifier(verbose=True)
# scores = cross_val_score(m, X_train, y_train, cv=5)
# print (scores)

models = [MLPClassifier, ExtraTreesClassifier, SGDClassifier]
f1_scores = []
trained_models = []
for model in models:
    m = model(verbose=True)
    m.fit(X_train, y_train)
#     output = m.predict(X_test)
#     f1 = f1_score(output, y_test)
#     print(f'F1 : {f1:.4f}%')

#     f1_scores.append(f1)
    trained_models.append(m)

Iteration 1, loss = 0.40012305
Iteration 2, loss = 0.12983386
Iteration 3, loss = 0.07720951
Iteration 4, loss = 0.05596109
Iteration 5, loss = 0.04313051
Iteration 6, loss = 0.03354300
Iteration 7, loss = 0.02678482
Iteration 8, loss = 0.02146630
Iteration 9, loss = 0.01710335
Iteration 10, loss = 0.01398323
Iteration 11, loss = 0.01176881
Iteration 12, loss = 0.00997675
Iteration 13, loss = 0.00852785
Iteration 14, loss = 0.00742534
Iteration 15, loss = 0.00675678
Iteration 16, loss = 0.00573321
Iteration 17, loss = 0.00505746
Iteration 18, loss = 0.00457808
Iteration 19, loss = 0.00418360
Iteration 20, loss = 0.00383641
Iteration 21, loss = 0.00355549
Iteration 22, loss = 0.00333317
Iteration 23, loss = 0.00314086
Iteration 24, loss = 0.00299098
Iteration 25, loss = 0.00281719
Iteration 26, loss = 0.00270297
Iteration 27, loss = 0.00260466
Iteration 28, loss = 0.00251223
Iteration 29, loss = 0.00240278
Iteration 30, loss = 0.00233341
Iteration 31, loss = 0.00226755
Iteration 32, los

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


F1 : 0.9775%
-- Epoch 1
Norm: 35.91, NNZs: 13302, Bias: 0.415871, T: 5635, Avg. loss: 0.090123
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 30.11, NNZs: 13411, Bias: 0.379324, T: 11270, Avg. loss: 0.033435
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 27.85, NNZs: 13560, Bias: 0.359834, T: 16905, Avg. loss: 0.025530
Total training time: 0.02 seconds.
-- Epoch 4
Norm: 26.66, NNZs: 13581, Bias: 0.336854, T: 22540, Avg. loss: 0.021999
Total training time: 0.03 seconds.
-- Epoch 5
Norm: 26.23, NNZs: 13595, Bias: 0.333231, T: 28175, Avg. loss: 0.019985
Total training time: 0.03 seconds.
F1 : 0.9875%


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [79]:
best_model1 = trained_models[0]
best_model2 = trained_models[2]

In [80]:
TEST_DATA_FILE = 'kaggle_test_data_tab_new.csv.gz'
test_docs = list(load_csv(TEST_DATA_FILE))

21:44:48 INFO:Complete items 00000
21:44:51 INFO:Complete items 01000
21:44:54 INFO:Complete items 02000
21:44:55 INFO:Complete items 03000
21:44:57 INFO:Complete items 04000
21:44:59 INFO:Complete items 05000
21:45:00 INFO:Complete items 06000
21:45:02 INFO:Complete items 07000
21:45:03 INFO:Complete items 08000
21:45:05 INFO:Complete items 09000
21:45:07 INFO:Complete items 10000
21:45:08 INFO:Complete items 11000
21:45:09 INFO:Complete items 12000
21:45:10 INFO:Complete items 13000
21:45:12 INFO:Complete items 14000
21:45:13 INFO:Complete items 15000
21:45:14 INFO:Complete items 16000
21:45:14 INFO:Complete items 16038


In [81]:
from tqdm import tqdm
import csv
with open('my_submission.csv' , 'w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    for item in tqdm(test_docs):
        line = item.words
        line = tf.transform([line])
        prediction = best_model1.predict(line)[0]
        writer.writerow([item[0], prediction])
        
# with open('my_submission2.csv' , 'w') as fout:
#     writer = csv.writer(fout)
#     writer.writerow(['Id','Prediction'])
#     for item in tqdm(test_docs):
#         line = item.words
#         line = tf.transform([line])
#         prediction = best_model2.predict(line)[0]
#         writer.writerow([item[0], prediction])

100%|██████████| 16039/16039 [03:19<00:00, 79.34it/s] 
100%|██████████| 16039/16039 [02:51<00:00, 93.31it/s] 
