In [1]:
import os
from pathlib import Path
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import gc
from tqdm import tqdm
tqdm.pandas()
import time
import re
import string

from nltk.tokenize import wordpunct_tokenize, sent_tokenize
import nltk

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold


In [2]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
#sub = pd.read_csv(Path('../input/sample_submission.csv'))

In [3]:
def extract_features(text):
    bag_of_words = [x for x in wordpunct_tokenize(text)]

    features = []
    #**************************************************************************
    ## Countable/statistical features
    #**************************************************************************
    # Example feature 1: count the number of words
    num_words = len(bag_of_words)
    features.append(num_words)
    
    # words without vowels
    features.append(len([word for word in bag_of_words if not re.search('[aeiou]', word.lower(), re.I)]))
    
    # number of characters including whitespace
    features.append(len(text))

    # number of characters without whitespace
    features.append(len(text.replace(" ", "")))

    # punctuation count
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    features.append(count(text, set(string.punctuation)))

    # number of numbers
    features.append(sum(c.isdigit() for c in text))

    # number of alpha chars
    features.append(sum(c.isalpha() for c in text))

    # number of spaces
    features.append(sum(c.isspace() for c in text))
    
    # Commas per sentence
    features.append(bag_of_words.count(','))    # TODO alterative text.count(",") <- see what is faster

    # Semicolons per sentence
    features.append(bag_of_words.count(';'))

    # Two/three continuous punctuation count
    features.append(len(re.findall('(\!|\?){2,}', text)))

    # number of all caps words
    features.append(sum(1 for word in bag_of_words if word.isupper()))
    
    # number of selfe reference
    features.append(bag_of_words.count("I") + bag_of_words.count("me") + bag_of_words.count("myslef") + bag_of_words.count("my") / num_words)

    # number of small letters 'i' instead of 'I'
    features.append(bag_of_words.count("i"))

    # number of sentences that have no space after a full stop
    features.append(len(re.findall('((\.|\?|\!|\:)\w+)', text)))

    # number of questions
    features.append(len(re.findall('\?', text)))

    # number of exclamation marks
    features.append(len(re.findall('\!', text)))
    
    # question starts with number
    features.append(1 if text[0].isdigit() else 0)
    
    # number of he
    features.append(bag_of_words.count("he") + bag_of_words.count("He"))

    # number of she
    features.append(bag_of_words.count("she") + bag_of_words.count("She"))

    # number of he/she
    features.append(bag_of_words.count("he/she") + bag_of_words.count("He/she"))
    
    #**************************************************************************
    # POS based features
    #**************************************************************************
    pos_tags = [pos_tag[1] for pos_tag in nltk.pos_tag(bag_of_words)]
    # count frequencies for common POS types
    pos_list = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
                'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
                'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
                'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

    counted_pos = []
    for part_of_speech in pos_list:
        counted_pos.append(pos_tags.count(part_of_speech))

    [features.append(i) for i in counted_pos]
    
    # Blacklisted words
    insincere_words = ['penis', 'dick', 'gay', 'fuck', 'sex', 'suck', 'bisexual', 'idiot', 'moron', 'hoe', 'bitch', 'trump', 'putin']
    lower_bag = [word.lower() for word in bag_of_words]
    counted_rude = []
    for rude in insincere_words:
        counted_rude.append(lower_bag.count(rude))

    features.extend(counted_rude)

    return features

In [4]:
# define the training and evaluation loop
def train_and_evaluate__model(model, data_train, labels_train, data_valid, labels_valid, data_test, labels_test):
    start = time.time()
    model.fit(data_train, labels_train)
    end = time.time()
    print('Training time: ', end - start)
    pred_val = model.predict(data_valid)
    #pred_test = model.predict(data_test)
    
    recall = sklearn.metrics.recall_score(labels_valid, pred_val)
    print("Recall validation: %f" % recall)
    #recall = sklearn.metrics.recall_score(labels_test, pred_test)
    #print("Recall test: %f" % recall)

    precision = sklearn.metrics.precision_score(labels_valid, pred_val)
    print("Precision validation: %f" % precision)
    #precision = sklearn.metrics.precision_score(labels_test, pred_test)
    #print("Precision test: %f" % precision)

    f1_score = sklearn.metrics.f1_score(labels_valid, pred_val)
    print("F1-score validation: %f" % f1_score)
    #f1_score = sklearn.metrics.f1_score(labels_test, pred_test)
    #print("F1-score test: %f" % f1_score)
    return f1_score

In [5]:
# split the dataset into training and validation datasets 
print("Train/test split")
train_x, test_x, train_y, test_y = model_selection.train_test_split(train['question_text'], train['target'], test_size=0.1, random_state=4)

Train/test split


In [6]:
# extract the features
train_features = list(map(extract_features, tqdm(list(train_x))))
test_features = list(map(extract_features, tqdm(list(test_x))))

100%|███████████████████████████████████████████████████████████████████████| 1175509/1175509 [30:08<00:00, 650.10it/s]
100%|█████████████████████████████████████████████████████████████████████████| 130613/130613 [03:19<00:00, 655.87it/s]


In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# create the classifier
classifier = naive_bayes.MultinomialNB()
#classifier = ensemble.RandomForestClassifier(max_depth=5, n_estimators=50, max_features='auto')

# Fit to data and predict using pipelined scaling, GNB and PCA.
#classifier = make_pipeline(StandardScaler(), PCA(n_components='mle'), naive_bayes.MultinomialNB())

In [8]:
# characters level tf-idf
#tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
#tfidf_vect_ngram_chars.fit(list(train_x))
#xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
#xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 
#xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test['question_text'])

#for i, row in enumerate(xtrain_tfidf_ngram_chars):
#    train_features[i].append(row)
    
#for i, row in enumerate(xvalid_tfidf_ngram_chars):
#    test_features[i].append(row)
    

In [9]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

n_folds = 10
data = np.asarray(train_features)
labels = np.asarray(train_y)
skf = StratifiedKFold(n_splits=n_folds, shuffle=True)
f1_scores = []
for i, (train, valid) in enumerate(skf.split(data, labels)):
    print("Running Fold", i+1, "/", n_folds)
    # oversampling of insincere questions
    # ros = RandomOverSampler(random_state=1)
    # train_data, train_labels = ros.fit_resample(data[train], labels[train])
    train_data = data[train]
    train_labels = labels[train]
    
    validation_data = data[valid]
    validation_labels = labels[valid]
    
    # K.clear_session()
    # model = None # Clearing the NN.
    model = classifier
    f1_score = train_and_evaluate__model(model, train_data, train_labels, validation_data, validation_labels, test_features, np.asarray(test_y))
    f1_scores.append(f1_score)
    del model; gc.collect()

print('Mean F1 score is {} SD is {}'.format(np.mean(f1_score), np.std(f1_score)))


Running Fold 1 / 10
Training time:  0.7169930934906006
Recall validation: 0.267850
Precision validation: 0.412850
F1-score validation: 0.324906
Running Fold 2 / 10
Training time:  0.7339897155761719
Recall validation: 0.267437
Precision validation: 0.412827
F1-score validation: 0.324595
Running Fold 3 / 10
Training time:  0.7010154724121094
Recall validation: 0.268262
Precision validation: 0.413486
F1-score validation: 0.325407
Running Fold 4 / 10
Training time:  0.7459926605224609
Recall validation: 0.275554
Precision validation: 0.419213
F1-score validation: 0.332531
Running Fold 5 / 10
Training time:  0.705939531326294
Recall validation: 0.257257
Precision validation: 0.397281
F1-score validation: 0.312291
Running Fold 6 / 10
Training time:  0.7189927101135254
Recall validation: 0.260146
Precision validation: 0.402597
F1-score validation: 0.316062
Running Fold 7 / 10
Training time:  0.7199928760528564
Recall validation: 0.270739
Precision validation: 0.410513
F1-score validation: 0.