In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


## Preprocessing

In [2]:
from hazm import *

def find_puncs(tokens):
    puncs = set()
    for token in tokens:
        if len(token) < 2 and not token.isdigit() and not token.isalpha():
                puncs.add(token)
    return puncs

def delete_tokens(tokens, d_tokens):
    new_tokens = []
    for token in tokens:
        if token not in d_tokens:
            new_tokens.append(token)
    return new_tokens

def find_stopwords(tokens, num):
    stopwords = []
    freq = {}
    for token in tokens:
        if token in freq:
            freq[token] += 1
        else:
            freq[token] = 1
    freq = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
    return [i for i, _ in freq[:num]]

def stem_tokens(tokens):
    stemmer = Stemmer()
    new_tokens = []
    for token in tokens:
        new_tokens.append(stemmer.stem(token))
    return new_tokens

def prepare_text(raw_text, puncs=[], stopwords=[], stem=False):
    # normalize
    normalizer = Normalizer()
    normalized_text = normalizer.normalize(raw_text)
    # replace half-space
    normalized_text = normalized_text.replace('\u200c', ' ')
    # tokenize
    tokens = []
    sents = sent_tokenize(normalized_text)
    for sent in sents:
        tokens = tokens + word_tokenize(sent)
    # normalize numbers
    tokens = [token if not token.isdigit() else '<NUM>' for token in tokens]
    # delete puncs
    deleted_puncs_tokens = delete_tokens(tokens, puncs)
    # delete stopwords
    deleted_stopwords_tokens = delete_tokens(deleted_puncs_tokens, stopwords)
    # stem
    stemmed_tokens = deleted_stopwords_tokens
    if stem:
        stemmed_tokens = stem_tokens(deleted_stopwords_tokens)
    return stemmed_tokens

In [3]:
import csv

def find_puncs_stopwords(path):
    tokens = []
    with open(path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            tokens += prepare_text(" ".join(row[1:]))
    puncs = find_puncs(tokens)
    deleted_puncs_tokens = delete_tokens(tokens, puncs)
    stopwords = find_stopwords(deleted_puncs_tokens, 10)
    return puncs, stopwords

In [4]:
puncs, stopwords = find_puncs_stopwords('train.csv')

In [5]:
labels, texts = [], []
with open('train.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        labels.append(row[0])
        text = " ".join(row[1:])
        text = " ".join(prepare_text(text, puncs, stopwords))
        texts.append(text)
        
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

## Feature extraction

### Count vectors

In [6]:
# split the dataset
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [7]:
count_vect = CountVectorizer(tokenizer=word_tokenize)
count_vect.fit(trainDF['text'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

### TF-IDF

In [8]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(tokenizer=word_tokenize, max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', tokenizer=word_tokenize, ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', tokenizer=word_tokenize, ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

### Word embeding

In [9]:
from gensim.models import KeyedVectors

fa_model = KeyedVectors.load_word2vec_format('wiki.fa.vec')

words = []
for word in fa_model.vocab:
    words.append(word)

print("Number of Tokens: {}".format(len(words)))

Number of Tokens: 420084


In [26]:
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

In [32]:
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = None
    if word in fa_model:
        embedding_vector = fa_model[word]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Train and predict

In [9]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    classifier.fit(feature_vector_train, label)
    
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
        
    return metrics.accuracy_score(predictions, valid_y)

### Naive Bayes

In [11]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8995666666666666
NB, WordLevel TF-IDF:  0.8919333333333334
NB, N-Gram Vectors:  0.7094
NB, CharLevel Vectors:  0.8538


### Logistic Regression

In [12]:
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.9193333333333333
LR, WordLevel TF-IDF:  0.9138
LR, N-Gram Vectors:  0.7321666666666666
LR, CharLevel Vectors:  0.8997666666666667


### SVM

In [11]:
accuracy = train_model(svm.LinearSVC(verbose=True), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LinearSVC, N-Gram Vectors: ", accuracy)

[LibLinear]LinearSVC, N-Gram Vectors:  0.7347666666666667


### Random Forest

In [13]:
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.8460333333333333
RF, WordLevel TF-IDF:  0.8564333333333334


### Extereme Gradient Boosting

In [16]:
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

  if diff:


Xgb, Count Vectors:  0.8109666666666666
Xgb, WordLevel TF-IDF:  0.8138


  if diff:
