In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_files
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument
import re
import nltk as nl
import gensim
nl.download('stopwords')
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.metrics import classification_report
from matplotlib.colors import ListedColormap as lcm
import inspect, re
porter_stemmer = PorterStemmer()
import multiprocessing


[nltk_data] Downloading package stopwords to /home/saad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def load_data(path):
    """Read a labelled text corpus from a directory tree.

    The directory is handed to sklearn's ``load_files``; the ``'data'`` and
    ``'target'`` entries of what it returns are passed back to the caller.

    Parameters
    ----------
    path : str
        Root folder of the corpus.

    Returns
    -------
    tuple
        ``(documents, labels)`` taken from the ``'data'`` and ``'target'`` entries.
    """
    bunch = load_files(path)
    texts = bunch['data']
    labels = bunch['target']
    return texts, labels

def preprocess(docs):
    """Turn raw documents into lists of stemmed tokens.

    Three passes are applied to the whole collection, in this order:
    stop-word removal, tokenization, then stemming of every token.

    Parameters
    ----------
    docs : iterable
        Raw documents, one entry per document.

    Returns
    -------
    list of list
        One list of stemmed tokens per input document.
    """
    # Pass 1: drop stop words from every document
    without_stopwords = list(map(remove_stopwords, docs))

    # Pass 2: split every document into tokens
    tokenized = list(map(gensim.utils.simple_preprocess, without_stopwords))

    # Pass 3: stem each token with the shared Porter stemmer
    stemmed = []
    for tokens in tokenized:
        stemmed.append([porter_stemmer.stem(token) for token in tokens])

    return stemmed


def tfidf_vectorization(docs, decode):
    """Convert tokenized documents into TF-IDF vectors.

    Parameters
    ----------
    docs : iterable of list of str
        Tokenized documents; each one is re-joined into a single
        space-separated string because the vectorizer works on strings,
        not on lists of words.
    decode : bool
        ``True``  -> fit a new vectorizer on ``docs`` and transform them.
        ``False`` -> transform ``docs`` with the vectorizer fitted by the most
        recent ``decode=True`` call.

    Returns
    -------
    The matrix returned by the vectorizer's ``fit_transform`` / ``transform``.

    Notes
    -----
    The fitted vectorizer is kept on the function object
    (``tfidf_vectorization.fitted_vectorizer``). Previously a brand-new
    vectorizer was created on every call, so the ``decode=False`` branch always
    called ``transform`` on an unfitted object and could never succeed.
    """
    # The vectorizer expects one string per document
    joined_docs = [' '.join(doc) for doc in docs]

    if decode:
        tfidf_representer = TfidfVectorizer(stop_words = nl.corpus.stopwords.words('english'))
        vectors = tfidf_representer.fit_transform(joined_docs)
        # Remember the fitted vocabulary/IDF weights for later decode=False calls
        tfidf_vectorization.fitted_vectorizer = tfidf_representer
        return vectors

    tfidf_representer = getattr(tfidf_vectorization, 'fitted_vectorizer', None)
    if tfidf_representer is None:
        # Nothing has been fitted yet: fall back to a fresh vectorizer so the
        # caller gets the same "not fitted" error from transform() as before.
        tfidf_representer = TfidfVectorizer(stop_words = nl.corpus.stopwords.words('english'))
    return tfidf_representer.transform(joined_docs)

def build_evaluate_logistic_model(X_train, y_train, X_test, y_test, solver, dual, C, max_iter=1000):
    """Fit a logistic regression classifier and print its train/test accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    solver : solver name forwarded to ``LogisticRegression``.
    dual : dual/primal flag forwarded to ``LogisticRegression``.
    C : value forwarded as ``C`` to ``LogisticRegression``.
    max_iter : iteration cap forwarded to ``LogisticRegression`` (default 1000).

    Returns
    -------
    None. The two accuracies are only printed.
    """
    # Stopping tolerance is fixed at 1e-3 for every run
    classifier = LogisticRegression(C=C, dual=dual, solver=solver, max_iter=max_iter, tol=1e-3)
    classifier.fit(X_train, y_train)

    train_predictions = classifier.predict(X_train)
    test_predictions = classifier.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_predictions)
    print("Train Accuracy : ", train_accuracy)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print("Test  Accuracy : ", test_accuracy, '\n')

def build_evaluate_SVM_model(X_train, y_train, X_test, y_test, kernel):
    """Fit an ``svm.SVC`` classifier and print its train/test accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    kernel : kernel name forwarded to ``svm.SVC``.

    Returns
    -------
    None. The two accuracies are only printed.
    """
    # Fixed: the original referenced the misspelled name `kernal`, which raised
    # NameError on every call.
    model = svm.SVC(kernel=kernel)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    print("Train Accuracy : ",accuracy_score(y_train,y_pred_train))
    print("Test  Accuracy : ",accuracy_score(y_test,y_pred_test),'\n')
    return

def build_evaluate_LinearSVM_model(X_train, y_train, X_test, y_test, tol , C, max_iter=10000):
    """Fit an ``svm.LinearSVC`` classifier and print its train/test accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    tol : stopping tolerance forwarded to ``svm.LinearSVC``.
    C : value forwarded as ``C`` to ``svm.LinearSVC``.
    max_iter : iteration cap forwarded to ``svm.LinearSVC`` (default 10000).

    Returns
    -------
    None. The two accuracies are only printed.
    """
    # Fixed: the caller's `tol` is now honoured. The original hard-coded
    # tol=1e-3, so runs that differed only in `tol` trained identical models.
    model = svm.LinearSVC(tol = tol, C = C, max_iter=max_iter)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    print("Train Accuracy : ",accuracy_score(y_train,y_pred_train))
    print("Test  Accuracy : ",accuracy_score(y_test,y_pred_test),'\n')
    return

def word2vec_vectorization(doc_words, model):
    """Represent a document as the mean of its word vectors.

    Parameters
    ----------
    doc_words : sequence of str
        Tokens of one document.
    model : object exposing ``model.wv[word]`` lookups and ``model.wv.vector_size``
        (e.g. a trained gensim Word2Vec model).

    Returns
    -------
    numpy.ndarray
        1-D vector: the element-wise mean of the vectors of all tokens found in
        the vocabulary. If no token is found (including an empty document), a
        zero vector of length ``model.wv.vector_size`` is returned instead of
        the scalar NaN that ``np.mean`` yields on an empty list.
    """
    word_vectors = []
    for word in doc_words:
        try:
            word_vectors.append(model.wv[word])
        except KeyError:    # Ignore, if the word doesn't exist in the vocabulary
            continue

    if not word_vectors:
        # Keep the output shape consistent so the per-document vectors can
        # still be stacked into a 2-D feature matrix downstream.
        return np.zeros(model.wv.vector_size)

    return np.mean(word_vectors, axis=0)


# Step 1
# Load the files and preprocess them

In [5]:
# Load the raw documents and their labels from the "txt_sentoken" directory
X, y = load_data("txt_sentoken")

# Preprocessing (stop-word removal, tokenization, stemming - see preprocess).
# From here on X holds one list of tokens per document instead of raw text.
X = preprocess(X)

# The doc2vec model requires the data to be TaggedDocument objects before training on it;
# each document is tagged with its position in X.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X)]

# Step 2
# Word Embedding models training
- Train the doc2vec, word2vec skipgram, word2vec CBOW models
- Note: You need to run this cell only once on your computer
- It will take some time...

In [22]:
# Train a doc2vec model on the tagged documents (100-dimensional vectors, window of 3,
# 40 epochs, no minimum word count, one worker per CPU core) and save it to disk.
# NOTE(review): no seed is passed and several workers are used, so retraining presumably
# does not reproduce identical vectors - confirm if reproducibility matters.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=100, min_count=1, epochs=40, workers=multiprocessing.cpu_count(), window=3)
doc2vec_model.save("doc2vec_100_1_40_3")

# Train the word2vec skip-gram model (sg=1) on the token lists and save it.
# NOTE(review): `size` and `iter` look like the pre-4.0 gensim keyword names
# (later renamed to `vector_size` / `epochs`) - confirm against the installed version.
word2vec_skipgram_model = gensim.models.Word2Vec(sentences=X, size=100, window=3, sg=1, min_count=1, workers=multiprocessing.cpu_count(), iter=40)
word2vec_skipgram_model.save("skipgram_100_3_1_40.model")

# Train the word2vec CBOW model (sg=0) with otherwise identical settings and save it.
word2vec_CBOW_model = gensim.models.Word2Vec(sentences=X, size=100, window=3, sg=0, min_count=1, workers=multiprocessing.cpu_count(), iter=40)
word2vec_CBOW_model.save("CBOW_100_3_1_40.model")

# Step 3
# Load the trained Word Embedding Models

In [8]:
# Load the doc2vec model saved by the training cell
doc2vec_model = gensim.models.Doc2Vec.load("doc2vec_100_1_40_3")

# Load the word2vec skip-gram model
word2vec_skipgram_model = gensim.models.Word2Vec.load("skipgram_100_3_1_40.model")

# Load the word2vec CBOW model
word2vec_CBOW_model = gensim.models.Word2Vec.load("CBOW_100_3_1_40.model")

# Step 4
# Vectorization
- It will take some time...

In [9]:
# TF-IDF vectorization (decode=True -> the vectorizer is fitted on these documents).
# NOTE(review): the fit uses every document in X, i.e. before the train/test split
# performed in a later cell, so held-out documents contribute to the TF-IDF statistics.
X_tfidf = tfidf_vectorization(X, True)

# doc2vec vectorization: infer one vector per tokenized document
X_doc2vec = [doc2vec_model.infer_vector(doc) for doc in X]

# word2vec skip-gram vectorization: mean of the word vectors of each document
X_word2vec_skipgram = [word2vec_vectorization(doc, word2vec_skipgram_model) for doc in X]

# word2vec CBOW vectorization: mean of the word vectors of each document
X_word2vec_CBOW = [word2vec_vectorization(doc, word2vec_CBOW_model) for doc in X]

# Step 5
# Train Test Split

In [10]:
# All four representations are split with identical settings:
# 20% held out, fixed random seed, class proportions preserved via stratification.
split_options = dict(test_size=0.2, random_state=0, stratify=y)

# TF-IDF features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y, **split_options
)
unique, counts = np.unique(y_train_tfidf, return_counts=True)

# doc2vec features
X_train_doc2vec, X_test_doc2vec, y_train_doc2vec, y_test_doc2vec = train_test_split(
    X_doc2vec, y, **split_options
)
unique, counts = np.unique(y_train_doc2vec, return_counts=True)

# word2vec skip-gram features
(X_train_word2vec_skipgram, X_test_word2vec_skipgram,
 y_train_word2vec_skipgram, y_test_word2vec_skipgram) = train_test_split(
    X_word2vec_skipgram, y, **split_options
)
unique, counts = np.unique(y_train_word2vec_skipgram, return_counts=True)

# word2vec CBOW features
(X_train_word2vec_CBOW, X_test_word2vec_CBOW,
 y_train_word2vec_CBOW, y_test_word2vec_CBOW) = train_test_split(
    X_word2vec_CBOW, y, **split_options
)
unique, counts = np.unique(y_train_word2vec_CBOW, return_counts=True)

# Step 6
# Vectors Normalization

In [11]:
scaler = StandardScaler()

# TF-IDF vectors are not rescaled here: the TF-IDF vectorization step already normalizes them.

# For each embedding the scaler statistics are learned on the training split only
# and then applied unchanged to the test split.

# doc2vec vectors
X_train_doc2vec = scaler.fit_transform(X_train_doc2vec)
X_test_doc2vec = scaler.transform(X_test_doc2vec)

# word2vec skip-gram vectors
X_train_word2vec_skipgram = scaler.fit_transform(X_train_word2vec_skipgram)
X_test_word2vec_skipgram = scaler.transform(X_test_word2vec_skipgram)

# word2vec CBOW vectors
X_train_word2vec_CBOW = scaler.fit_transform(X_train_word2vec_CBOW)
X_test_word2vec_CBOW = scaler.transform(X_test_word2vec_CBOW)

# Step 7
# Classification

In [13]:

# Classifier comparison on the TF-IDF features: every logistic-regression solver is run
# with the same list of C values, followed by a set of LinearSVC (tol, C) combinations.
print('\n------------------ TF-IDF vectors Classification ------------------------\n')


print('\n---------------------- Logistic Regression ------------------------------')

# C values tried with every solver, in reporting order
logistic_C_values = (1, 0.9, 0.5, 1e-18, 0.2)

# (section banner, solver name, dual flag) - only liblinear is run with dual=True
logistic_runs = (
    ('\n--------------------------- liblinear --------------------------------\n', 'liblinear', True),
    ('\n--------------------------- lbfgs --------------------------------\n', 'lbfgs', False),
    ('\n--------------------------- saga --------------------------------\n', 'saga', False),
    ('\n--------------------------- newton-cg --------------------------------\n', 'newton-cg', False),
)

for banner, solver_name, use_dual in logistic_runs:
    print(banner)
    for C_value in logistic_C_values:
        build_evaluate_logistic_model(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, solver_name, use_dual, C_value)


print('\n----------------------- Linear SVM --------------------------------')

# (tol, C) combinations, in reporting order
linear_svm_runs = (
    (1e-5, 1),
    (1e-5, 0.9),
    (1e-5, 0.5),
    (1e-5, 1e-18),
    (1e-5, 0.2),
    (1e-3, 1),
    (1e-1, 0.1),
)

for tol_value, C_value in linear_svm_runs:
    build_evaluate_LinearSVM_model(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, tol = tol_value, C = C_value)


------------------ TF-IDF vectors Classification ------------------------


---------------------- Logistic Regression ------------------------------

--------------------------- liblinear --------------------------------

Train Accuracy :  0.97625
Test  Accuracy :  0.795 

Train Accuracy :  0.9725
Test  Accuracy :  0.7975 

Train Accuracy :  0.958125
Test  Accuracy :  0.785 

Train Accuracy :  0.91
Test  Accuracy :  0.7575 

Train Accuracy :  0.933125
Test  Accuracy :  0.7875 


--------------------------- lbfgs --------------------------------

Train Accuracy :  0.97625
Test  Accuracy :  0.795 

Train Accuracy :  0.9725
Test  Accuracy :  0.7975 

Train Accuracy :  0.958125
Test  Accuracy :  0.785 

Train Accuracy :  0.5
Test  Accuracy :  0.5 

Train Accuracy :  0.933125
Test  Accuracy :  0.7875 


--------------------------- saga --------------------------------

Train Accuracy :  0.97625
Test  Accuracy :  0.7975 

Train Accuracy :  0.973125
Test  Accuracy :  0.7975 

Train Accuracy

In [14]:

# Classifier comparison on the doc2vec features: every logistic-regression solver is run
# with the same list of C values, followed by a set of LinearSVC (tol, C) combinations.
print('\n------------------ doc2vec vectors Classification ------------------------\n')


print('\n---------------------- Logistic Regression ------------------------------')

# C values tried with every solver, in reporting order
logistic_C_values = (1, 0.9, 0.5, 1e-18, 0.2)

# (section banner, solver name, dual flag) - only liblinear is run with dual=True
logistic_runs = (
    ('\n--------------------------- liblinear --------------------------------\n', 'liblinear', True),
    ('\n--------------------------- lbfgs --------------------------------\n', 'lbfgs', False),
    ('\n--------------------------- saga --------------------------------\n', 'saga', False),
    ('\n--------------------------- newton-cg --------------------------------\n', 'newton-cg', False),
)

for banner, solver_name, use_dual in logistic_runs:
    print(banner)
    for C_value in logistic_C_values:
        build_evaluate_logistic_model(X_train_doc2vec, y_train_doc2vec, X_test_doc2vec, y_test_doc2vec, solver_name, use_dual, C_value)


print('\n----------------------- Linear SVM --------------------------------')

# (tol, C) combinations, in reporting order
linear_svm_runs = (
    (1e-5, 1),
    (1e-5, 0.9),
    (1e-5, 0.5),
    (1e-5, 1e-18),
    (1e-5, 0.2),
    (1e-3, 1),
    (1e-1, 0.1),
)

for tol_value, C_value in linear_svm_runs:
    build_evaluate_LinearSVM_model(X_train_doc2vec, y_train_doc2vec, X_test_doc2vec, y_test_doc2vec, tol = tol_value, C = C_value)



------------------ doc2vec vectors Classification ------------------------


---------------------- Logistic Regression ------------------------------

--------------------------- liblinear --------------------------------

Train Accuracy :  0.87
Test  Accuracy :  0.795 

Train Accuracy :  0.87
Test  Accuracy :  0.795 

Train Accuracy :  0.869375
Test  Accuracy :  0.795 

Train Accuracy :  0.798125
Test  Accuracy :  0.7525 

Train Accuracy :  0.871875
Test  Accuracy :  0.7925 


--------------------------- lbfgs --------------------------------

Train Accuracy :  0.870625
Test  Accuracy :  0.795 

Train Accuracy :  0.87
Test  Accuracy :  0.795 

Train Accuracy :  0.869375
Test  Accuracy :  0.795 

Train Accuracy :  0.798125
Test  Accuracy :  0.7525 

Train Accuracy :  0.871875
Test  Accuracy :  0.7925 


--------------------------- saga --------------------------------

Train Accuracy :  0.870625
Test  Accuracy :  0.795 

Train Accuracy :  0.87
Test  Accuracy :  0.795 

Train Accuracy

In [15]:

# Classifier comparison on the word2vec skip-gram features: every logistic-regression solver
# is run with the same list of C values, followed by a set of LinearSVC (tol, C) combinations.
print('\n------------------ word2vec skipgram vectors Classification ------------------------\n')


print('\n------------------------- Logistic Regression ------------------------------')

# C values tried with every solver, in reporting order
logistic_C_values = (1, 0.9, 0.5, 1e-18, 0.2)

# (section banner, solver name, dual flag) - only liblinear is run with dual=True
logistic_runs = (
    ('\n----------------------------- liblinear --------------------------------\n', 'liblinear', True),
    ('\n--------------------------- lbfgs --------------------------------\n', 'lbfgs', False),
    ('\n--------------------------- saga --------------------------------\n', 'saga', False),
    ('\n--------------------------- newton-cg --------------------------------\n', 'newton-cg', False),
)

for banner, solver_name, use_dual in logistic_runs:
    print(banner)
    for C_value in logistic_C_values:
        build_evaluate_logistic_model(X_train_word2vec_skipgram, y_train_word2vec_skipgram, X_test_word2vec_skipgram, y_test_word2vec_skipgram, solver_name, use_dual, C_value)


print('\n----------------------- Linear SVM --------------------------------')

# (tol, C) combinations, in reporting order
linear_svm_runs = (
    (1e-5, 1),
    (1e-5, 0.9),
    (1e-5, 0.5),
    (1e-5, 1e-18),
    (1e-5, 0.2),
    (1e-3, 1),
    (1e-1, 0.1),
)

for tol_value, C_value in linear_svm_runs:
    build_evaluate_LinearSVM_model(X_train_word2vec_skipgram, y_train_word2vec_skipgram, X_test_word2vec_skipgram, y_test_word2vec_skipgram, tol = tol_value, C = C_value)



------------------ word2vec skipgram vectors Classification ------------------------


------------------------- Logistic Regression ------------------------------

----------------------------- liblinear --------------------------------

Train Accuracy :  0.841875
Test  Accuracy :  0.76 

Train Accuracy :  0.841875
Test  Accuracy :  0.76 

Train Accuracy :  0.8425
Test  Accuracy :  0.7625 

Train Accuracy :  0.760625
Test  Accuracy :  0.6975 

Train Accuracy :  0.844375
Test  Accuracy :  0.7725 


--------------------------- lbfgs --------------------------------

Train Accuracy :  0.841875
Test  Accuracy :  0.76 

Train Accuracy :  0.841875
Test  Accuracy :  0.76 

Train Accuracy :  0.8425
Test  Accuracy :  0.7625 

Train Accuracy :  0.760625
Test  Accuracy :  0.6975 

Train Accuracy :  0.844375
Test  Accuracy :  0.7725 


--------------------------- saga --------------------------------

Train Accuracy :  0.841875
Test  Accuracy :  0.76 

Train Accuracy :  0.841875
Test  Accuracy :

In [16]:

# Classifier comparison on the word2vec CBOW features: every logistic-regression solver
# is run with the same list of C values, followed by a set of LinearSVC (tol, C) combinations.
print('\n------------------ word2vec CBOW vectors Classification ------------------------\n')


print('\n---------------------- Logistic Regression ------------------------------')

# C values tried with every solver, in reporting order
logistic_C_values = (1, 0.9, 0.5, 1e-18, 0.2)

# (section banner, solver name, dual flag) - only liblinear is run with dual=True
logistic_runs = (
    ('\n--------------------------- liblinear --------------------------------\n', 'liblinear', True),
    ('\n--------------------------- lbfgs --------------------------------\n', 'lbfgs', False),
    ('\n--------------------------- saga --------------------------------\n', 'saga', False),
    ('\n--------------------------- newton-cg --------------------------------\n', 'newton-cg', False),
)

for banner, solver_name, use_dual in logistic_runs:
    print(banner)
    for C_value in logistic_C_values:
        build_evaluate_logistic_model(X_train_word2vec_CBOW, y_train_word2vec_CBOW, X_test_word2vec_CBOW, y_test_word2vec_CBOW, solver_name, use_dual, C_value)


print('\n-------------------------- Linear SVM --------------------------------')

# (tol, C) combinations, in reporting order
linear_svm_runs = (
    (1e-5, 1),
    (1e-5, 0.9),
    (1e-5, 0.5),
    (1e-5, 1e-18),
    (1e-5, 0.2),
    (1e-3, 1),
    (1e-1, 0.1),
)

for tol_value, C_value in linear_svm_runs:
    build_evaluate_LinearSVM_model(X_train_word2vec_CBOW, y_train_word2vec_CBOW, X_test_word2vec_CBOW, y_test_word2vec_CBOW, tol = tol_value, C = C_value)



------------------ word2vec CBOW vectors Classification ------------------------


---------------------- Logistic Regression ------------------------------

--------------------------- liblinear --------------------------------

Train Accuracy :  0.835
Test  Accuracy :  0.7775 

Train Accuracy :  0.834375
Test  Accuracy :  0.7775 

Train Accuracy :  0.835625
Test  Accuracy :  0.7775 

Train Accuracy :  0.724375
Test  Accuracy :  0.67 

Train Accuracy :  0.8325
Test  Accuracy :  0.78 


--------------------------- lbfgs --------------------------------

Train Accuracy :  0.835
Test  Accuracy :  0.7775 

Train Accuracy :  0.834375
Test  Accuracy :  0.7775 

Train Accuracy :  0.835625
Test  Accuracy :  0.7775 

Train Accuracy :  0.724375
Test  Accuracy :  0.67 

Train Accuracy :  0.8325
Test  Accuracy :  0.78 


--------------------------- saga --------------------------------

Train Accuracy :  0.834375
Test  Accuracy :  0.7775 

Train Accuracy :  0.834375
Test  Accuracy :  0.7775 

Tr

# APPENDIX

In [59]:
svm_model1 = svm.SVC(kernel='linear')

# build_evaluate_SVM_model takes the kernel name as a required fifth argument and builds
# its own SVC, so the kernel is passed explicitly (the call previously omitted it, which
# raises TypeError). 'linear' matches the model constructed above.
# NOTE(review): X_train / y_train / X_test / y_test are not defined in the cells shown in
# this notebook - presumably one of the per-representation splits was intended; confirm.
build_evaluate_SVM_model(X_train, y_train, X_test, y_test, 'linear')

Train Accuracy :  0.99375
Test  Accuracy :  0.815 



In [None]:
#ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
#https://github.com/scikit-learn/scikit-learn/issues/11536

#https://medium.com/swlh/sentiment-classification-for-reviews-using-doc2vec-660ba594c336#_=_