In [1]:
import sklearn
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sb

In [2]:
def read_parse_data():
    data = pd.read_csv('dataset/features.csv', header=None, sep='¡')
    return data

In [85]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def journal_clasificator_SVM(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                            alpha=1e-3, random_state=42,
                            max_iter=5, tol=None)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction SVM: ', np.mean(predicted == y_test))



In [23]:

from sklearn.neural_network import MLPClassifier


def journal_clasificator_MLPClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MLPClassifier(alpha=1, max_iter=1000)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction MLPClassifier: ', np.mean(predicted == y_test))

In [24]:

from sklearn.neighbors import KNeighborsClassifier


def journal_clasificator_KNeighborsClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier(3)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction KNeighborsClassifier: ', np.mean(predicted == y_test))


In [36]:
from sklearn.svm import SVC
    
def journal_clasificator_SVC(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(gamma=2, C=1)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction SVC: ', np.mean(predicted == y_test))

In [26]:

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF


def journal_clasificator_GaussianProcessClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction GaussianProcessClassifier: ', np.mean(predicted == y_test))

In [27]:

from sklearn.tree import DecisionTreeClassifier


def journal_clasificator_DecisionTreeClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier(max_depth=5)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction DecisionTreeClassifier: ', np.mean(predicted == y_test))

In [28]:

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


def journal_clasificator_RandomForestClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction RandomForestClassifier: ', np.mean(predicted == y_test))

def journal_clasificator_AdaBoostClassifier(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', AdaBoostClassifier()),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction AdaBoostClassifier: ', np.mean(predicted == y_test))



In [29]:

from sklearn.naive_bayes import GaussianNB

def journal_clasificator_GaussianNB(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', GaussianNB()),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction GaussianNB: ', np.mean(predicted == y_test))

In [76]:

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def journal_clasificator_QuadraticDiscriminantAnalysis(X_train, X_test, y_train, y_test):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', QuadraticDiscriminantAnalysis()),
    ])

    text_clf.fit(np.array(X_train), np.array(y_train))
    predicted = text_clf.predict(X_test)
    print('mean prediction QuadraticDiscriminantAnalysis: ', np.mean(predicted == y_test))

In [31]:
from sklearn.naive_bayes import MultinomialNB

def journal_clasificator_NB(X_train, X_test, y_train, y_test):

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])

    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print('mean prediction naive bayes: ', np.mean(predicted == y_test))


In [14]:
from sklearn import metrics

def print_mretics(predicted, test):
    print(metrics.classification_report(test.target, predicted, target_names=test.target_names))

In [83]:
from sklearn.model_selection import train_test_split

def main():
    data = read_parse_data()

    target = data.iloc[:, 0]
    text = data.iloc[:, 1]

    X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.33, random_state=42) 

    X_train, X_test, y_train, y_test = np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

    journal_clasificator_NB(X_train, X_test, y_train, y_test)
    
    journal_clasificator_SVM(X_train, X_test, y_train, y_test)

    journal_clasificator_SVC(X_train, X_test, y_train, y_test)

    # journal_clasificator_QuadraticDiscriminantAnalysis(X_train, X_test, y_train, y_test)

    # journal_clasificator_GaussianNB(X_train, X_test, y_train, y_test)

    journal_clasificator_AdaBoostClassifier(X_train, X_test, y_train, y_test)

    journal_clasificator_RandomForestClassifier(X_train, X_test, y_train, y_test)

    journal_clasificator_DecisionTreeClassifier(X_train, X_test, y_train, y_test)

    # journal_clasificator_GaussianProcessClassifier(X_train, X_test, y_train, y_test)

    journal_clasificator_KNeighborsClassifier(X_train, X_test, y_train, y_test)

    journal_clasificator_MLPClassifier(X_train, X_test, y_train, y_test)
    
    
    print("done")


In [86]:

if __name__ == '__main__':
    main()

  


mean prediction naive bayes:  0.7058823529411765
mean prediction SVM:  0.9215686274509803
mean prediction SVC:  0.6862745098039216
mean prediction AdaBoostClassifier:  0.7450980392156863
mean prediction RandomForestClassifier:  0.6862745098039216
mean prediction DecisionTreeClassifier:  0.7647058823529411
mean prediction KNeighborsClassifier:  0.9411764705882353
mean prediction MLPClassifier:  0.8627450980392157
done
