In [16]:
from data import Data 
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import ShuffleSplit
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt
import json

%matplotlib inline

data = Data()

def classify(train_x, train_y, test_x, test_y, classifier_params):
    
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    est = GaussianNB(**classifier_params)

    #plot_learning_curve(est, "Naive Bayes", train_x, train_y, cv=cv)
   # plt.show()

    est = est.fit(X=train_x, y = train_y)
    pred_y = est.predict(test_x)
    print(classification_report(pred_y, test_y))

In [17]:
EXPERIMENTS = [
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
]

In [18]:
for param_set in EXPERIMENTS:
    print("-" * 15)
    print(json.dumps(param_set))
    print("--")
    data = Data(**param_set["data"])
    vectorizer = getattr(data, param_set["vectorizer_func"])
    train_x, train_y, test_x, test_y = vectorizer(**param_set["vectorizer"])
    classify(train_x, train_y, test_x, test_y, param_set["estimator"])
    print("-" * 15)

---------------
{"vectorizer": {"ngram_range": [1, 1]}, "estimator": {}, "data": {"normalized": true}, "vectorizer_func": "count_vectorized"}
--
             precision    recall  f1-score   support

   NEGATIVE       0.49      0.63      0.55       474
       NOTR       0.26      0.27      0.26       277
   POSITIVE       0.56      0.38      0.45       448

avg / total       0.46      0.45      0.45      1199

---------------
---------------
{"vectorizer": {"ngram_range": [2, 2]}, "estimator": {}, "data": {"normalized": true}, "vectorizer_func": "count_vectorized"}
--
             precision    recall  f1-score   support

   NEGATIVE       0.38      0.71      0.49       321
       NOTR       0.67      0.28      0.40       688
   POSITIVE       0.35      0.55      0.43       190

avg / total       0.54      0.44      0.43      1199

---------------
---------------
{"vectorizer": {"ngram_range": [1, 1]}, "estimator": {}, "data": {"normalized": true}, "vectorizer_func": "tfidf_vectorized"}
