In [None]:
from data import Data 
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt
import json

%matplotlib inline


def classify(train_x, train_y, test_x, test_y, classifier_params):
    
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    est = DecisionTreeClassifier(**classifier_params)

    #plot_learning_curve(est, "Decision Tree Classifier", train_x, train_y, cv=cv)
    #plt.show()

    est = est.fit(X=train_x, y = train_y)
    pred_y = est.predict(test_x)
    print(classification_report(pred_y, test_y))

In [1]:
EXPERIMENTS = [
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
]

In [None]:
for param_set in EXPERIMENTS:
    print("-" * 15)
    print(json.dumps(param_set))
    print("--")
    data = Data(**param_set["data"])
    vectorizer = getattr(data, param_set["vectorizer_func"])
    train_x, train_y, test_x, test_y = vectorizer(**param_set["vectorizer"])
    classify(train_x, train_y, test_x, test_y, param_set["estimator"])
    print("-" * 15)

---------------
{"data": {"normalized": true}, "vectorizer": {"ngram_range": [1, 1]}, "estimator": {}, "vectorizer_func": "count_vectorized"}
--
             precision    recall  f1-score   support

   NEGATIVE       0.78      0.63      0.70       752
       NOTR       0.22      0.38      0.28       164
   POSITIVE       0.56      0.60      0.58       283

avg / total       0.65      0.59      0.61      1199

---------------
---------------
{"data": {"normalized": true}, "vectorizer": {"ngram_range": [2, 2]}, "estimator": {}, "vectorizer_func": "count_vectorized"}
--
             precision    recall  f1-score   support

   NEGATIVE       0.34      0.66      0.45       311
       NOTR       0.12      0.58      0.20        62
   POSITIVE       0.84      0.31      0.45       826

avg / total       0.67      0.41      0.44      1199

---------------
---------------
{"data": {"normalized": true}, "vectorizer": {"ngram_range": [1, 1]}, "estimator": {}, "vectorizer_func": "tfidf_vectorized"}
