In [6]:
from data import Data 
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ShuffleSplit
from plot_learning_curve import plot_learning_curve
import matplotlib.pyplot as plt
import json

%matplotlib inline

def classify(train_x, train_y, test_x, test_y, classifier_params):
    
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    est = SGDClassifier(loss="hinge", penalty="l2", max_iter=100, **classifier_params)

    #plot_learning_curve(est, "Stochastic Gradient Descent", train_x, train_y, cv=cv)
    #plt.show()
    
    est = est.fit(X=train_x, y = train_y)
    pred_y = est.predict(test_x)
    print(classification_report(pred_y, test_y))

In [7]:
EXPERIMENTS = [
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "count_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (1, 1)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
{
    "vectorizer_func": "tfidf_vectorized",
    "vectorizer": {
        "ngram_range": (2, 2)
    },
    "data": {
        "normalized": True
    },
    "estimator": {
    }
},
]

In [9]:
for param_set in EXPERIMENTS:
    print("-" * 15)
    print(json.dumps(param_set))
    print("--")
    data = Data(**param_set["data"])
    vectorizer = getattr(data, param_set["vectorizer_func"])
    train_x, train_y, test_x, test_y = vectorizer(**param_set["vectorizer"])
    classify(train_x, train_y, test_x, test_y, param_set["estimator"])
    print("-" * 15)

---------------
{"estimator": {}, "vectorizer_func": "count_vectorized", "data": {"normalized": true}, "vectorizer": {"ngram_range": [1, 1]}}
--
             precision    recall  f1-score   support

   NEGATIVE       0.75      0.69      0.72       657
       NOTR       0.32      0.41      0.36       222
   POSITIVE       0.61      0.58      0.60       320

avg / total       0.63      0.61      0.62      1199

---------------
---------------
{"estimator": {}, "vectorizer_func": "count_vectorized", "data": {"normalized": true}, "vectorizer": {"ngram_range": [2, 2]}}
--
             precision    recall  f1-score   support

   NEGATIVE       0.91      0.62      0.73       892
       NOTR       0.10      0.52      0.17        58
   POSITIVE       0.52      0.63      0.57       249

avg / total       0.79      0.61      0.67      1199

---------------
---------------
{"estimator": {}, "vectorizer_func": "tfidf_vectorized", "data": {"normalized": true}, "vectorizer": {"ngram_range": [1, 1]}}
