In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# borrowed from http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.decomposition import TruncatedSVD

  from numpy.core.umath_tests import inner1d


In [2]:
sm = pd.read_json("../data_fetching/json2/SequelMemes.json")
pm = pd.read_json("../data_fetching/json2/PrequelMemes.json")

In [3]:
sm_titles = sm[["title"]]
pm_titles = pm[["title"]]

sm_titles["is_sequel_meme"] = 1
pm_titles["is_sequel_meme"] = 0

meme_titles = pd.concat([pm_titles,sm_titles])
print(meme_titles.head())
print(meme_titles.tail())

                                title  is_sequel_meme
0                      Drunk Politics               0
1                 When the Fun Begins               0
2                      Just one Windu               0
3  dlmoisttlotjidnftdsaydihbpjfastmne               0
4                     Drunk Democracy               0
                                                   title  is_sequel_meme
14595                        His swoleness got him #6!!!               1
14596  Looks like someone at my local brewery is a Se...               1
14597  MAGA.... Nah! Time to make the Republic great ...               1
14598           Take On Me except it's Leia slapping Poe               1
14599           Take On Me except it's Leia slapping Poe               1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [4]:
corpus = [title[0] for title in meme_titles[["title"]].values]

In [5]:
# Borrowed from http://scikit-learn.org/stable/modules/feature_extraction.html
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [None]:
vectorizers_start = {
#     "cvec" : CountVectorizer,
    "tfidf" : TfidfVectorizer,
}

vectorizers = {}

for key,vec in vectorizers_start.items():
    vectorizers[key] = vec(stop_words='english', tokenizer=LemmaTokenizer())
    for max_df in (0.25, 0.5, 0.75):
        vectorizers[f"{key}__max_df_of_{max_df}"] = vec(stop_words='english', max_df=max_df)
        for n_gram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
            vectorizers[f"{key}__n_gram_range_of_{n_gram_range}"] = vec(stop_words='english', ngram_range=n_gram_range)
            vectorizers[f"{key}__n_gram_range_of_{n_gram_range}_max_df_of{max_df}"] = vec(stop_words='english', ngram_range=n_gram_range, max_df=max_df)

classifiers = {
    "bnb" : BernoulliNB(), #=> Fast and good enough score
#     "mnb" : MultinomialNB(), #=> Fast and good enough score
#     "logr": LogisticRegression(), #=> Best score, but super slow
#     "knn" : KNeighborsClassifier(), #=> Super overfit, not great scores
#     "tree" : DecisionTreeClassifier(),
#     "rfc" : RandomForestClassifier(), #=> Very overfit, not much better than Naive Bayes
#     "gbc" : GradientBoostingClassifier(), #=> Not great, worse than Naive Bayes
#     "ada" : AdaBoostClassifier(), #=> Not great, worse than Naive Bayes
#     "svm" : SVC(), #=> painstakingly slow, couldn't run on my computer :/
}

# Borrowed from https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
# and https://optunity.readthedocs.io/en/stable/notebooks/notebooks/sklearn-automated-classification.html
hyper_parameters = {
    "bnb" : {
        "alpha": np.linspace(0.0,1.0,10)
    },
    "mnb" : {
        "alpha": np.linspace(0.0,1.0,10)
    },
    "logr": {
        "penalty" : ['l1', 'l2'],
        "C": np.logspace(0, 10, 20)
    },
    "knn" : {
        'n_neighbors':[1,2,3,4,5],
        'weights':['uniform', 'distance'],
    },
    "svm" : {
        'kernel': ['linear', 'poly', 'rbf'],
        'C': [1, 2, 10, 50],
        'gamma': [0, 1],
        'degree': [2, 5],
        'coef0': [0, 1]
    },
    "tree": {
        
    },
    "rfc":{
        
    },
    "gbc":{
        "n_estimators" : [50,100,150]
    },
    "ada":{
        "base_estimator": [BernoulliNB(), MultinomialNB(), LogisticRegression()],
        "n_estimators" : [50,100,150]
    }
}

for key,val in vectorizers.items():
    print(f"Fitting {key}")
    val.fit(corpus)
    # Transform the corpus
    new_corpus  = val.transform(corpus)
    sparse_df = pd.SparseDataFrame(new_corpus, columns=val.get_feature_names())
    sparse_df.fillna(0, inplace=True)
    SVD = TruncatedSVD(n_components=1000)
    svd_matrix = SVD.fit_transform(sparse_df)
    component_names = ["component_"+str(i+1) for i in range(1000)]
    svd_df = pd.DataFrame(svd_matrix,
                      columns=component_names)
    
    y = meme_titles[["is_sequel_meme"]]
    X_train, X_test, y_train, y_test = train_test_split(svd_df, y)
    for class_key, classifier in classifiers.items():
        print(f"Scores for {key} using {class_key} Classifier")
        gs = GridSearchCV(classifier, param_grid=hyper_parameters[class_key], n_jobs=-1);
        gs.fit(X_train,y_train);
        print(f"Train data: {gs.score(X_train, y_train)}")
        print(f"{gs.best_estimator_}")
        print(f"{gs.best_params_}")
        print(f"Test data: {gs.score(X_test, y_test)}")
        print(f"{gs.best_estimator_}")
        print(f"{gs.best_params_}")
        [print() for i in range(0,3)]

Fitting tfidf
Scores for tfidf using bnb Classifier


  y = column_or_1d(y, warn=True)
  'setting alpha = %.1e' % _ALPHA_MIN)


Train data: 0.6624694786230669
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}
Test data: 0.6313370673560247
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}



Fitting tfidf__max_df_of_0.25
Scores for tfidf__max_df_of_0.25 using bnb Classifier


  y = column_or_1d(y, warn=True)
  'setting alpha = %.1e' % _ALPHA_MIN)


Train data: 0.6540431847560684
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}
Test data: 0.6218583943702427
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}



Fitting tfidf__n_gram_range_of_(1, 1)
Scores for tfidf__n_gram_range_of_(1, 1) using bnb Classifier


  y = column_or_1d(y, warn=True)
  'setting alpha = %.1e' % _ALPHA_MIN)


Train data: 0.6555273615167329
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}
Test data: 0.6293264397529801
BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
{'alpha': 0.0}



Fitting tfidf__n_gram_range_of_(1, 1)_max_df_of0.25
Scores for tfidf__n_gram_range_of_(1, 1)_max_df_of0.25 using bnb Classifier


  y = column_or_1d(y, warn=True)


Train data: 0.6496385311437737
BernoulliNB(alpha=0.4444444444444444, binarize=0.0, class_prior=None,
      fit_prior=True)
{'alpha': 0.4444444444444444}
Test data: 0.634496625017952
BernoulliNB(alpha=0.4444444444444444, binarize=0.0, class_prior=None,
      fit_prior=True)
{'alpha': 0.4444444444444444}



Fitting tfidf__n_gram_range_of_(1, 2)
