In [None]:
from __future__ import print_function, division
%run ../basics.ipynb

In [None]:
# Load the learning-resource catalogue and preview the first rows.
df = pd.read_csv('../data/learning_resources.csv')
display(df.head(2))

# Load the (resource id, concept tag) pairs and preview the first rows.
tags = pd.read_csv('../data/tags.csv')
display(tags.head(2))

# Attach each resource's tags as one space-separated string.
# The tags are grouped once by resource id and then looked up per row,
# instead of re-filtering the whole `tags` frame for every resource.
# Resources without any tag get an empty string.
df['tags'] = df['id'].map(
    tags.groupby('id')['concept_tag'].agg(lambda values: ' '.join(values).strip())
).fillna('')

print(df.columns)
print(df.count())
df.head(2)

In [None]:
class Encoder(object):
    """Bag-of-words autoencoder that compresses free text into a 64-unit code.

    Text is turned into term counts with a ``CountVectorizer``, filtered by
    ``filter_word_rep``, L1-normalized per row and fed to a dense autoencoder
    (256 -> 128 -> 64 -> 128 -> 256 -> n_features). ``predict`` returns the
    activations of the 64-unit bottleneck layer.

    NOTE(review): the Keras keywords used here (``nb_epoch``, ``input=``,
    ``output=``) look like the Keras 1 spelling -- confirm the installed
    version before upgrading the environment.
    """

    def __init__(self, tfidf_max_df=None, tfidf_min_df=None, tag_vec_max_df=None, tag_vec_min_df=None,
                 ngram_range=None, **kwargs):
        """Configure the vectorizer; the networks are built lazily by ``fit``.

        Args:
            tfidf_max_df: ``max_df`` passed to the vectorizer (0.6 when None).
            tfidf_min_df: ``min_df`` passed to the vectorizer (5 when None).
            tag_vec_max_df: accepted for interface compatibility; not used here.
            tag_vec_min_df: accepted for interface compatibility; not used here.
            ngram_range: ``ngram_range`` for the vectorizer ((1, 1) when None).
            **kwargs: stored on ``self.kwargs``; not read anywhere in this class.
        """
        self.vectorizer = CountVectorizer(stop_words='english',
                                          max_df=0.6 if tfidf_max_df is None else tfidf_max_df,
                                          min_df=5 if tfidf_min_df is None else tfidf_min_df,
                                          ngram_range=(1, 1) if ngram_range is None else ngram_range)

        self.nn = None          # full autoencoder (input -> reconstruction)
        self.nn_encode = None   # encoder half (input -> 64-unit bottleneck)
        self.topic_dict = None
        self.kwargs = kwargs

    def create_autoencoder(self, X):
        """Build the autoencoder and the encoder-only model for inputs shaped like ``X``.

        Both models share the same layers, so training ``self.nn`` also trains
        the weights used by ``self.nn_encode``.

        Args:
            X: 2-D array-like; only ``X.shape[1]`` (the feature count) is read.
        """
        print('X shape:', X.shape)
        input_vec = Input(shape=(X.shape[1], ))
        encoded = Dense(256, activation='relu')(input_vec)
        encoded = Dense(128, activation='relu')(encoded)
        encoded = Dense(64, activation='relu')(encoded)

        # The decoder must start from the bottleneck. The original applied this
        # layer to `decoded` before it existed, which raised UnboundLocalError.
        decoded = Dense(128, activation='relu')(encoded)
        decoded = Dense(256, activation='relu')(decoded)
        decoded = Dense(X.shape[1], activation='relu')(decoded)

        nn = Model(input=input_vec, output=decoded)
        nn.compile(optimizer='adadelta', loss='mse')
        self.nn = nn
        # summary() prints the table itself; wrapping it in print() only adds "None".
        nn.summary()

        nn = Model(input=input_vec, output=encoded)
        nn.compile(optimizer='adam', loss='mse')
        self.nn_encode = nn

    def fit(self, X, viz=True, *args, **kwargs):
        """Fit the vectorizer on ``X`` and train the autoencoder to reconstruct it.

        Prints the MSE of a constant (global mean) predictor as a baseline,
        trains for 30 epochs with a 5% validation split, then plots the loss
        history.

        Args:
            X: DataFrame of text columns, or a Series of text (see ``prepare_X``).
            viz: when true, additionally attempts a ``tsne_plot`` of the
                vectorized data; a failure there is printed and training
                continues.
            *args, **kwargs: accepted but ignored.
        """
        orig_X = X.copy()
        X = self.prepare_X(X)
        X = self.vectorize_X(X, fit=True, viz=viz)
        if viz:
            # The plot is best-effort, but only ordinary errors are swallowed:
            # KeyboardInterrupt / SystemExit must still stop the cell.
            try:
                tsne_plot(X, orig_X[[orig_X.columns[0]]], fit=True)
            except Exception:
                print(traceback.format_exc())
        if self.nn is None:
            self.create_autoencoder(X)

        # Baseline: error of always predicting the global mean value.
        flat_X = np.array(X).flatten()
        baseline = np.full(flat_X.shape, flat_X.mean())
        print('baseline mean mse:', mean_squared_error(flat_X, baseline))

        start = datetime.datetime.now()
        hist = self.nn.fit(X, X,
                           nb_epoch=30,
                           batch_size=1,
                           shuffle=True,
                           validation_split=.05,
                           verbose=2)
        print('nn training:', datetime.datetime.now() - start)

        if 'val_loss' in hist.history:
            # Materialize the pairs: on Python 3 zip() is a one-shot iterator.
            data = list(zip(hist.history['loss'], hist.history['val_loss']))
            hist_df = pd.DataFrame(columns=['train', 'val'], data=data)
            hist_df.plot(y=['train', 'val'], secondary_y=['val'])
            plt.show()
        else:
            data = hist.history['loss']
            hist_df = pd.DataFrame(columns=['train'], data=data)
            hist_df.plot(y=['train'])
            plt.show()

    def predict(self, X):
        """Return the bottleneck encoding of ``X`` using the already-fitted vectorizer.

        Args:
            X: DataFrame of text columns, or a Series of text.

        Returns:
            Whatever ``self.nn_encode.predict`` returns for the vectorized input.
        """
        X = self.prepare_X(X)
        X = self.vectorize_X(X, fit=False)
        return self.nn_encode.predict(X)

    @staticmethod
    def prepare_X(X):
        """Collapse the input into a single Series of text, one entry per row.

        A DataFrame has every column passed through ``text_cleanup`` and the
        results concatenated left to right, each preceded by a space. A Series
        is returned unchanged.

        Raises:
            AssertionError: if ``X`` is neither a DataFrame nor a Series.
        """
        if isinstance(X, pd.DataFrame):
            concat_x = pd.Series(index=X.index, data='')
            for i in X.columns:
                concat_x += ' ' + X[i].apply(text_cleanup)
            X = concat_x
        else:
            assert isinstance(X, pd.Series)
        return X

    def vectorize_X(self, X, fit=False, viz=False):
        """Convert a Series of text into a dense, L1-normalized term-count matrix.

        Args:
            X: Series of text.
            fit: when true, (re)fit the vectorizer vocabulary on ``X`` first.
            viz: accepted but not used.

        Returns:
            The dense form (``.todense()``) of the normalized matrix.
        """
        if fit:
            tfidf = self.vectorizer.fit_transform(X.values)
        else:
            tfidf = self.vectorizer.transform(X.values)
        tfidf, words = filter_word_rep(tfidf, self.vectorizer.get_feature_names())
        print('tfidf shape', tfidf.shape)
        # L1 normalization turns each row of counts into term proportions.
        tfidf = normalize(tfidf, axis=1, norm='l1')
        return tfidf.todense()

In [None]:
# Build an encoder with the default vectorizer settings and train it on the
# four text columns; viz=False skips the t-SNE plot inside fit().
encoder = Encoder()
encoder.fit(df.loc[:, ['title', 'subtitle', 'description', 'syllabus']], viz=False)

In [None]:
# Encode every resource's text columns with the trained encoder.
encoded = encoder.predict(X=df.loc[:, ['title', 'subtitle', 'description', 'syllabus']])

In [None]:
# Plot the first two encoded dimensions against each other. The figure is
# titled and labelled so it can be read on its own when skimming the notebook.
fig, ax = plt.subplots()
ax.scatter(encoded[:, 0], encoded[:, 1])
ax.set(title='Encoded resources (first two dimensions)',
       xlabel='encoded dimension 0',
       ylabel='encoded dimension 1')
plt.show()