In [3]:
import numpy as np
import pandas as pd

import matplotlib.animation as animation
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, one_hot

from nltk.stem import PorterStemmer

from sklearn.base import BaseEstimator
from sklearn.metrics import precision_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.utils import shuffle, resample


In [None]:
class Data():
    """Loads the labelled page-tag CSV and prepares it for model training.

    ``process_data`` reads the ``pageTags`` column as the feature and the
    ``isPaidContent`` column as the label, so the CSV must provide both.
    """

    def __init__(self, file_name):
        """Remember the CSV path; nothing is read until ``get_data`` is called."""
        self.file_name = file_name

    def get_data(self, dtype={'isPaidContent': np.int8, 'pageTags': str}):
        """Read the CSV into a DataFrame, cache it on ``self.data`` and return it.

        Args:
            dtype: column -> dtype mapping forwarded to ``pd.read_csv``.

        Returns:
            The loaded ``pd.DataFrame``.
        """
        # The builtin ``str`` replaces the ``np.str`` alias, which NumPy
        # deprecated in 1.20 and removed in 1.24 (the old default made the
        # whole class definition fail with AttributeError).
        # The default dict is only read here, never mutated.
        self.data = pd.read_csv(self.file_name, dtype=dtype)
        return self.data

    def process_data(self, data=None):
        """Shuffle the data and split it into train and test parts (15% test).

        Args:
            data: DataFrame to split; defaults to the frame cached by ``get_data``.

        Returns:
            ``(X_train, X_test, Y_train, Y_test)`` where the X parts are the
            ``pageTags`` Series and the Y parts are NumPy arrays of labels.
        """
        if data is None:
            data = self.data
        # Shuffle the rows before splitting
        data = shuffle(data)

        # Feature text and binary label columns
        X = data.pageTags
        Y = data.isPaidContent

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.15)

        print('X.shape ', X_train.shape, ' Y.shape ', Y_train.shape,
              ' x.shape ', X_test.shape, ' y.shape ', Y_test.shape)
        return X_train, X_test, np.array(Y_train), np.array(Y_test)

    @staticmethod
    def resample_data(X, Y, type=None):
        """Balance the two classes of an already-encoded data set.

        Args:
            X: feature array, indexed by the positions returned from ``np.where``.
            Y: label array with values 0 / 1.
            type: ``'SMOTE'`` for synthetic over-sampling, ``'RESAMPLE'`` to
                redraw the negative class to the size of the positive class;
                anything else returns the inputs untouched.

        Returns:
            ``(X_resampled, Y_resampled)``.
        """
        if type == 'SMOTE':
            print('Resampling data : SMOTE...')

            # ``ratio`` / ``fit_sample`` were renamed to ``sampling_strategy`` /
            # ``fit_resample`` in imbalanced-learn 0.4 and later removed;
            # prefer the new names and fall back for old installations.
            try:
                smote = SMOTE(sampling_strategy=1.0)
            except TypeError:
                smote = SMOTE(ratio=1.0)
            fit = getattr(smote, 'fit_resample', None) or smote.fit_sample
            X_smote, Y_smote = fit(X, Y)

            print('Data', X.shape, ' resampled to ', X_smote.shape)
            return X_smote, Y_smote

        elif type == 'RESAMPLE':
            print('Resampling data : Up Sampling...')
            # Positive data
            pos = np.where(Y == 1)
            pos_Y = Y[pos]
            pos_X = X[pos]

            # Negative data
            neg = np.where(Y == 0)
            neg_Y = Y[neg]
            neg_X = X[neg]

            # Draw as many negative rows as there are positive rows
            up_neg_X, up_neg_Y = resample(neg_X, neg_Y, n_samples=len(pos_Y))

            print('Negative class data ', neg_X.shape, ' is UP Sampled to : ', up_neg_X.shape,
                  ' as same positive class data ', pos_X.shape)

            return np.r_[pos_X, up_neg_X], np.r_[pos_Y, up_neg_Y]
        else:
            print('Data is not Resampled, as type = SMOTE/RESAMPLE is not given ')
            return X, Y


class DataTokenizer():
    """Thin wrapper around a Keras ``Tokenizer`` that splits texts on commas.

    The tokenizer is fitted once, at construction time, on ``data``; the
    encoding methods can then be applied to any collection of texts.
    """

    def __init__(self, data, max_words=None, max_len=1000):
        """Fit the tokenizer on ``data``.

        Args:
            data: texts to build the vocabulary from.
            max_words: forwarded to ``Tokenizer`` as ``num_words``.
            max_len: length every sequence is padded/truncated to in ``tokenize``.
        """
        self.data = data
        self.input_len = max_len

        # oov_token=PorterStemmer
        fitted = Tokenizer(num_words=max_words, split=',')
        fitted.fit_on_texts(data)
        self.tokenizer = fitted

        # Vocabulary size = number of distinct tokens seen while fitting.
        self.input_dim = len(fitted.word_index)

    def tokenize(self, data):
        """Return ``data`` as a matrix of token indices padded to ``input_len``."""
        return sequence.pad_sequences(
            self.tokenizer.texts_to_sequences(data), maxlen=self.input_len)

    def oneHotEncoding(self, data):
        """Return the tokenizer's default text matrix without its first column."""
        # Column 0 is dropped - presumably because Keras never assigns index 0
        # to a word; confirm against the Tokenizer documentation.
        return self.tokenizer.texts_to_matrix(data)[:, 1:]

    def tfidfEncoding(self, data):
        """Return the tokenizer's ``'tfidf'`` text matrix without its first column."""
        return self.tokenizer.texts_to_matrix(data, mode='tfidf')[:, 1:]

    def getVocabDim(self):
        """Return ``(vocabulary size, padded sequence length)``."""
        return self.input_dim, self.input_len

In [2]:
def data_discrepancy(data, sortByKey=['pageTags'], groupByKey=['pageArticleID', 'pageTags'],
                     indexLevel=['pageTags', 'pageArticleID', 'isPaidContent']):
    """Print and display a report on duplicated / conflicting articles.

    Args:
        data: DataFrame with ``pageArticleID``, ``pageTags`` and
            ``isPaidContent`` columns.
        sortByKey: columns the frame is sorted by before grouping/indexing.
        groupByKey: columns that identify one article/tag combination.
        indexLevel: columns used to build the MultiIndex for the duplicate counts.

    Returns:
        None. Everything is written with ``print`` and ``display`` (the latter
        is the name IPython provides inside a notebook).

    The default lists are only read, never mutated.
    """
    rule = '-----------------------------------------------------------------------------------------------------------\n'

    # Number of distinct labels per article/tag combination; more than one
    # means the same combination was labelled both paid and not paid.
    d = data.sort_values(by=sortByKey).groupby(groupByKey, sort=False, as_index=False) \
        .agg({'isPaidContent': pd.Series.nunique})
    conflicting = d[d.isPaidContent > 1]

    print(rule)
    print('-------------- Duplicate Article ID and Page tag combination but different isPaidContent -----------------')
    print(conflicting.count().pageArticleID)
    print(rule)
    display(data[data.pageArticleID.isin(conflicting.pageArticleID)].sort_values(by=['pageArticleID']).head(2))
    print(rule)
    print('-------------- Same Article ID is present more than 2 times -----------------')
    article_counts = data.pageArticleID.value_counts()
    print(article_counts.head(5))
    print('------------------------------------------------------\n')
    print('-------------- Max repeated article id -----------------\n')
    # value_counts() is sorted by descending frequency, so its first label is
    # the most repeated article.  (Previously the first *group* in pageTags
    # order was shown, which is unrelated to repetition and failed on an
    # empty frame.)
    most_repeated = article_counts.index[0] if len(article_counts) else None
    display(data[data.pageArticleID == most_repeated])

    dp = data.sort_values(by=sortByKey).set_index(indexLevel)
    print('\n-------------- Duplicate pagetags/Article ID -----------------\n')
    display(dp.head(2))
    print(rule)
    print('-------------------------- Multindex value count --------------------------------------------------------')
    print(dp.index.value_counts().head(1))
    print('-------------------------- Total count of Article ID --------------------------------------------------------')
    total_count = data.count().pageArticleID
    print(total_count)
    print('-------------- Count of Unique Article ID --------------------------------')
    unique_articles = dp.index.get_level_values(level='pageArticleID').nunique()
    print(unique_articles)
    print('-------------- Total Duplicate Article ID -----------------')
    print(total_count - unique_articles)

In [42]:
from sklearn.metrics import mean_squared_error

def print_words(weight):
    """Print every word with a positive weight, largest weight first.

    ``weight[i]`` is matched to the word stored under index ``i + 1`` in
    ``tok.tokenizer.index_word``; ``tok`` is a notebook-level global that
    must exist before this function is called.
    """
    print('top keywords for separation')
    print('-----------------------------')

    # Only strictly positive weights are kept.
    positive = {
        tok.tokenizer.index_word[pos + 1]: weight[pos]
        for pos in range(weight.shape[0])
        if weight[pos] > 0
    }

    ranked = sorted(positive.items(), key=lambda pair: pair[1], reverse=True)
    for word, value in ranked:
        print(word, float(value))


def rms(y, y_hat):
    """Print and return the root-mean-square error between ``y`` and ``y_hat``.

    Args:
        y: true target values.
        y_hat: predicted values.

    Returns:
        The RMSE (square root of ``mean_squared_error(y, y_hat)``).  It used
        to be printed only, which made it impossible to store or compare.
    """
    rmse = np.sqrt(mean_squared_error(y, y_hat))
    print('Root Mean squre error is : ', rmse)
    return rmse
    
def metric_score(y, y_hat, type='Test'):
    """Print binary-classification metrics and return them as a DataFrame.

    Args:
        y: true labels (0 / 1).
        y_hat: predicted labels (0 / 1); may be a column vector such as the
            ``(n, 1)`` array a Keras model returns.
        type: label used in the printed header and as the column name of the
            returned frame.

    Returns:
        A one-column ``pd.DataFrame`` (column ``type``) indexed by metric name.
        Precision / recall are ``nan`` when their denominator is zero.
    """
    # Flatten both inputs: comparing an (n,) vector with an (n, 1) column
    # would broadcast to an (n, n) matrix and inflate every count.
    y = np.asarray(y).ravel()
    y_hat = np.asarray(y_hat).ravel()

    tp = np.sum(np.logical_and(y == 1, y_hat == 1))
    tn = np.sum(np.logical_and(y == 0, y_hat == 0))
    fp = np.sum(np.logical_and(y == 0, y_hat == 1))
    fn = np.sum(np.logical_and(y == 1, y_hat == 0))

    accuracy = np.divide(tp + tn, tp + tn + fp + fn)
    precision = np.divide(tp, tp + fp)
    recall = np.divide(tp, tp + fn)
    f1 = f1_score(y, y_hat)

    print('\n-----------', type, 'metric score------------------\n')
    print('True positives : ', tp)
    print('True negatives : ', tn)
    print('False positives : ', fp)
    print('False negatives : ', fn)
    print('precision : ', precision)
    print('recall : ', recall)
    print('Accuracy : ', accuracy)
    print('f1 Score : ', f1)
    print('\n------------------------------------------\n')

    # Keys are kept exactly as they were (including 'true-postive') because
    # other code selects rows by these labels.
    df = pd.DataFrame({type: {'true-postive': tp,
                              'true-negative': tn,
                              'false-positive': fp,
                              'false-negative': fn,
                              'accuracy': accuracy,
                              'precision': precision,
                              'recall': recall,
                              'f1 Score': f1}})
    return df

In [None]:
import time
import threading
import functools

def provide_progress_bar(function, estimated_time, tstep=0.2, tqdm_kwargs=None, args=None, kwargs=None):
    """Tqdm wrapper for a long-running function

    The function runs in a worker thread while the calling thread advances a
    time-based progress bar every ``tstep`` seconds.

    args:
        function - function to run
        estimated_time - how long you expect the function to take
        tstep - time delta (seconds) for progress bar updates
        tqdm_kwargs - kwargs to construct the progress bar (default: none)
        args - args to pass to the function (default: none)
        kwargs - keyword args to pass to the function (default: none)
    ret:
        function(*args, **kwargs)
    raises:
        whatever exception ``function`` raised; it is re-raised in the calling
        thread instead of being lost in the worker thread.
    """
    # None sentinels instead of shared mutable default objects.
    tqdm_kwargs = {} if tqdm_kwargs is None else tqdm_kwargs
    args = () if args is None else tuple(args)
    kwargs = {} if kwargs is None else kwargs

    outcome = {}  # Mutable holder so the worker can hand back its result or error

    def myrunner():
        try:
            outcome['value'] = function(*args, **kwargs)
        except Exception as exc:
            outcome['error'] = exc

    thread = threading.Thread(target=myrunner)
    pbar = tqdm(total=estimated_time, **tqdm_kwargs)

    try:
        thread.start()
        while thread.is_alive():
            thread.join(timeout=tstep)
            pbar.update(tstep)
    finally:
        # Close the bar even if the wait loop is interrupted.
        pbar.close()

    if 'error' in outcome:
        raise outcome['error']
    return outcome.get('value')

In [None]:
# Output directory (relative path) that the save_* / plot_* helpers in this
# notebook write their CSV, HTML, PNG and GIF files into.  None of those
# helpers creates it, so it has to exist before they are called.
result_folder = 'Results'


def save_results_csv(filename, data, mode=None):
    """Write ``data`` to ``<result_folder>/<filename>.csv``.

    Args:
        filename: file name without directory or extension.
        data: object exposing ``to_csv`` (e.g. a DataFrame).
        mode: optional file mode forwarded to ``to_csv``; when ``None`` the
            argument is not passed at all.
    """
    target = '{}/{}.csv'.format(result_folder, filename)
    extra = {} if mode is None else {'mode': mode}
    data.to_csv(target, **extra)

def save_results_html(filename, data):
    """Save ``data`` as ``<result_folder>/<filename>.html`` plus its metric plot.

    ``plot_metric`` is called first with the same path; the table itself is
    then written with ``to_html``.
    """
    html_path = '{}/{}.html'.format(result_folder, filename)

    # Plot first, then the HTML table
    plot_metric(data, html_path)
    data.to_html(html_path, col_space=15)
    
def plot_metric(data, filename):
    """Draw a training-vs-test bar chart of the headline metrics and save it as PNG.

    Args:
        data: object where ``data['Score']['Training']`` and
            ``data['Score']['Test']`` are Series indexed by metric name, and
            ``data['Model']['best_params']`` is a Series with a ``'model'`` entry.
        filename: path of the companion HTML report; the chart is saved next
            to it with the extension replaced by ``.png``.
    """
    import os  # local import: only needed to swap the file extension

    shown_metrics = ['precision', 'recall', 'accuracy', 'f1 Score']

    df_plot = pd.concat([data['Score']['Training'], data['Score']['Test']], axis=1)
    df_plot['metric'] = df_plot.index
    df_plot = df_plot[df_plot['metric'].isin(shown_metrics)]

    ax = df_plot.plot(kind='bar', x='metric', y=['Training', 'Test'], color=['blue', 'red'], figsize=(20, 7.5))

    # Write each bar's value just above it; bars sit at x = 0, 1, 2, ...
    for x_pos, (_, r) in enumerate(df_plot.iterrows()):
        ax.text(x=x_pos - 0.20, y=r['Training'] + 0.01, s="{0:.3f}".format(r['Training']), color='blue', fontweight='bold')
        ax.text(x=x_pos + 0.1, y=r['Test'] + 0.01, s="{0:.3f}".format(r['Test']), color='red', fontweight='bold')

    df_model = data['Model']['best_params']
    title = df_model[df_model.index == 'model']

    ax.set_title('Best Model : {}'.format(title.values))

    # Swap only the extension.  str.replace('html', 'png') also rewrote any
    # 'html' inside the directory or file name.
    png_path = os.path.splitext(filename)[0] + '.png'
    fig = ax.get_figure()
    fig.savefig(png_path)
    

In [2]:
from tqdm import tqdm

def plot_boundary(X, Y, model, filename=None, h=0.1):
    """Plot a model's predictions over a 2-D grid together with the data points.

    Args:
        X: array whose first two columns are the plotted features.
        Y: values used to colour the scatter points.
        model: object with ``predict``; called once per grid column.
        filename: when given, the figure is also saved as
            ``<result_folder>/<filename>-boundary.png``.
        h: grid step size.
    """
    # Grid bounds: the data range padded by one unit on each side.
    lo_1, hi_1 = X[:, 0].min() - 1, X[:, 0].max() + 1
    lo_2, hi_2 = X[:, 1].min() - 1, X[:, 1].max() + 1
    x1, x2 = np.meshgrid(np.arange(lo_1, hi_1, h), np.arange(lo_2, hi_2, h))

    Z = np.zeros(x1.shape)
    print(Z.shape)

    n_rows = x1.shape[0]
    # Predict one grid column at a time instead of the whole mesh at once.
    for col in tqdm(range(x1.shape[1])):
        column_points = np.c_[x1[:, col], x2[:, col]]
        predicted = model.predict(column_points)

        # Some models return an (n, 1) column; flatten it to fit Z's column.
        if predicted.ndim == 2:
            predicted = predicted.reshape(n_rows)

        Z[:, col] = predicted

    # Contour of the predictions
    fig = plt.figure(figsize=(20, 5))
    plt.ion()
    plt.contour(x1, x2, Z, cmap=plt.cm.coolwarm, alpha=1)

    # Data points on top, limited to the extent of the grid
    plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.coolwarm)
    plt.xlim(x1.min(), x1.max())
    plt.ylim(x2.min(), x2.max())

    if filename is not None:
        fig.savefig('{}/{}-boundary.png'.format(result_folder, filename))

    plt.draw()

# pause = False

# def onClick(event):
#     global pause
#     pause ^= True
    
# Animated class to draw score.
class AnimateScores():
    """Animated line plot of training and validation scores per iteration.

    Two empty lines (blue training, red validation) are created on a fresh
    figure; the ``plotScores*`` methods attach a ``FuncAnimation`` that fills
    them in, and ``save`` writes the animation out as a GIF.
    """

    def __init__(self, iterations):
        """Create the figure, axes and the two empty score lines.

        Args:
            iterations: number of iterations; sets the x-axis range, the
                frame count of ``plotScores`` and the pause length of
                ``plotScoresRealtime``.
        """
        self.itr = iterations

        self.fig = plt.figure(figsize=(20, 5))
        self.plt = plt

        self.ax = plt.axes(xlim=(0, iterations+1))
        self.ax.set_ylabel('Score')
        self.ax.set_xlabel('Iterations')

        # plot() returns a one-element list; unpack the Line2D handles.
        self.tl, = self.plt.plot([], [], color='blue', marker='o', label='Training Score')
        self.vl, = self.plt.plot([], [], color='red', marker='+', label='Validation Score')

        self.ax.legend(shadow=True, fontsize='x-large')

        # Values accumulated by update_line_realtime
        self.training_scores = []
        self.validation_scores = []
        self.counts = []

    def plotScores(self, training_scores, validation_scores):
        """Animate pre-computed scores, one more point per frame.

        Each argument is sliced as ``scores[:, :num]`` and handed to
        ``set_data`` as a single array (see ``update_line``).
        """
        self.anim = animation.FuncAnimation(
            self.fig, self.update_line,
            frames=range(self.itr),
            fargs=(training_scores, validation_scores),
            interval=self.itr**2, blit=True, init_func=self.init, repeat=True)

    def plotScoresRealtime(self, frames):
        """Animate scores as they are produced.

        Args:
            frames: passed to ``FuncAnimation``; each frame is indexed as
                ``(training score, validation score, iteration count)``.
        """
        self.anim = animation.FuncAnimation(
            self.fig, self.update_line_realtime,
            frames=frames,
            blit=True, init_func=self.init, repeat=True)

        self.plt.pause(self.itr)

    def init(self):
        """Reset both lines to empty; used as the animation's ``init_func``."""
        for line in (self.tl, self.vl):
            line.set_data([], [])
        return self.tl, self.vl

    def update_line(self, num, tScore, vScore):
        """Show the first ``num`` columns of each score array."""
        self.tl.set_data(tScore[:, :num])
        self.vl.set_data(vScore[:, :num])
        return self.tl, self.vl

    def update_line_realtime(self, frame):
        """Append one ``(training, validation, count)`` frame and redraw both lines."""
        train_score, valid_score, count = frame[0], frame[1], frame[2]
        self.training_scores.append(train_score)
        self.validation_scores.append(valid_score)
        self.counts.append(count)

        self.tl.set_data(self.counts, self.training_scores)
        self.vl.set_data(self.counts, self.validation_scores)
        return self.tl, self.vl

    def save(self, filename=None):
        """Save the animation as ``<result_folder>/<filename>.gif``; no-op without a name."""
        if filename is None:
            return
        f = '{}/{}.gif'.format(result_folder, filename)
        print(".... Saving aimation {}".format(f))
        self.anim.save(f, writer = "pillow", fps=5)
    
#Static graph to plot scores
def plot_scores(iterations, training_scores, validation_scores, file = None):
    """Static line plot of training and validation scores per iteration.

    Args:
        iterations: number of iterations; scores are plotted at x = 1..iterations.
        training_scores: one training score per iteration.
        validation_scores: one validation score per iteration.
        file: when given, the figure is also saved as
            ``<result_folder>/<file>.png`` before being shown.
    """
    fig = plt.figure(figsize=(20, 5))
    ax = plt.axes(xlim=(0, iterations + 1))

    ax.set_ylabel('Score')
    ax.set_xlabel('Iterations')

    steps = range(1, iterations+1)
    plt.plot(steps, training_scores, color='blue', marker='o', label='Training Score')
    plt.plot(steps, validation_scores, color='red', marker='+', label='Validation Score')

    ax.legend(shadow=True, fontsize='x-large')

    if file is not None:
        fig.savefig('{}/{}.png'.format(result_folder, file))

    plt.show()