In [None]:
import pandas as pd
import numpy as np
import gensim.utils as utils

###### Create dataframe 

In [None]:
# Column layout of the corpus table. The frame starts empty; the loading
# cells pass it to add_row(), which writes one row per call.
columns = ['index','text', 'tag']
data = pd.DataFrame( columns=columns)
# Bare expression so the notebook displays the (still empty) frame.
data


###### Read data

In [None]:
def add_row(text, classes, df):
    """Append one document to ``df`` in place.

    The new row is stored under the label ``len(df)`` and holds three values,
    in order: that same position number, ``text`` after a round trip through
    ``utils.to_utf8(..., errors='replace')`` and ``.decode("utf8")``, and
    ``classes`` unchanged.

    Nothing is returned; ``df`` itself is modified.  The row has exactly three
    values, so ``df`` is assumed to have three columns -- verify against callers.
    """
    position = len(df)
    decoded = utils.to_utf8(text, errors='replace').decode("utf8")
    df.loc[position] = [position, decoded, classes]

In [None]:
import textract
from os import listdir
from os.path import isfile, join

def load_directory(dirpath, tag, df):
    """Add every regular file found directly in ``dirpath`` to ``df``.

    Each file is passed to ``textract.process`` and the result is appended to
    ``df`` via ``add_row`` with the label ``tag``.  Sub-directories are skipped.

    Parameters
    ----------
    dirpath : str
        Directory to scan (not recursive).
    tag : str
        Label stored in the ``tag`` column for every file of this directory.
    df : pandas.DataFrame
        Frame that receives the rows; modified in place.

    Returns
    -------
    list of str
        Names of the files that were loaded, in ``listdir`` order.
    """
    names = [f for f in listdir(dirpath) if isfile(join(dirpath, f))]
    for name in names:
        add_row(textract.process(join(dirpath, name)), tag, df)
    return names


# One call per training directory.  Java and Python sources share the
# 'technical' label.  Every directory keeps its own result name (the lcd and
# hippa loads previously overwrote `mobilememorycard`).
shortstory = load_directory('data/train/shortstory/', 'shortstory', data)
java = load_directory('data/train/java/', 'technical', data)
python = load_directory('data/train/python/', 'technical', data)
medicine = load_directory('data/train/medicine/', 'medicine', data)
mobilescreen = load_directory('data/train/mobilescreen/', 'mobilescreen', data)
mobilememorycard = load_directory('data/train/mobilememorycard/', 'mobilememorycard', data)
lcd = load_directory('data/train/lcd/', 'lcdscreen', data)
hippa = load_directory('data/train/hippa/', 'hippa', data)



###### Convert each tag to a single-element list

In [None]:
# Store each tag wrapped in a one-element list in a new 'tags1' column.
data['tags1'] = [[label] for label in data['tag']]

In [None]:
# Inspection only: shows the type of the column object itself (not of the
# lists stored inside it).
type(data.tags1)

In [None]:
import os
import time
import string
import pickle
import string

from operator import itemgetter

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from stemming.porter2 import stem

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split as tts

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from stemming.porter2 import stem
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


lemma = WordNetLemmatizer()
cachedStopWords = stopwords.words("english")
# Hashed copy of the stop-word list: the membership test below runs once per
# token, and scanning the list each time is needlessly slow on long documents.
_stopword_lookup = frozenset(cachedStopWords)


def review_to_wordlist(text, remove_stopwords=True):
    """Turn a piece of text into a list of normalised word stems.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped when ``remove_stopwords`` is true;
      4. each remaining token is lemmatised with ``lemma`` and then stemmed
         with ``stem``.

    Parameters
    ----------
    text : str
        Text to tokenise.
    remove_stopwords : bool, default True
        Whether to filter out the words in ``cachedStopWords``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in _stopword_lookup]
    return [stem(lemma.lemmatize(token)) for token in words]


In [None]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
import string

# Characters that survive the clean-up step in review_to_sentences().
printable = set(string.printable)


def review_to_sentences(text, remove_stopwords=True):
    """Tokenise ``text`` sentence by sentence into one flat list of word stems.

    The text is stripped, reduced to the characters in ``string.printable``,
    split into sentences with the loaded punkt ``tokenizer``, and every
    non-empty sentence is passed to ``review_to_wordlist``.  Despite the name,
    the per-sentence results are concatenated, so the return value is a flat
    list of tokens rather than a list of sentences.

    Parameters
    ----------
    text : str
        Document to tokenise.
    remove_stopwords : bool, default True
        Forwarded to ``review_to_wordlist``.

    Returns
    -------
    list of str
        Tokens of all sentences, in document order.
    """
    # Build a real string: on Python 3 filter() yields a lazy iterator, which
    # the sentence tokenizer cannot process.
    text = ''.join(ch for ch in text.strip() if ch in printable)
    words = []
    for raw_sentence in tokenizer.tokenize(text):
        if raw_sentence:
            words.extend(review_to_wordlist(raw_sentence, remove_stopwords))
    return words

In [None]:
def timeit(func):
    """Decorator that measures the wall-clock duration of each call.

    The decorated function returns a 2-tuple ``(result, seconds)`` instead of
    the bare result, where ``seconds`` is the elapsed ``time.time()`` delta.
    Exceptions raised by ``func`` propagate unchanged.

    Parameters
    ----------
    func : callable
        Function to wrap.

    Returns
    -------
    callable
        Wrapper with the same name and docstring as ``func``.
    """
    # Local import keeps this cell self-contained.
    from functools import wraps

    @wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        delta = time.time() - start
        return result, delta
    return wrapper


def identity(arg):
    """Return ``arg`` untouched.

    A no-op passthrough: the very same object that was passed in is handed
    back to the caller.
    """
    return arg


In [None]:
@timeit
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """Train a TF-IDF + classifier pipeline, report hold-out metrics, refit on all data.

    The labels are encoded with a ``MultiLabelBinarizer``.  A model is first
    fitted on 80% of the data and evaluated on the remaining 20% (fixed split,
    ``random_state=42``); a second model is then fitted on the full data set
    and optionally pickled.

    Parameters
    ----------
    X : sequence of str
        Raw documents; tokenised by ``review_to_sentences``.
    y : iterable of iterables
        One collection of labels per document (e.g. a one-element list).
        A bare string is itself an iterable of characters, so plain string
        labels would be binarized character by character.
    classifier : estimator class or instance, default SGDClassifier
        A class is instantiated and wrapped in ``OneVsRestClassifier``
        (``SGDClassifier`` gets the tuned hyper-parameters below, any other
        class is built with its defaults).  An instance is used as given.
    outpath : str or None
        When set, the final model is pickled to this path.
    verbose : bool
        Print progress messages.  The classification report and the
        prediction dumps are printed regardless.

    Returns
    -------
    The fitted pipeline, with the label binarizer attached as ``labels_``.
    Because of the ``@timeit`` decorator, callers actually receive
    ``(model, seconds)``.
    """

    @timeit
    def build(classifier, X, y=None):
        """Assemble the pipeline around ``classifier`` and fit it on ``X``/``y``."""
        stop_words = stopwords.words("english")
        if isinstance(classifier, type):
            if classifier is SGDClassifier:
                base = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.9,
                                     eta0=0.2, fit_intercept=True, l1_ratio=0,
                                     learning_rate='optimal', loss='modified_huber', n_iter=10, n_jobs=1,
                                     penalty='elasticnet', power_t=0.6, random_state=None, shuffle=True,
                                     verbose=0, warm_start=False)
            else:
                base = classifier()
            # y is a label-indicator matrix, so fit one binary model per label.
            classifier = OneVsRestClassifier(base)

        model = Pipeline([
            ('preprocessor', None),
            ('vectorizer', TfidfVectorizer(tokenizer=review_to_sentences, stop_words=stop_words, lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        return model

    labels = MultiLabelBinarizer()
    y = labels.fit_transform(y)

    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
    # build() is timed as well, hence the (model, seconds) unpacking.
    model, secs = build(classifier, X_train, y_train)

    if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose: print("Classification Report:\n")

    y_pred = model.predict(X_test)

    print(clsr(y_test, y_pred, target_names=labels.classes_))
    print('y_test: ', y_test)
    print('value :' , labels.inverse_transform(y_test))
    print('y_pred: ', y_pred)
    print('value :' , labels.inverse_transform(y_pred))
    if verbose: print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    # Keep the binarizer with the model so predictions can be decoded later.
    model.labels_ = labels

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model

In [None]:
def show_most_informative_features(model, text=None, n=20):
    """Render a two-column table of the highest- and lowest-weighted features.

    Parameters
    ----------
    model :
        Object exposing ``named_steps`` with a ``'vectorizer'`` and a
        ``'classifier'`` entry.
    text : str or None
        When given, the weights come from ``model.transform([text])`` and the
        quoted text plus ``model.predict([text])`` are placed before the table.
        Otherwise the classifier's ``coef_`` is used.
    n : int
        Number of rows in the table.

    Returns
    -------
    str
        Newline-joined lines.  Only the first row of the weight matrix is used.

    Raises
    ------
    TypeError
        If the classifier has no ``coef_`` attribute.
    """
    steps = model.named_steps
    vectorizer, classifier = steps['vectorizer'], steps['classifier']

    # Without coefficients there is nothing to rank.
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    # Weight source: the transformed text if one was supplied, else coef_.
    weights = classifier.coef_ if text is None else model.transform([text]).toarray()

    # Pair every weight with its feature name, largest weight first.
    ranked = sorted(
        zip(weights[0], vectorizer.get_feature_names()),
        key=lambda pair: pair[0], reverse=True
    )

    lines = []

    # Header describing the classified text, when there is one.
    if text is not None:
        lines.extend([
            "\"{}\"".format(text),
            "Classified as: {}".format(model.predict([text])),
            "",
        ])

    # Left column walks down from the top weight, right column walks up from
    # the bottom weight.
    for (hi_w, hi_name), (lo_w, lo_name) in zip(ranked[:n], ranked[:-(n+1):-1]):
        lines.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(hi_w, hi_name, lo_w, lo_name)
        )

    return "\n".join(lines)

In [None]:
columns = ['index','text', 'tag']
test = pd.DataFrame(columns=columns)

# (directory, tag) pairs of the held-out documents, loaded in this order.
test_sources = [
    ('data/test/medicine/', 'medicine'),
    ('data/test/mobilescreen/', 'mobilescreen'),
    ('data/test/mobilememorycard/', 'mobilememorycard'),
    ('data/test/lcd/', 'lcdscreen'),
    ('data/test/java/', 'technical'),
    ('data/test/hippa/', 'hippa'),
]

# A plain loop instead of side-effect list comprehensions: nothing useful is
# collected, and the cell no longer displays a list of None values.
for dirpath, tag in test_sources:
    for f in listdir(dirpath):
        # Skip sub-directories; only regular files are documents.
        if isfile(join(dirpath, f)):
            add_row(textract.process(join(dirpath, f)), tag, test)


In [None]:
# Inspection only: (rows, columns) of the held-out frame.
test.shape

In [None]:
if __name__ == "__main__":
    # Location of the cached model: train when it is missing, otherwise load
    # it and score the held-out documents.
    PATH = "model.pickle"

    if not os.path.exists(PATH):
        # Time to build the model
        X = data.text
        # Use the list-wrapped tags: build_and_evaluate encodes labels with a
        # MultiLabelBinarizer, which iterates over each entry and would split
        # a bare tag string into single characters.
        y = data.tags1

        # build_and_evaluate is wrapped by @timeit and returns (model, seconds).
        model, secs = build_and_evaluate(X, y, outpath=PATH)

    else:
        # NOTE: unpickling executes arbitrary code -- only load a model file
        # that this notebook wrote itself.
        with open(PATH, 'rb') as f:
            model = pickle.load(f)

        vectorizer = model.named_steps['vectorizer']
        classifier = model.named_steps['classifier']
        Xte = vectorizer.transform(test.text)
        y_pred = classifier.predict(Xte)
        print(y_pred)
        # The binarizer was attached to the model when it was built.
        labels = model.labels_
        print(labels.inverse_transform(y_pred))
        print(test.tag)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a standalone TF-IDF vectorizer on the training documents.
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(tokenizer=review_to_sentences, stop_words=stop_words, lowercase=False, use_idf=True)
# Feed the text exactly as the held-out documents are fed to transform()
# later (no upper-casing), so training and inference share one preprocessing path.
X = vectorizer.fit_transform(data.text)
print(X.shape)

In [None]:
# Inspection only: type of the first stored tag value.
type(data.tag[0])

In [None]:
# Encode the plain string tags for the standalone classifier below.
labels = LabelBinarizer()  # fitted on data.tag; reused later to decode predictions
y = labels.fit_transform(data.tag)
print(y.shape)

In [None]:
# Hyper-parameters of the SGD base estimator, kept together for readability.
sgd_settings = dict(
    alpha=0.0001, average=False, class_weight=None, epsilon=0.9,
    eta0=0.2, fit_intercept=True, l1_ratio=0,
    learning_rate='optimal', loss='modified_huber', n_iter=10, n_jobs=1,
    penalty='elasticnet', power_t=0.6, random_state=None, shuffle=True,
    verbose=0, warm_start=False,
)
# Wrap the SGD model in a one-vs-rest scheme and fit it on the TF-IDF matrix.
classifier = OneVsRestClassifier(SGDClassifier(**sgd_settings))
classifier.fit(X, y)

In [None]:
# Vectorize the held-out documents with the already fitted vectorizer
# (transform only, no refit) and score them.
Xte = vectorizer.transform(test.text)
y_pred = classifier.predict(Xte)
y_pred_prob = classifier.predict_proba(Xte)
# Raw encoded predictions and their probabilities ...
print(y_pred)
print(y_pred_prob)
# ... then the decoded tag names next to the true tags for comparison.
print(labels.inverse_transform(y_pred))
print(test.tag.tolist())


In [None]:
# Side-by-side table: 'yte' holds the true tags, 'ypred' the decoded predictions.
ans = pd.DataFrame({'yte': test.tag.tolist(), 'ypred' : labels.inverse_transform(y_pred)})
# Bare expression so the notebook renders the frame.
ans