In [17]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

df = pd.read_csv('data.csv')
df = df[pd.notnull(df['text'])]
df = df[pd.notnull(df['tag'])]
df = df.reset_index()


In [23]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text)
    # delete stopwors from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text


df['text'] = df['text'].apply(clean_text)


FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [24]:
from itertools import islice
from gensim.models import Word2Vec
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
wv.init_sims(replace=True)



FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin.gz'

In [4]:
s = set(wv.index_to_key)
def word_averaging(wv, words):
    all_words, mean = set(), []
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in s:
            mean.append(wv.vectors[wv.key_to_index[word]])
            all_words.add(wv.key_to_index[word])

    if not mean:
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(
        np.array(mean).mean(axis=0)).astype(np.float32)
    return mean


def word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, text) for text in text_list])


In [5]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(df, test_size=0.3, random_state = 42)
test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

In [6]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000000)
logreg = logreg.fit(X_train_word_average, train['tag'])
y_pred = logreg.predict(X_test_word_average)
score = accuracy_score(y_pred, test['tag'])



In [7]:
def getTop3(text):
    processedInput = word_averaging(wv, w2v_tokenize_text(text))
    prob = logreg.predict_proba([processedInput])
    prob = prob[0]
    labels = logreg.classes_
    res = {}
    for i, p in enumerate(prob):
        res[p] = labels[i]
    prob.sort()
    ans = []
    keys = prob[-3:]
    for i in keys:
        ans.append([res[i], i])
    return ans

In [11]:
getTop3("laptop")

[['General & Admin Expenses', 0.05165550116567915],
 ['Revenue', 0.21393953507864527],
 ['Machinery & Equipment', 0.6769222315467206]]

In [8]:
print(getTop3("Computer"))

[['Cash', 0.00030132179472606416], ['Revenue', 0.01020466634778689], ['Machinery & Equipment', 0.9890809778850956]]


In [11]:
import joblib
joblib_file = "ML.joblib"
joblib.dump(getTop3, joblib_file)

['ML.joblib']