In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from ast import literal_eval
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

## Read Data

In [None]:
# Load the three tab-separated splits in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train_data, validation_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('data/train.tsv', 'data/validation.tsv', 'data/test.tsv')
)

In [None]:
# Show the (rows, columns) shape of each split, one per line.
print(train_data.shape, validation_data.shape, test_data.shape, sep='\n')

In [None]:
# Preview the first rows of the training frame as loaded from disk.
train_data.head()

## Convert tags into list

In [None]:
def _parse_tags(raw_tags):
    """Parse a tags cell with literal_eval if it is still a string.

    Values that are not strings (for example, cells already parsed by a
    previous run of this cell) are returned unchanged, so re-running the
    cell does not raise.
    """
    return literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags


# The tags column is read from the TSV as text; parse each cell into a
# Python object (per the section heading, a list of tags).
train_data['tags'] = train_data['tags'].apply(_parse_tags)
validation_data['tags'] = validation_data['tags'].apply(_parse_tags)

In [None]:
# Preview the training frame again, after the tags column was run through literal_eval.
train_data.head()

In [None]:
# Titles are the model inputs and tags are the targets.
# Only the title column is taken from the test split.
X_train = train_data['title'].values
y_train = train_data['tags'].values
X_val = validation_data['title'].values
y_val = validation_data['tags'].values
X_test = test_data['title'].values

## Tokenize a sentence into list of words

In [None]:
def tokenize(text):
    """Return the tokens produced by NLTK's ``word_tokenize`` for ``text``."""
    return word_tokenize(text)

In [None]:
# Quick check of tokenize() on a sample question title.
s = "How to draw a stacked dotplot in R?"
tokenize(s)

## Convert text to lowercase

In [None]:
def toLowerCase(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

In [None]:
# Quick check of toLowerCase() on the same sample title.
s = "How to draw a stacked dotplot in R?"
toLowerCase(s)

## Build Vocabulary for X

In [None]:
def buildVocabularyX(list_of_sentences):
    """Count how often each token occurs across ``list_of_sentences``.

    Each sentence is lowercased with ``toLowerCase`` and split with
    ``tokenize`` before counting.

    Returns
    -------
    dict
        Maps each token to its number of occurrences, with keys in
        first-seen order.
    """
    token_counts = {}
    for raw_sentence in list_of_sentences:
        for token in tokenize(toLowerCase(raw_sentence)):
            token_counts[token] = token_counts.get(token, 0) + 1
    return token_counts

In [None]:
# Small two-sentence example to sanity-check the word counts.
data = ["How to draw a stacked dotplot in R?", "Hi man, how have you been?"]
buildVocabularyX(data)

## Build vocabulary for X_train

In [None]:
# Token -> frequency mapping computed over the training titles only.
XVocabulary = buildVocabularyX(X_train)

In [None]:
# Dump the token counts to disk as two-column (token, count) CSV rows.
with open('XVocabulary.csv', 'w', encoding="utf-8", newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(XVocabulary.items())

## Build Vocabulary for y

In [None]:
def buildVocabularyY(data):
    """Count how often each tag occurs across ``data``.

    ``data`` is an iterable of tag collections. Every tag is lowercased
    with ``toLowerCase`` before it is counted.

    Returns
    -------
    dict
        Maps each lowercased tag to its number of occurrences, with keys
        in first-seen order.
    """
    tag_counts = {}
    for tag_group in data:
        for raw_tag in tag_group:
            normalized = toLowerCase(raw_tag)
            tag_counts[normalized] = tag_counts.get(normalized, 0) + 1
    return tag_counts

In [None]:
# Small example: three tag lists, with 'php' appearing twice.
data = [ ['php', 'mysql'], ['javascript', 'jquery'], ['php']]
buildVocabularyY(data)

## Build Vocabulary for y_train

In [None]:
# Tag -> frequency mapping computed over the training labels only.
YVocabulary = buildVocabularyY(y_train)

In [None]:
# Dump the tag counts to disk as two-column (tag, count) CSV rows.
with open('YVocabulary.csv', 'w', encoding="utf-8", newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(YVocabulary.items())

## Convert text to Bag of words representation

In [None]:
def toBagOfWords(listOfData):
    """Fit a fresh ``CountVectorizer`` on ``listOfData`` and encode it.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the result of ``fit_transform`` on
        ``listOfData`` and the fitted vectorizer, which can be reused to
        transform other data with the same vocabulary.
    """
    count_vectorizer = CountVectorizer()
    encoded = count_vectorizer.fit_transform(listOfData)
    return encoded, count_vectorizer

In [None]:
# Two-document toy corpus to illustrate the bag-of-words encoding.
data_corpus = ["John likes to watch movies. Mary likes movies too.",
               "John also likes to watch football games."]

bagOfWords_data_corpus, vectorizer = toBagOfWords(data_corpus)
print(bagOfWords_data_corpus.shape)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain list of strings as before.
print(vectorizer.get_feature_names_out().tolist())
print(bagOfWords_data_corpus.toarray())


In [None]:
# Learn the vocabulary from the training titles only (via the helper above),
# then encode the validation and test titles with that same fitted vectorizer.
bagOfWords_X_train, vectorizer = toBagOfWords(X_train)
bagOfWords_X_val = vectorizer.transform(X_val)
bagOfWords_X_test = vectorizer.transform(X_test)

In [None]:
# Shape of the bag-of-words training matrix produced by fit_transform.
bagOfWords_X_train.shape

## TF-IDF Representation

In [None]:
# TF-IDF counterpart of the bag-of-words features, using default TfidfVectorizer settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training titles only; validation and test are encoded with the
# already-fitted vectorizer so all three matrices share one vocabulary.
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train) 
tfidf_X_val = tfidf_vectorizer.transform(X_val)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Shape of the TF-IDF training matrix produced by fit_transform.
tfidf_X_train.shape

## Transform label into classes

In [None]:
# Iterating a dict yields its keys, so this is the sorted list of all
# (lowercased) tags seen in the training labels.
tags_classes = sorted(YVocabulary)
tags_classes[:10]

In [None]:
# The class list is fixed up front, so output columns follow tags_classes order.
mlb = MultiLabelBinarizer(classes=tags_classes)
# NOTE: y_train and y_val are overwritten with their binarized form, so this
# cell should run exactly once after the cell that builds them.
y_train = mlb.fit_transform(y_train)
# Validation labels are only transformed: the binarizer is fitted on the
# training split and must not be refitted on held-out data.
y_val = mlb.transform(y_val)

In [None]:
# Binarized label row for the first training example (one column per entry in tags_classes).
y_train[0]

## Build Model

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [None]:
def trainClassifier(X_train, Y_train):
    """Fit a one-vs-rest logistic-regression classifier.

    Parameters
    ----------
    X_train
        Feature matrix passed to ``fit``.
    Y_train
        Label matrix passed to ``fit``.

    Returns
    -------
    OneVsRestClassifier
        The fitted classifier, wrapping ``LogisticRegression(C=1.0, penalty='l2')``.
    """
    logisticRegessionClassifier = LogisticRegression(C=1.0, penalty='l2')
    ovrclassifier = OneVsRestClassifier(logisticRegessionClassifier)
    # Fit on the labels passed in. The previous version read the notebook-level
    # `y_train` here and silently ignored the `Y_train` argument.
    ovrclassifier.fit(X_train, Y_train)
    return ovrclassifier
    

In [None]:
# Train one classifier per feature representation so the two can be compared
# against the same binarized training labels.
bagOfWords_classifier = trainClassifier(bagOfWords_X_train, y_train)
tfidf_classifier = trainClassifier(tfidf_X_train, y_train)

## Evaluation

In [None]:
# Validation-set outputs of the bag-of-words model: predicted labels from
# predict(), and the raw per-class scores from decision_function().
bagOfWords_y_predicted = bagOfWords_classifier.predict(bagOfWords_X_val)
bagOfWords_y_predicted_scores = bagOfWords_classifier.decision_function(bagOfWords_X_val)

In [None]:
# Inspect the predicted label matrix for the validation titles (bag-of-words model).
bagOfWords_y_predicted

In [None]:
# Inspect the decision_function scores for the validation titles (bag-of-words model).
bagOfWords_y_predicted_scores

In [None]:
# Validation-set outputs of the TF-IDF model: predicted labels from
# predict(), and the raw per-class scores from decision_function().
tfidf_y_predicted = tfidf_classifier.predict(tfidf_X_val)
tfidf_y_predicted_scores = tfidf_classifier.decision_function(tfidf_X_val)

In [None]:
# Inspect the predicted label matrix for the validation titles (TF-IDF model).
tfidf_y_predicted

In [None]:
# Inspect the decision_function scores for the validation titles (TF-IDF model).
tfidf_y_predicted_scores

### TFIDF Evaluation

In [None]:
# Map the binarized rows back to tag tuples so they can be read side by side.
y_val_inversed = mlb.inverse_transform(y_val)
y_val_pred_inversed = mlb.inverse_transform(tfidf_y_predicted)

# Print the validation examples at positions 2 and 3: title, true tags,
# and the tags predicted by the TF-IDF model.
for i in (2, 3):
    true_labels = ','.join(y_val_inversed[i])
    predicted_labels = ','.join(y_val_pred_inversed[i])
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_val[i], true_labels, predicted_labels))

## Save Model

In [None]:
import pickle
filename = 'bagOfWords_model.sav'
# The with-block guarantees the file is flushed and closed even if dump() raises;
# a bare open() inside the call left the handle open.
# NOTE: only the classifier is saved here. The fitted vectorizer and the label
# binarizer are not written, so they must be rebuilt to use the model elsewhere.
with open(filename, 'wb') as model_file:
    pickle.dump(bagOfWords_classifier, model_file)

## Load saved model

In [None]:
filename = 'bagOfWords_model.sav'
# SECURITY: pickle.load can execute arbitrary code while unpickling. Only load
# files written by this notebook or another trusted source.
# The with-block closes the file handle once loading is done.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Sanity check: the restored model predicts on the bag-of-words test features.
result = loaded_model.predict(bagOfWords_X_test)
result