In [1]:
import nltk, re
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition, ensemble
from nltk.corpus import stopwords  
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the movie table from a CSV file in the working directory; later cells
# read its "Genre" and "Description" columns.
movie_data = pd.read_csv("movies.csv")

def split_genres(genres):
    """Split a comma-separated genre string into a list of genre names.

    Every space character is dropped first, so "Action, Sci-Fi" yields
    ["Action", "Sci-Fi"] and a name such as "Film Noir" becomes "FilmNoir".
    """
    without_spaces = "".join(genres.split(" "))
    return without_spaces.split(",")
# Replace each comma-separated genre string with a list of genre names.
movie_data["Genre"] = movie_data["Genre"].apply(split_genres)
# Display the description stored under key 0 as a quick look at the text data.
movie_data["Description"][0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer

# Targets: learn the genre vocabulary and encode every movie's genre list
# in a single step.
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(movie_data["Genre"])

# Features: raw term counts of the descriptions, then TF-IDF reweighting of
# those counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_counts = count_vect.fit_transform(movie_data["Description"])
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [4]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample so that the label combinations are balanced.
# NOTE(review): this runs BEFORE the train/test split further down, so copies
# of the same sample can end up in both the training and the test partition,
# which inflates the reported scores. Consider splitting first and
# oversampling the training portion only.
ros = RandomOverSampler(random_state=9000)
# fit_sample() was deprecated in imbalanced-learn 0.4 and removed in later
# releases; fit_resample() is its direct replacement.
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_resample(X_tfidf, Y)




In [5]:
# Hold out 20% of the TF-IDF features and labels for testing (fixed seed for repeatability).
# NOTE(review): the inputs are the already-oversampled arrays, so duplicated
# rows may be shared between the train and test partitions — confirm this is intended.
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=9000)


In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857

    For every sample the score is |true AND pred| / |true OR pred| over the
    sets of active (non-zero) labels; a sample with no active label on either
    side scores 1. The result is the mean over all samples.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Label indicator matrices; any non-zero entry counts as "label set".
        Plain nested lists are accepted. 1-D input is treated as one label
        per sample.
    normalize, sample_weight :
        Accepted for signature compatibility only; neither is used.

    Returns
    -------
    float
        Mean per-sample score in [0, 1], or NaN when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or are not 1-D/2-D.
    '''
    true_mask = np.asarray(y_true) != 0
    pred_mask = np.asarray(y_pred) != 0
    if true_mask.shape != pred_mask.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                true_mask.shape, pred_mask.shape))
    if true_mask.ndim == 1:
        # one label per sample: view as a single-column indicator matrix
        true_mask = true_mask.reshape(-1, 1)
        pred_mask = pred_mask.reshape(-1, 1)
    if true_mask.ndim != 2:
        raise ValueError("expected 1-D or 2-D label indicators, got shape {}".format(true_mask.shape))
    if true_mask.shape[0] == 0:
        # the mean of zero samples is undefined
        return float("nan")

    shared = np.logical_and(true_mask, pred_mask).sum(axis=1)
    either = np.logical_or(true_mask, pred_mask).sum(axis=1)
    # Clamp the denominator so rows with no active label do not divide by
    # zero; those rows are then given the conventional score of 1.
    per_sample = np.where(either == 0, 1.0, shared / np.maximum(either, 1))
    return per_sample.mean()

def print_score(y_pred, clf):
    """Print the Hamming loss and Hamming score of `y_pred` against the global `y_test_tfidf`.

    Both metrics take (y_true, y_pred) in that order. The previous code passed
    them reversed, which only gave the right numbers because both metrics
    are symmetric in their two arguments.

    Parameters
    ----------
    y_pred : predicted label indicator matrix for the TF-IDF test split.
    clf : unused; kept so existing calls keep working.
    """
    print("Hamming loss: {}".format(hamming_loss(y_test_tfidf, y_pred)))
    print("Hamming score: {}".format(hamming_score(y_test_tfidf, y_pred)))
    print("---")

In [7]:
# Base estimators to compare on the TF-IDF features.
nb_clf = MultinomialNB()
lr = LogisticRegression(solver="lbfgs")
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=100, tol=None)

# Wrap each estimator one-vs-rest, train it on the training split, and report
# its scores on the held-out split.
for classifier in (nb_clf, sgd, lr):
    print("model  : ", type(classifier).__name__)
    clf = OneVsRestClassifier(classifier).fit(x_train_tfidf, y_train_tfidf)
    y_pred = clf.predict(x_test_tfidf)
    print_score(y_pred, classifier)

model  :  MultinomialNB
Hamming loss: 0.01848807506512557
Hamming score: 0.6587456496730885
---
model  :  SGDClassifier
Hamming loss: 0.027952148511596147
Hamming score: 0.4691975012013455
---
model  :  LogisticRegression
Hamming loss: 0.01486633450517211
Hamming score: 0.7232580490148967
---


In [8]:
# Show the genre names the binarizer learned; their order is the column order of Y.
multilabel_binarizer.classes_

array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Drama', 'Family', 'Fantasy', 'FilmNoir', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport',
       'Thriller', 'War', 'Western'], dtype=object)

In [9]:
# Work on just the two columns the embedding experiment needs.
new_data = movie_data.loc[:, ["Description", "Genre"]]

# Fresh binarizer: learn the genre vocabulary and encode the targets in one call.
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(new_data["Genre"])

# Raw description text, vectorised in a later cell.
x = new_data["Description"]


In [10]:
import pickle
# Load the pre-built word-vector lookup from a local pickle file.
# Presumably it maps words to 200-dimensional GloVe vectors, judging by the
# file name — confirm against how the pickle was produced.
# SECURITY: pickle.load can execute arbitrary code; only load this file if it
# comes from a trusted source.
with open("glove.6B.200d.pkl", 'rb') as f:
    word_vector = pickle.load(f)


In [None]:
from scipy.spatial import distance
import scipy.stats as ss
    
# Represent every description as the mean of the word vectors of its terms,
# each vector scaled by the term's TF-IDF weight in that description.
#
# min_df=1 keeps every term exactly as the former min_df=0 did (a vocabulary
# term always occurs in at least one document), but is also accepted by recent
# scikit-learn releases, which reject an integer min_df of 0.
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words='english', sublinear_tf=True)
tfidf_matrix = tf.fit_transform(x)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

x_v = []
docs_without_vector = []  # positions of descriptions with no term found in word_vector
for docId in range(tfidf_matrix.shape[0]):
    # The COO view of one row exposes its stored column ids and weights
    # directly, avoiding one sparse element lookup per term.
    doc_row = tfidf_matrix[docId, :].tocoo()
    docWector = [
        word_vector[feature_names[col]] * score
        for col, score in zip(doc_row.col, doc_row.data)
        if feature_names[col] in word_vector
    ]
    if docWector:
        x_v.append(sum(docWector) / len(docWector))
    else:
        # Previously this case divided by len([]) == 0 and raised
        # ZeroDivisionError; fill it in below once a vector shape is known.
        docs_without_vector.append(docId)
        x_v.append(None)

if docs_without_vector:
    reference = next((vec for vec in x_v if vec is not None), None)
    if reference is None:
        raise ValueError("No description contains a term present in word_vector")
    for docId in docs_without_vector:
        # A zero vector keeps x_v aligned row-for-row with Y.
        # Assumes the word vectors are NumPy arrays — TODO confirm.
        x_v[docId] = np.zeros_like(reference)

from imblearn.over_sampling import RandomOverSampler
# NOTE: despite its name, `smote_tomek` is a plain RandomOverSampler; the name
# is kept only so that any other cell referring to it keeps working.
smote_tomek = RandomOverSampler(random_state=9000)
# fit_sample() was deprecated in imbalanced-learn 0.4 and removed in later
# releases; fit_resample() is its direct replacement.
X_resampled, Y_resampled = smote_tomek.fit_resample(x_v, Y)

# Keep a 20% subsample of the oversampled data, then split that subsample
# 80/20 into the train and test sets used by the k-NN experiment.
# NOTE(review): oversampling happens before either split, so exact copies of
# a sample can land in both X_train and X_test; a nearest-neighbour model will
# then score optimistically. Consider splitting first and oversampling the
# training portion only.
_, X_new, _, y_new = train_test_split(X_resampled, Y_resampled, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.20, random_state=42)

    
def mlknn(train_data, train_label, test_data, k):
    """Predict multi-label targets with a simple k-nearest-neighbour vote.

    For each test vector, the k training vectors with the smallest cosine
    distance are found (ties broken by training order), their label rows are
    summed, and every label that reaches the highest vote count is predicted.
    If none of the k neighbours carries any label, no label is predicted.

    Parameters
    ----------
    train_data : array-like of shape (n_train, n_features)
    train_label : array-like of shape (n_train, n_labels), 0/1 indicators
    test_data : iterable of feature vectors of length n_features
    k : int, number of neighbours; must satisfy 1 <= k <= n_train

    Returns
    -------
    list of lists of int (0/1), one inner list per test vector.

    Raises
    ------
    ValueError
        If k is outside 1..n_train.
    """
    train_matrix = np.asarray(train_data, dtype=float)
    label_matrix = np.asarray(train_label)
    n_train = train_matrix.shape[0]
    if k < 1 or k > n_train:
        raise ValueError("k must be between 1 and {} (got {})".format(n_train, k))

    test_labels = []
    for test in test_data:
        query = np.asarray(test, dtype=float).reshape(1, -1)
        # one vectorised call instead of a Python loop over training rows
        distances = distance.cdist(query, train_matrix, metric='cosine')[0]
        # A stable sort reproduces ordinal ranking (ties keep training order)
        # and pushes NaN distances to the end instead of breaking the lookup.
        nearest = np.argsort(distances, kind='stable')[:k]
        votes = label_matrix[nearest].sum(axis=0)
        if votes.sum() == 0:
            # no neighbour has any label: predict nothing
            test_labels.append([0] * len(votes))
            continue
        top_vote = votes.max()
        test_labels.append([1 if vote == top_vote else 0 for vote in votes])
    return test_labels
        


In [None]:
# Score the nearest-neighbour vote for k = 1..5 on the held-out vectors.
# hamming_score takes (y_true, y_pred); the arguments were previously passed
# in the opposite order.
for i in range(1,6):
    test_return = mlknn(X_train, y_train, X_test,i)
    print("Hamming score for k ={} : {}".format(i,hamming_score(y_test, np.array(test_return))))

Hamming score for k =1 : 0.8226814031715521
Hamming score for k =2 : 0.7864007688611244
Hamming score for k =3 : 0.7728656094826206


In [30]:
# Display the dimensions of the training feature matrix as a sanity check.
X_train.shape

(8324, 200)