In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [1]:
x_train= pd.read_csv('../input/radix-challenge/train.csv')
x_test=pd.read_csv('../input/radix-challenge/test.csv')

In [1]:
x_train

In [1]:
len(pd.unique(x_train.genres))

## DATA PREPARATION 

In [1]:
# function for text cleaning 
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

x_train['clean_plot'] = x_train['synopsis'].apply(lambda x: clean_text(x))
x_test['clean_plot'] = x_test['synopsis'].apply(lambda x: clean_text(x))


In [1]:
#function to remove all the stopwords that may affects the prestation of the model
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

x_train['clean_plot'] = x_train['clean_plot'].apply(lambda x: remove_stopwords(x))

In [1]:
#estrapolate the genres vector from the train dataset


#multilabel
gen = [x.split(' ') for x in list (x_train['genres'])]    
    #multiclass
#gen1 = [[x] for x in list(x_train['genres'])]#

In [1]:
gen

In [1]:
set(x for l in gen for x in l)
#there are in total 19 different categories

In [1]:
#apply the onehot transformation for the genres vector

from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
y=multilabel_binarizer.fit_transform(gen)


In [1]:
y.shape
#this is perfect, cause it takes number of rows x number of different categories

In [1]:
#using the 10k most frequent words in the synopsis through the Tf-idf features 

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [1]:
# applying TF-IDF features to the synopsis
xtrain_tfidf = tfidf_vectorizer.fit_transform(x_train['clean_plot'])

In [1]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

## THE MODEL

In [1]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [1]:
# fit model on train data
clf.fit(xtrain_tfidf,y)

In [1]:
y_pred = clf.predict(tfidf_vectorizer.transform(x_test['clean_plot']))

In [1]:
y_pred.shape

In [1]:
print(len(multilabel_binarizer.inverse_transform(y_pred)))
pred_gen = multilabel_binarizer.inverse_transform(y_pred)

In [1]:
submission = pd.DataFrame(data= {'movie_id':x_test.movie_id,'predicted_genres':pred_gen})

In [1]:
for i in range(0,len(submission.predicted_genres)):
   submission.predicted_genres[i] =(','.join((submission.predicted_genres[i])))

In [1]:
for i in range(0,len(submission.predicted_genres)):
    submission.predicted_genres[i] = submission.predicted_genres[i].replace(","," ")

In [1]:
submission.to_csv('submission.csv',index=False)

the model does not predict the genres for all the movies, maybe should be a good idea implement a multiclass model instead of multilabel, another option could be change the model, using something else instead of Onevstherest classifier.