In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
import matplotlib.pyplot as plt # a collection of commands for making changes to plots
import csv, json, nltk, re # file handling module and natural language toolkit for text manipulation
from nltk.corpus import stopwords # we need to use list of words ltr  
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test sets from the notebook's input directory.
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")
# Preview the first ten training rows.
train_data.head(10)

In [None]:
# Drop movies that have no genres. pandas reads an empty CSV field as NaN
# rather than as '', and NaN.str.len() is NaN (which never equals 0), so a
# plain "length == 0" test would silently keep rows with missing genres.
# Filter out both missing values and empty strings explicitly.
has_genres = train_data['genres'].notna() & (train_data['genres'].str.len() > 0)
train_data = train_data[has_genres]
train_data.shape

In [None]:
# Count how often each distinct value of the `genres` column occurs in the
# training data, to get a feel for how the labels are distributed.
allGenres = nltk.FreqDist(train_data['genres'])

In [None]:
# Display the genre frequency distribution.
allGenres

In [None]:
# Tabulate the counts as a two-column data frame: [Genre, Frequency].
genres_df = pd.DataFrame(list(allGenres.items()), columns=['Genre', 'Frequency'])
genres_df.head(6)

In [None]:
# Keep only the 40 most frequent genres for plotting (raise n to show more).
genres_df40 = genres_df.nlargest(40, 'Frequency')

In [None]:
# Horizontal bar chart of the most frequent genres.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=genres_df40, x='Frequency', y='Genre', ax=ax)
ax.set(xlabel='Frequency', ylabel='Genres')
plt.show()

In [None]:
# Pull out the synopsis column so the raw text can be inspected for noise
# that would not help the model learn.
synopsisData = train_data['synopsis']

In [None]:
# split the text for later prediction 
def splitString(textlist):
    """Split a string on single space characters and return the pieces as a list.

    The separator is exactly one ' ' (not arbitrary whitespace), so consecutive
    spaces produce empty-string entries and an empty input yields [''].
    """
    return textlist.split(' ')

In [None]:
# Normalise a synopsis before vectorisation: strip apostrophes, replace every
# other non-letter character with a space, collapse whitespace and lowercase
# the result (stop-word matching later on is done against lowercase words).

# Written as a function so it can be reused on any piece of text.
def clean_synopsis(synopsis_text):
    """Return `synopsis_text` as lowercase ASCII words separated by single spaces.

    Apostrophes are deleted outright (so contractions stay one word), every
    remaining character that is not an ASCII letter is treated as a word
    separator, and runs of whitespace are collapsed.

    input: a string
    output: the cleaned string (may be empty)
    """
    # "\'" is simply an apostrophe: drop it so "don't" becomes "dont", not "don t".
    without_apostrophes = re.sub("\'", "", synopsis_text)
    # Digits, punctuation and non-ASCII characters all become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    # split() with no argument discards leading, trailing and repeated whitespace.
    return " ".join(letters_only.split()).lower()

In [None]:
# Stop words carry little signal for the classifier, so strip them using
# NLTK's English list. The list is stored in a set for fast membership tests.
stop_words = set(stopwords.words("english"))
def stopword_removal(synopsis_text):
    """Return `synopsis_text` without the tokens that appear in `stop_words`.

    The text is split on whitespace, tokens found in the module-level
    `stop_words` set are dropped (exact, case-sensitive match), and the
    remaining tokens are joined back together with single spaces.
    """
    kept_tokens = (token for token in synopsis_text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [None]:
def synopsis_analysis(movie_data):
    """Return a copy of `movie_data` with an added `synopsis_clean` column.

    Each synopsis is first normalised with `clean_synopsis` (apostrophes and
    non-letters removed, whitespace collapsed, lowercased) and then passed
    through `stopword_removal`.

    The caller's frame is not modified. The new column is attached with
    `DataFrame.assign`, which returns a new frame. This avoids pandas'
    SettingWithCopyWarning when `movie_data` is itself a filtered slice of
    another frame, and keeps the calling cell safe to re-run.

    input: a data frame with a `synopsis` column
    output: a new data frame with the same columns plus `synopsis_clean`
    """
    synopsis_clean = (
        movie_data['synopsis']
        .apply(clean_synopsis)
        .apply(stopword_removal)
    )
    return movie_data.assign(synopsis_clean=synopsis_clean)


In [None]:
# Build the training frame that carries the cleaned-synopsis column.
train_data_new = synopsis_analysis(train_data)

Now we can construct our Pipeline to build a model from the training data and labels.

In [None]:

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
# NOTE(review): the pipeline is fitted on the raw `synopsis` column, not on
# `synopsis_clean` - confirm that this is intended.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(train_data_new['synopsis'], train_data_new['genres'])

Let's use the movie synopses in the test set to see what our trained model predicts.

In [None]:
# Predict a genre label for every test synopsis with the fitted Naive Bayes pipeline.
prediction = nb.predict(test_data['synopsis'])

Save our prediction to the output directory

In [None]:
# Write the Naive Bayes predictions, keyed by movie id, to a CSV file
# (without the data frame index).
filename = 'Movie Genre Predictions NB 1.csv'
submission = pd.DataFrame({
    'movie_id': test_data['movie_id'],
    'predicted_genres': prediction,
})
submission.to_csv(filename, index=False)

In [None]:
# Preview the first rows of the submission. `head` has to be called: a bare
# `submission.head` only shows the bound-method repr instead of the preview.
submission.head()

**Support Vector Machine**

Let's try to build a model using a support vector machine. Perhaps we will get better predictions.

In [None]:
# Linear classifier with hinge loss and an L2 penalty, trained by stochastic
# gradient descent on count -> TF-IDF features.
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,  # fixed seed so the fit is reproducible
    max_iter=5,
    tol=None,         # no early stopping: always run all max_iter epochs
)
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])
sgd.fit(train_data_new['synopsis'], train_data_new['genres'])

In [None]:
# Predict a genre label for every test synopsis with the fitted SGD pipeline.
predictionsgd = sgd.predict(test_data['synopsis'])

In [None]:
# Write the SGD/SVM predictions, keyed by movie id, to their own CSV file
# (without the data frame index).
filename = 'Movie Genre Predictions SVM 1.csv'
submission = pd.DataFrame({
    'movie_id': test_data['movie_id'],
    'predicted_genres': predictionsgd,
})
submission.to_csv(filename, index=False)

Let's check the first 5 rows of our submission.

In [None]:
# Show the first five rows of the most recent submission frame.
submission.head()

Here we binarize our target, the movie genres, so that it can be used with logistic regression.

In [None]:
# Data preparation is almost done. What remains is to encode the target (the
# movie genres) as a binary indicator matrix with MultiLabelBinarizer, which
# is already imported at the top of the notebook.

# MultiLabelBinarizer needs one list of genres per movie. Build that
# `new_genre` column here if it is missing, by splitting the `genres` string
# with splitString.
if 'new_genre' not in train_data_new.columns:
    train_data_new = train_data_new.assign(
        new_genre=train_data_new['genres'].apply(splitString)
    )

multilabel_binarizer = MultiLabelBinarizer()
# fit_transform learns the set of genre labels and encodes the target in one step.
y = multilabel_binarizer.fit_transform(train_data_new['new_genre'])

# TF-IDF features of the training synopses. The fitted vectorizer is kept so
# that the same vocabulary can be applied to unseen text later.
tfidf_vectorizer = TfidfVectorizer()
trainDataVectorized = tfidf_vectorizer.fit_transform(train_data_new['synopsis'])

We need term frequency-inverse document frequency (TF-IDF) vectorization. The code below prepares the data for the logistic regression algorithm.

In [None]:
# Create the TF-IDF feature matrix from the training synopses.
# The text comes from the `synopsis` column of train_data_new, and the
# vectorizer is instantiated here so that this cell does not depend on an
# undefined name.
tfidf_vectorizer = TfidfVectorizer()
trainDataVectorized = tfidf_vectorizer.fit_transform(train_data_new['synopsis'])

In [None]:
# Now we can build our model
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [None]:
# Binary relevance: wrap a logistic regression in a one-vs-rest classifier so
# that a separate binary model is trained for each genre label.
logregressor = LogisticRegression()
clf = OneVsRestClassifier(estimator=logregressor)

In [None]:
# Fit the one-vs-rest model on the TF-IDF features. The target is `y`, the
# binarized genre matrix produced by the MultiLabelBinarizer.
clf.fit(trainDataVectorized, y)

In [None]:
# Predict genre indicators for held-out data and show the first prediction row.
# NOTE(review): `valDataVectorized` is not assigned anywhere in the visible
# notebook and no train/validation split is made. Before this cell can run,
# it has to be built by transforming the held-out synopses with the fitted
# TF-IDF vectorizer - confirm which data set is intended.
yPredictor = clf.predict(valDataVectorized)
yPredictor[0]