In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import math
import glob, re, os
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences

In [50]:
data_dir = 'opensubtitles-parser/inOneFolder/'

## Before starting

In [None]:
#To delete redundant part of xml file names
for filename in glob.glob(data_dir + '*.xml'):
    new_name = re.sub('[0-9]*_[0-9]*_[0-9]*_([^.]*).xml', r'\1.xml', filename)
    os.rename(filename, new_name)

In [2]:
# converter.py
import xml.etree.ElementTree as ET

def convert_to_text(in_file):
    """
    a function that gets subtitle xml file and extract it's text
    and save it to a file with same name but '.txt' extension
    """

    # read xml file
    tree = ET.parse(in_file)
    root = tree.getroot()

    file_name = in_file.split('.')[0] #extract file name without .xml

    text = ''
    for parag in root.iter('s'):  # read each paragraph
        for word in parag.iter('w'): # read each word
            text += word.text + ' '
        text = text.rstrip()
        text += '\n'
        
    try:
        f = open(file_name + '.txt', 'w')
        f.write(text.encode('utf8'))   # to avoid encoding error
    except Exception as e:
        print 'there is an error in file: ', in_file
        print e
    else:
        f.close()

In [13]:
def readSubtitles():
    for f in os.listdir(data_dir):
        if f.endswith('.xml'):
            convert_to_text(data_dir + f)

In [17]:
readSubtitles()

## Data Preparation

In [3]:
# File Names List
fileNames = []
for fname in os.listdir(data_dir):
    fileNames.append(fname.replace("_", " ").replace(".xml", ""))

In [4]:
def modifier(s):
    s = re.sub('([^\(]*)\(.*', r'\1', s.lower()).rstrip()
    # s = s[1:] if s.startswith("\"") else s
    s = "the " + s[:s.rfind(", the")] if ", the" in s else s
    s = "an " + s[:s.rfind(", an")] if ", an" in s else s
    s = "a " + s[:s.rfind(", a")] if ", a" in s else s
    return s

In [5]:
def findFileName(row):
    return row['modifiedtitle'].replace(' ', '_') + '.xml' if row['modifiedtitle'] in fileNames else np.nan

In [47]:
def findSubtitles(row):
    if (pd.isnull(row['filename']) == False) :
        f = open(data_dir + row['filename'].replace('.xml', '.txt'), 'r')
        return f.read()
    return np.nan

In [7]:
def selectGenre(row):
    return row['genres'].split('|')[0]

In [8]:
dfmovies = pd.read_csv("./ml-20m/movies.csv")
dfmovies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
len(dfmovies)

27278

In [11]:
dfratings = pd.read_csv("./ml-20m/ratings.csv")
dfratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [12]:
len(dfratings)

20000263

In [13]:
dfratedmovies = pd.merge(dfmovies, dfratings, on='movieId')

In [14]:
dfratedmovies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,944919407
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,858275452
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,833981871
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,943497887
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,1230858821


In [15]:
len(dfratedmovies)

20000263

In [44]:
dfratedmovies['filename'] = np.nan
dfratedmovies['modifiedtitle'] = np.nan

In [45]:
dfratedmovies.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp,filename,modifiedtitle
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,944919407,,
1,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,858275452,,
2,2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,833981871,,
3,3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,943497887,,
4,4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,1230858821,,


In [46]:
# print set(dfratedmovies['title'])

In [47]:
# To fill modifiedtitle column in order to use it to easly find relevant subtitle files
dfratedmovies['modifiedtitle'] = dfratedmovies['title'].apply(modifier)

In [48]:
# To fill filename column with appropriate file names
dfratedmovies['filename'] = dfratedmovies.apply(findFileName, axis=1)

In [111]:
# Writing data frame into csv file
dfratedmovies.to_csv('ratedmovies.csv')

In [5]:
# dfratedmovies.drop('Unnamed: 0', axis=1, inplace=True)
dfratedmovies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,filename,modifiedtitle
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,944919407,,toy story
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,858275452,,toy story
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,833981871,,toy story
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,943497887,,toy story
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,1230858821,,toy story


## Data Reading

In [40]:
#either call read_csv with index_col=0 odor call to_csv with index=False
dfratedmovies = pd.read_csv('./ratedmovies.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
print dfratedmovies[pd.isnull(dfratedmovies['filename']) == False].shape
print dfratedmovies[pd.isnull(dfratedmovies['filename']) == True].shape

(6116353, 8)
(13883910, 8)


In [42]:
print len(dfratedmovies[pd.isnull(dfratedmovies['filename']) == False]['movieId'].unique())
print len(dfratedmovies[pd.isnull(dfratedmovies['filename']) == False]['filename'].unique())

1211
1019


In [45]:
df = dfratedmovies[pd.isnull(dfratedmovies['filename']) == False][['movieId', 'title', 'genres', 'filename',]].copy()
# df['genres'] = df.apply(selectGenre, axis=1)
df = df.drop_duplicates()
df['subtitles'] = np.nan

In [51]:
df['subtitles'] = df.apply(findSubtitles, axis=1)

In [52]:
df.head()

Unnamed: 0,movieId,title,genres,filename,subtitles
49695,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji.xml,"It' s just a pack of wolves .\nCome on , we' r..."
99590,6,Heat (1995),Action|Crime|Thriller,heat.xml,"Check , charge or cash ?\nCash .\nMake it out ..."
170830,11,"American President, The (1995)",Comedy|Drama|Romance,the_american_president.xml,Liberty' s moving .\nThe 10 : 15 event' s been...
203230,16,Casino (1995),Crime|Drama,casino.xml,"[ Man Narrating ]\nWhen you love someone , you..."
241291,18,Four Rooms (1995),Comedy,four_rooms.xml,[ Children ]\nShould auld acquaintance be forg...


In [53]:
df.to_pickle('genreClassification_multiclass.pkl')

# Genre Classification

In [2]:
df = pd.read_pickle('genreClassification.pkl')

In [3]:
X = df.drop('genres', axis=1)

In [4]:
y = df['genres']

In [5]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
print X_train.shape
print y_train.shape
print X_test.shape
print y_test.shape

(811, 4)
(811,)
(400, 4)
(400,)


In [7]:
print X_train['subtitles'].ix[14219196]

You wouldn' t have a match ... by any chance , would you ?
Oh , oh , oh !
The airgot to it .
The air got to it !
Come on , come on .
Come on .
[ Gasping ]
I did it .
I did it .
Fire !
Thereyougo !
Light it up !
Come on !
# The time to hesitate is through #
Ouch !
[ Laughing ]
Ouch !
# No time to wallow in the mire #
# Try now , we can only lose #
# ' Cause girl we couldn' t get much higher #
# Come on , baby light my fire ##
Here you go !
Here you go !
It' s a signal fire !
And it spells out S. O. S !
Whoa !
It' s a meteor shower !
Fireflies !
Go !
Run !
You' re free !
You' re free !
Ouch !
Ouch !
Yeah !
Yes !
Look what I have created !
I have made fire !
I ... have made fire !
Mmm !
You gotta love crab .
I n the nick oftime too .
I couldn' t take much more ofthose coconuts .
Coconut milk' s a natural laxative .
Things that Gilligan never told us .
Oh .
Pretty well- made fire , huh , Wilson ?
[ Thunderclap ]
So ...
Wilson .
We were en route ... from Memphis for 1 1 and a half hours .
A

## Data Preprocessing

In [8]:
fstopwords = open('nltk_data/corpora/stopwords/english', 'r')
stopwords = fstopwords.read().split('\n')

In [9]:
vectorizer = TfidfVectorizer(analyzer='word', min_df=15, max_features=1000, norm='l2',
                             stop_words=stopwords, use_idf=True, smooth_idf=True)
X_train_vec = vectorizer.fit_transform(X_train['subtitles'])

In [28]:
X_train_vec[0]
# X_train_vec.eliminate_zeros()

<1x1000 sparse matrix of type '<type 'numpy.float64'>'
	with 372 stored elements in Compressed Sparse Row format>

In [29]:
X_train_vec.nnz / float(X_train_vec.shape[0])

497.905055487053

In [30]:
print X_train_vec[0]
# print X_train_vec.getrow(0)
# vectorized[0]

  (0, 960)	0.0233423422341
  (0, 93)	0.0436701161784
  (0, 128)	0.0783606547284
  (0, 402)	0.0197635227011
  (0, 485)	0.0383185046013
  (0, 245)	0.015815421352
  (0, 923)	0.0781770781618
  (0, 539)	0.10875733891
  (0, 256)	0.0469074003787
  (0, 387)	0.0305809695705
  (0, 956)	0.0119600301367
  (0, 680)	0.0372582887463
  (0, 938)	0.0536467627951
  (0, 120)	0.0141531392941
  (0, 623)	0.0124714927662
  (0, 718)	0.0708277018332
  (0, 461)	0.0431926854709
  (0, 647)	0.00927788417793
  (0, 819)	0.0353732804824
  (0, 484)	0.0157673978251
  (0, 428)	0.127510498296
  (0, 728)	0.0215420101817
  (0, 61)	0.0264950477558
  (0, 949)	0.0128240786482
  (0, 249)	0.0442421106926
  :	:
  (0, 390)	0.0111591942828
  (0, 89)	0.0301537268601
  (0, 902)	0.0130848777137
  (0, 266)	0.0327748419887
  (0, 839)	0.0153466017465
  (0, 928)	0.0333503186399
  (0, 507)	0.0136685466708
  (0, 305)	0.0179897145356
  (0, 398)	0.0181139326258
  (0, 706)	0.00954509646368
  (0, 911)	0.0117301578276
  (0, 935)	0.0374584036267


In [31]:
vectorized = []

In [32]:
for vec in X_train_vec:
    vectorized.append(list(vec.indices))    

In [34]:
# Sequence padding
vectorized = pad_sequences(vectorized, maxlen=1000, value=0.)
vectorized = pad_sequences(vectorized, maxlen=1000, value=0.)

In [35]:
# Converting labels to binary vectors
le = preprocessing.LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.fit_transform(y_test)

y_train_categorical = to_categorical(y_train_transformed, nb_classes=17)
y_test_categorical = to_categorical(y_test_transformed, nb_classes=17)

In [37]:
# Network building
net = tflearn.input_data([None, 1000])
net = tflearn.embedding(net, input_dim=1000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 17, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

In [38]:
# Training
model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir='./logs/')
model.fit(vectorized, y_train_categorical, validation_set=0.1, show_metric=True, batch_size=32, run_id='dnn_test1d')

Training Step: 230  | total loss: [1m[32m1.92638[0m[0m
| Adam | epoch: 010 | loss: 1.92638 - acc: 0.2614 | val_loss: 1.88131 - val_acc: 0.2927 -- iter: 729/729
Training Step: 230  | total loss: [1m[32m1.92638[0m[0m
| Adam | epoch: 010 | loss: 1.92638 - acc: 0.2614 | val_loss: 1.88131 - val_acc: 0.2927 -- iter: 729/729
--


In [45]:
print np.max(vectorized), np.min(vectorized)
print np.max(y_train), np.min(y_train)
print np.max(trainX), np.min(trainX)
print np.max(trainY), np.min(trainY)

67928 2
1.0 0.0
9999 0
1.0 0.0
