In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle


In [3]:
movie = pd.read_csv('train_data.txt', delimiter=':::', names=['Index', 'Name', 'Genre', 'Desc'], engine='python')


In [29]:
movie.shape

(54214, 4)

In [4]:
movie.head()


Unnamed: 0,Index,Name,Genre,Desc
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [5]:
m1 = movie

In [6]:
m1.head()

Unnamed: 0,Index,Name,Genre,Desc
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [7]:
X = movie.drop('Genre', axis = 1)

In [8]:
X = X.drop('Index', axis = 1)

In [9]:
X.head()

Unnamed: 0,Name,Desc
0,Oscar et la dame rose (2009),Listening in to a conversation between his do...
1,Cupid (1997),A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...
3,The Secret Sin (1915),To help their unemployed father make ends mee...
4,The Unrecovered (2007),The film's title refers not only to the un-re...


In [10]:
X['Text'] = X['Name'] + ' ' + X['Desc']

In [11]:
X.head()

Unnamed: 0,Name,Desc,Text
0,Oscar et la dame rose (2009),Listening in to a conversation between his do...,Oscar et la dame rose (2009) Listening in t...
1,Cupid (1997),A brother and sister with a past incestuous r...,Cupid (1997) A brother and sister with a pa...
2,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...,"Young, Wild and Wonderful (1980) As the bus..."
3,The Secret Sin (1915),To help their unemployed father make ends mee...,The Secret Sin (1915) To help their unemplo...
4,The Unrecovered (2007),The film's title refers not only to the un-re...,The Unrecovered (2007) The film's title ref...


In [12]:
X1 = X['Text']

In [13]:
X1.head()

0     Oscar et la dame rose (2009)   Listening in t...
1     Cupid (1997)   A brother and sister with a pa...
2     Young, Wild and Wonderful (1980)   As the bus...
3     The Secret Sin (1915)   To help their unemplo...
4     The Unrecovered (2007)   The film's title ref...
Name: Text, dtype: object

In [14]:
Y = movie['Genre']

In [32]:
Y.unique()

array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [15]:
Y.head()

0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: Genre, dtype: object

In [16]:
X = X.where((pd.notnull(X)), '')

In [17]:
Y.head()

0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: Genre, dtype: object

In [18]:
Y.head()

0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: Genre, dtype: object

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y, random_state= 2, test_size= 0.2)

In [20]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)




In [21]:
X_train_features


<43371x118475 sparse matrix of type '<class 'numpy.float64'>'
	with 2069680 stored elements in Compressed Sparse Row format>

In [22]:
classifier = LogisticRegression()
classifier.fit(X_train_features, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
Y_pred = classifier.predict(X_train_features)
accuracy = accuracy_score(Y_pred, Y_train)
accuracy


0.7166078716192847

In [41]:
input = ['abcdsdhfaiufhaiufgasuifgafiuavfiuabfiuasdfhiuasdfhuioasdhiuoasdhauiosdhouasidhoiasdhasiodhasd']
input_features = feature_extraction.transform(input)
prediction = classifier.predict(input_features)


In [42]:
print(prediction)

[' drama ']


In [43]:
filename = 'feature_extraction.sav'
pickle.dump(feature_extraction, open(filename, 'wb'))

In [28]:
filename = 'movie_model.sav'
pickle.dump(classifier, open(filename, 'wb'))