In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('kaggle_movie_train.csv')

In [4]:
df.shape

(22579, 3)

In [5]:
df.head()

Unnamed: 0,id,text,genre
0,0,"eady dead, maybe even wishing he was. INT. 2ND...",thriller
1,2,"t, summa cum laude and all. And I'm about to l...",comedy
2,3,"up Come, I have a surprise.... She takes him ...",drama
3,4,ded by the two detectives. INT. JEFF'S APARTME...,thriller
4,5,"nd dismounts, just as the other children reach...",drama


In [6]:
df.genre.value_counts(1)*100

drama        39.297577
thriller     30.222773
comedy       13.025378
action       10.593915
sci-fi        2.714912
horror        2.019576
other         1.195801
adventure     0.651047
romance       0.279020
Name: genre, dtype: float64

In [7]:
df.loc[1]['text']

"t, summa cum laude and all. And I'm about to launch a brand new magazine called EXPOSED! An homage to Miss Julie Conroy of Xenia, Ohio. Julie grins. JULIE I know where you can find an excellent editor in chief. TED Yellow pages? JULIE Let your fingers do the walking. Suddenly the music changes. People. Ted grins. TED They're playing our song. extending his hand Dare I ask for this dance? JULIE taking his hand You better. Ted and Julie begin dancing and kissing in the b.g. Charlie and Jimmy feign tears. CHARLIE I'm a sucker for a happy ending. hugging Jimmy Hold me. And we start to RISE AGAIN, above the NELSON HOUSE, into the clouds above Xenia... TED V.O. So, as you can guess, everybody pretty much lived happily ever after. My parents didn't give up the grocery store... We descend through clouds and quickly find we're... EXT. LONDON BUCKINGHAM PALACE DAY Mom and Dad take pictures and smooch in front of the palace. TED V.O. ...but they did manage to sneak away for a second honeymoon. O

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22579 entries, 0 to 22578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      22579 non-null  int64 
 1   text    22579 non-null  object
 2   genre   22579 non-null  object
dtypes: int64(1), object(2)
memory usage: 529.3+ KB


### Preprocess

In [9]:
df.isnull().sum()

id       0
text     0
genre    0
dtype: int64

In [10]:
df = df.drop('id',axis=1)

In [11]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()
corpus=[]
for i in range(0,len(df)):
    message = re.sub('[^a-zA-Z]',' ',df['text'][i])
    message = message.lower()
    message = message.split()
    message = [ps.stem(word) for word in message if word not in set(stopwords.words('english'))]
    message = ' '.join(message)
    corpus.append(message)

In [12]:
len(corpus)

22579

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus,df['genre'],test_size=0.30,random_state=42)

In [14]:
len(X_train)

15805

### Convert Word to Vector

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [16]:
X_train_vect.shape

(15805, 36169)

In [17]:
X_test_vect.shape

(6774, 36169)

In [18]:
#dump tfidf
import pickle
pickle.dump(vectorizer, open('tfidf-transform.pkl', 'wb'))

In [19]:
from imblearn.over_sampling import SMOTE
x_resample, y_resample = SMOTE().fit_sample(X_train_vect, y_train)
# lets print the shape of x and y after resampling it
print(x_resample.shape)
print(y_resample.shape)

(55926, 36169)
(55926,)


In [20]:
x_test_resample, y_test_resample = SMOTE().fit_sample(X_test_vect, y_test)

### Model

In [21]:
from sklearn.metrics import accuracy_score, classification_report

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf.fit(x_resample, y_resample)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [23]:
y_pred = clf.predict(x_test_resample)
accuracy_score(y_test_resample,y_pred)

0.9550791859930634

In [24]:
cr = classification_report(y_test_resample, y_pred)
print(cr)

              precision    recall  f1-score   support

      action       0.98      0.96      0.97      2659
   adventure       1.00      0.96      0.98      2659
      comedy       0.97      0.94      0.96      2659
       drama       0.81      0.93      0.86      2659
      horror       0.99      0.97      0.98      2659
       other       1.00      0.99      0.99      2659
     romance       1.00      0.96      0.98      2659
      sci-fi       0.98      0.97      0.97      2659
    thriller       0.89      0.93      0.91      2659

    accuracy                           0.96     23931
   macro avg       0.96      0.96      0.96     23931
weighted avg       0.96      0.96      0.96     23931



In [None]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier().fit(x_resample, y_resample)

In [None]:
y_pred = lgb.predict(x_test_resample)
accuracy_score(y_test_resample,y_pred)

In [74]:
cr = classification_report(y_test_resample, y_pred)
print(cr)

              precision    recall  f1-score   support

      action       0.96      0.93      0.94      2659
   adventure       1.00      0.99      0.99      2659
      comedy       0.93      0.91      0.92      2659
       drama       0.77      0.93      0.84      2659
      horror       1.00      0.94      0.97      2659
       other       1.00      0.94      0.97      2659
     romance       1.00      0.93      0.96      2659
      sci-fi       0.97      0.93      0.95      2659
    thriller       0.81      0.88      0.84      2659

    accuracy                           0.93     23931
   macro avg       0.94      0.93      0.93     23931
weighted avg       0.94      0.93      0.93     23931



In [67]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_resample, y_resample)

MultinomialNB()

In [68]:
y_pred = mnb.predict(x_test_resample)
accuracy_score(y_test_resample,y_pred)

0.9636872675609043

In [63]:
cr = classification_report(y_test_resample, y_pred)
print(cr)

              precision    recall  f1-score   support

      action       0.97      0.97      0.97      2659
   adventure       1.00      0.98      0.99      2659
      comedy       0.95      0.95      0.95      2659
       drama       0.92      0.87      0.90      2659
      horror       0.96      1.00      0.98      2659
       other       1.00      0.99      0.99      2659
     romance       1.00      1.00      1.00      2659
      sci-fi       0.96      0.99      0.98      2659
    thriller       0.91      0.91      0.91      2659

    accuracy                           0.96     23931
   macro avg       0.96      0.96      0.96     23931
weighted avg       0.96      0.96      0.96     23931



### Predict

In [None]:
strr = input("Enter a Message: ")
examples = strr

#preprocess
a = re.sub('[^a-zA-Z]',' ',examples)
a = a.lower()
a = a.split()
a = [ps.stem(word) for word in a if word not in set(stopwords.words('english'))]
a = ' '.join(a)  
print(a)
#apply
example_counts = vectorizer.transform([a])
prediction =clf.predict(example_counts)
prediction[0]

In [65]:
#dumb file
import pickle
filename = 'gener_logistic.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [66]:
#dumb file
import pickle
filename = 'gener_mnb.pkl'
pickle.dump(mnb, open(filename, 'wb'))