**Import the necessary libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import imblearn
import matplotlib.pyplot as plt
%matplotlib inline 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

print("Reading data")
print(os.listdir("../input"))

##For data preprocessing
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

##For Machine Learning
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
# Any results you write to the current directory are saved as output.

**Load the dataset and view the data**

In [None]:
# Load the movie-plot dataset from the kernel's input directory.
df = pd.read_csv("../input/wiki_movie_plots_deduped.csv")
# Print the column / dtype / non-null summary first ...
df.info()
# ... and keep the preview as the cell's last expression so Jupyter renders it
# (a bare `df.tail()` in the middle of a cell is evaluated and thrown away).
df.tail()

**Getting rid of the movies whose Genre is unknown**

In [None]:
# Turn the literal placeholder 'unknown' into NaN, then drop every row whose
# Genre is missing, all in one chained expression.
df = (
    df
    .assign(Genre=df['Genre'].replace('unknown', np.nan))
    .dropna(subset=['Genre'], axis=0)
)
print(df.tail())

**Keeping only the Top 20 Movie Genres in terms of occurrence**

In [None]:
# Size of the frame before filtering: row count, then (rows, columns).
print(df.shape[0])
print(df.shape)
# `a`: the 20 most frequent genres with their counts (value_counts sorts descending).
a = df['Genre'].value_counts().iloc[:20]
# `b`: just the genre labels, in the same order.
b = list(a.index)
print(b)
# Keep only rows whose genre is one of those 20 and renumber the rows from 0.
df = df.loc[df['Genre'].isin(b)].reset_index(drop=True)



**Plot the number of occurrences of the most commonly occurring genres**

In [None]:
sns.set(style="white")
# Two-column table (genre label, occurrence count) built from the top-genre Series `a`.
genre_to_count = pd.DataFrame({'Genre': a.index, 'Count': a.to_numpy()})
plt.figure(figsize=(15, 10))
# Horizontal bars: one per genre, length = number of movies.
sns.barplot(data=genre_to_count, x="Count", y="Genre", palette="Blues_d")

Relabeling (merging near-duplicate genre labels):
- "comedy, drama" → comedy
- "romantic comedy" → romance
- "crime drama" → crime
- "sci-fi" → science fiction


In [None]:
# Fold near-duplicate genre spellings into a single canonical label.
genre_aliases = {
    'comedy, drama': 'comedy',
    'romantic comedy': 'romance',
    'crime drama': 'crime',
    'sci-fi': 'science fiction',
}
df['Genre'] = df['Genre'].replace(genre_aliases)

# Columns that are removed before modelling.
unused_columns = ['Release Year', 'Title', 'Origin/Ethnicity', 'Cast', 'Director', 'Wiki Page']
df = df.drop(columns=unused_columns)

In [None]:
# Merge fine-grained genres into broader parent genres (fine label -> parent label).
coarse_genre_of = {
    'war': 'action',
    'animation': 'family',
    'musical': 'family',
    'mystery': 'thriller',
    'film noir': 'crime',
    'western': 'action',
    'adventure': 'family',
    'horror': 'thriller',
}
df['Genre'] = df['Genre'].replace(coarse_genre_of)

In [None]:
sns.set(style='white')
fig, ax = plt.subplots(figsize=(6, 10))
# One horizontal bar per genre: number of rows carrying that label after relabeling.
sns.countplot(data=df, y="Genre", palette="Blues_d", ax=ax)

In [None]:
_ENGLISH_STOPWORDS = None  # lazily-built cache shared by every plotToWords call


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it from the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def plotToWords(raw_plot, stops=None):
    """Normalise one plot summary into a space-separated string of content words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and stop words are removed.

    Parameters
    ----------
    raw_plot : str
        The raw plot text.
    stops : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list, which is
        loaded once and reused instead of being rebuilt on every call.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    if stops is None:
        stops = _english_stopwords()
    letters_only = re.sub("[^a-zA-Z]", " ", raw_plot)
    return " ".join(w for w in letters_only.lower().split() if w not in stops)

def preprocess(dataframe):
    """Clean the 'Plot' column of `dataframe` with `plotToWords`.

    The column is overwritten in place on the frame that was passed in, and
    that same frame is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Plot' column of strings.

    Returns
    -------
    pandas.DataFrame
        `dataframe` itself, with 'Plot' replaced by the cleaned text.
    """
    # Iterate the column directly: `dataframe.iloc[i]['Plot']` materialises a
    # whole row for every access. The list is assigned positionally, exactly
    # like the original loop's result.
    dataframe['Plot'] = [plotToWords(plot) for plot in dataframe['Plot']]
    return dataframe

# Clean every plot, then print the first ten cleaned plots as a sanity check.
df = preprocess(df)
print(df["Plot"].iloc[:10])

## Benchmark before oversampling

In [None]:

# Hold-out split of cleaned plots (inputs) and genres (targets); the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'],
    df['Genre'],
    random_state=0,
)


In [None]:

from sklearn.preprocessing import LabelEncoder
# TF-IDF over uni-, bi- and tri-grams, capped at 6000 features.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 3),
    max_features=6000,
)
labels = df['Genre']
# Dense feature matrix for every plot in the frame (fitted on the full frame).
features = tfidf.fit_transform(df['Plot']).toarray()


In [None]:

# Classifiers compared in the benchmark.
models = [
    LinearSVC(multi_class='ovr'),
    MultinomialNB(),
    LogisticRegression(random_state=32, multi_class='ovr'),
    RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=42),
]
CV = 5  # number of cross-validation folds

# One (model_name, fold_idx, accuracy) record per model and fold.
# (The original pre-allocated an empty `cv_df` that was overwritten unread.)
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Two separate figures: a box plot and a strip plot of the per-fold accuracies.
fig, ax1 = plt.subplots(figsize=(6, 10))
fig, ax2 = plt.subplots(figsize=(6, 10))
sns.boxplot(ax=ax1, x='model_name', y='accuracy', data=cv_df)
sns.stripplot(ax=ax2, x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Mean cross-validated accuracy per classifier.
cv_df.groupby('model_name')['accuracy'].mean()

## Oversample randomly (with back-translation)


In [None]:
!pip install google_trans_new

In [None]:
from google_trans_new import google_translator
translator = google_translator()


def translatePlot(x, pivot_lang='de'):
    """Paraphrase `x` by back-translation: to `pivot_lang`, then back to English.

    Parameters
    ----------
    x : str
        Text to paraphrase.
    pivot_lang : str, optional
        Language code of the intermediate language. The original code passed
        'ge', which is not an ISO 639-1 code (German is 'de', Georgian is 'ka').
        NOTE(review): German is assumed to be the intended pivot - confirm.

    Returns
    -------
    The value returned by the second `translator.translate` call, i.e. the
    English rendering of the pivot-language text.
    """
    # The debug `print(x)` was dropped: it echoed every plot, which floods the
    # output once this is applied to thousands of samples.
    pivot_text = translator.translate(x, lang_tgt=pivot_lang)
    return translator.translate(pivot_text, lang_tgt='en')



In [None]:
# Try the back-translation on the plot stored under row label 1.
sample_plot = df.loc[1, 'Plot']
translatePlot(sample_plot)

In [None]:
def generateSamples(df, augment=None):
    """Oversample the minority genres up to the size of the 'drama' class.

    For every genre in the built-in minority list that occurs in `df`, rows of
    that genre are drawn with replacement (fixed seed) until the genre would
    have as many rows as 'drama'. Each drawn plot is passed through `augment`
    and the new rows are appended after the original ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Plot' and 'Genre' columns.
    augment : callable, optional
        Maps one plot string to its augmented version. Defaults to
        `translatePlot` (back-translation).

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the augmented samples. The input frame
        is not modified.
    """
    if augment is None:
        augment = translatePlot
    minor_class = ['comedy', 'romance', 'horror', 'action', 'crime', 'thriller', 'western',
                   'science fiction', 'adventure', 'musical', 'film noir', 'mystery', 'war',
                   'animation', 'family']
    majority_class = 'drama'
    majority_count = int((df.Genre == majority_class).sum())
    present = set(df.Genre.unique())

    # Collect the pieces and concatenate once instead of growing `df` per genre.
    pieces = [df]
    for genre in minor_class:
        if genre not in present:
            continue
        genre_rows = df[df.Genre == genre]
        sample_size = majority_count - len(genre_rows)
        if sample_size <= 0:
            # Already at least as large as the majority class; `.sample` would
            # raise on a negative size.
            continue
        temp = genre_rows.sample(sample_size, random_state=2, replace=True)
        # Augment plot by plot: the original handed the whole Series to the
        # translator in a single call. A list is assigned so the values land
        # positionally even though sampling with replacement repeats labels.
        temp = temp.assign(Plot=[augment(plot) for plot in temp['Plot']])
        pieces.append(temp)
    if len(pieces) == 1:
        return df
    return pd.concat(pieces)
        
      
        
        
    

## Classifying with Machine Learning

## Use models for - 
* Linear Support Vector Classifier
* Multinomial Naive Bayes 
* Logistic Regression
* Random Forest Classifier
* Plot how each of them performs on the dataset


In [None]:

# Example keyword arguments for a tree-ensemble estimator.
example_params = dict(
    n_estimators=100,
    max_depth=5,
    random_state=13,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
# Classifiers scored by `score_model`; the random forest here uses 10 trees.
models = [
    LinearSVC(multi_class='ovr'),
    MultinomialNB(),
    LogisticRegression(multi_class='ovr', random_state=32),
    RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=42,
    ),
]

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
def score_model(d):
    """Cross-validate every classifier in the global `models` with per-fold oversampling.

    For each of 5 shuffled folds, only the training part is oversampled with
    `generateSamples`, and the TF-IDF vocabulary and label encoding are fitted
    on that oversampled training part. Each model is then fitted and scored on
    the untouched validation part.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain 'Plot' and 'Genre' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns 'model_name' and 'accuracy',
        grouped by model in the order of `models`. A box plot and a strip plot
        of the accuracies are shown as a side effect.
    """
    CV = 5
    cv = KFold(n_splits=CV, random_state=42, shuffle=True)
    label_enc = LabelEncoder()
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                            ngram_range=(1, 3), max_features=4000)

    # Folds form the outer loop so that resampling, back-translation and
    # vectorisation run once per fold rather than once per (model, fold);
    # the seeded KFold yields the same folds for every model anyway.
    scored = []  # (position of the model in `models`, model name, accuracy)
    for train_fold_index, test_fold_index in cv.split(d.Plot, d.Genre):
        train_fold = d[['Plot', 'Genre']].iloc[train_fold_index]
        X_val_fold = d['Plot'].iloc[test_fold_index]
        y_val_fold = d['Genre'].iloc[test_fold_index]

        d_new = generateSamples(train_fold)
        features = tfidf.fit_transform(d_new.Plot).toarray()
        labels = label_enc.fit_transform(d_new.Genre)
        X_test = tfidf.transform(X_val_fold).toarray()
        y_true = label_enc.transform(y_val_fold)

        for position, model in enumerate(models):
            model.fit(features, labels)
            prediction = model.predict(X_test)
            scored.append((position, model.__class__.__name__,
                           accuracy_score(y_true, prediction)))

    # Stable sort restores the original model-major row order.
    scored.sort(key=lambda record: record[0])
    cv_df = pd.DataFrame([(name, score) for _, name, score in scored],
                         columns=['model_name', 'accuracy'])
    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    sns.stripplot(x='model_name', y='accuracy', data=cv_df,
                  size=8, jitter=True, edgecolor="gray", linewidth=2)
    plt.show()
    return cv_df
    

In [None]:
# Score every classifier with per-fold oversampling; also draws the accuracy plots.
cv_df = score_model(df)

In [None]:
# Mean accuracy across folds for each classifier.
cv_df.groupby('model_name')['accuracy'].mean()

## If we test on our oversampled data as well - 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'], df['Genre'], random_state=0
)
# Oversample the training portion only; `d` holds the original plus generated rows.
d = generateSamples(pd.concat([X_train, y_train], axis=1))
X_train, y_train = d['Plot'], d['Genre']
# Class distribution after oversampling.
fig, ax = plt.subplots(figsize=(6, 10))
sns.countplot(data=d, y="Genre", palette="Blues_d", ax=ax)


In [None]:
# TF-IDF over uni-, bi- and tri-grams of the oversampled plots, capped at 4000 features.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 3),
    max_features=4000,
)
labels = d['Genre']
features = tfidf.fit_transform(d['Plot']).toarray()
features.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
# Classifiers evaluated on the oversampled feature matrix.
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds

# One (model_name, fold_idx, accuracy) record per model and fold.
# (The original pre-allocated an empty `cv_df` that was overwritten unread.)
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot with the individual fold accuracies drawn on top.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Per-fold accuracies (model_name, fold_idx, accuracy) from the cell above.
cv_df

**The average accuracies are:**

In [None]:
# Average the fold accuracies within each classifier.
cv_df.groupby('model_name')['accuracy'].mean()


## Adjust Weights of Minority Class

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df['Plot'], df['Genre'], random_state=5
)

# TF-IDF features of the original (not oversampled) plots, capped at 4000 features.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 3),
    max_features=4000,
)
labels = df['Genre']
features = tfidf.fit_transform(df['Plot']).toarray()

In [None]:
from sklearn.utils import class_weight

# Rebalance through class weights instead of resampling.
# The original cell carried a triple-quoted string holding commented-out code
# that computed the weights by hand; it was dead code, and reviving it would
# have rebound the imported `class_weight` module to an array.
model = LogisticRegression(class_weight='balanced')
accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=10)
model_name = model.__class__.__name__
# One (model_name, fold_idx, accuracy) record per fold.
entries = [
    (model_name, fold_idx, accuracy)
    for fold_idx, accuracy in enumerate(accuracies)
]
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

cv_df.groupby('model_name').accuracy.mean()



## LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
# English stop words from NLTK, as a set for fast membership checks.
STOPWORDS = set(stopwords.words('english'))

# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

Hyperparameters

In [None]:
# Show the current working frame as this cell's output.
df

In [None]:
vocab_size = 6000  # Tokenizer `num_words` and the Embedding layer's input size
embedding_dim = 64  # Embedding output size; also reused as the LSTM and hidden Dense width
max_length = 300  # every sequence is padded / truncated to this many tokens
trunc_type = 'post'  # `truncating` argument of pad_sequences
padding_type = 'post'  # `padding` argument of pad_sequences
oov_tok = '<OOV>'  # Tokenizer `oov_token`
training_portion = .8  # fraction of the samples, taken in list order, used for training

In [None]:
# Peek at the first row's first column (the loop below reads column 0 as the genre).
df.iloc[0,0]

In [None]:
# Parallel lists: genres[i] is the label of plots[i].
# Columns are addressed by name; the original used positions 0 and 1, which
# silently depends on the column order left after the earlier drop.
genres = df['Genre'].tolist()

# Remove stop words token by token. The original replaced ' word ' substrings,
# which misses stop words at the start or end of the text and adjacent stop
# words, and its `replace(' ', ' ')` clean-up was a no-op. On text already
# cleaned by `plotToWords` both approaches leave the plot unchanged.
plots = [
    ' '.join(word for word in plot.split() if word not in STOPWORDS)
    for plot in df['Plot']
]
print(len(genres))
print(len(plots))

In [None]:
# Inspect the first processed plot.
plots[0]

In [None]:
# Sequential (unshuffled) split: the leading `training_portion` of the samples
# is used for training and the remainder for validation.
train_size = int(training_portion * len(plots))

train_plots, validation_plots = plots[:train_size], plots[train_size:]
train_labels, validation_labels = genres[:train_size], genres[train_size:]

# Sanity-check the sizes of each piece.
for size in (train_size, len(train_plots), len(train_labels),
             len(validation_plots), len(validation_labels)):
    print(size)

In [None]:
# Fit the vocabulary on the training plots only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_plots)
word_index = tokenizer.word_index
# Preview the first ten (word -> index) entries.
{word: index for word, index in list(word_index.items())[:10]}

In [None]:
# Convert every training plot to a list of integer token ids using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_plots)
# Show one encoded example.
print(train_sequences[10])

In [None]:
# The same padding / truncation settings are used for both splits.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

train_padded = pad_sequences(train_sequences, **pad_options)
# Compare raw vs. padded length for a few training samples.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))

# Encode and pad the validation plots with the tokenizer fitted on the training plots.
validation_sequences = tokenizer.texts_to_sequences(validation_plots)
validation_padded = pad_sequences(validation_sequences, **pad_options)

print(len(validation_sequences))
print(len(validation_padded[0]))

In [None]:
# Print the first padded validation sequence.
print(validation_padded[0])

In [None]:

# Encode the genre strings as integers and shape them as a column vector.
le = LabelEncoder()
Y = le.fit_transform(genres).reshape(-1, 1)
# Same sequential cut as the plots: the first `train_size` rows are training labels.
train_Y, val_Y = Y[:train_size], Y[train_size:]

In [None]:

# One output unit per genre known to the label encoder `le`, instead of a
# hard-coded 8 that silently breaks if the set of genres changes.
num_classes = len(le.classes_)

model = tf.keras.Sequential([
    # Embedding layer: input vocabulary of `vocab_size` token ids, each mapped
    # to a vector of `embedding_dim` values (both set in the hyperparameter cell).
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),

    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Output layer: softmax turns the `num_classes` outputs into a probability
    # distribution over the genres.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()

In [None]:
num_epochs = 10
# Integer class labels, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    train_padded,
    train_Y,
    validation_data=(validation_padded, val_Y),
    epochs=num_epochs,
    verbose=2,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against the epoch.

    Parameters
    ----------
    history : object with a `history` dict
        Maps metric names to per-epoch values, as returned by `model.fit`.
    string : str
        Metric name, e.g. "loss" or "acc". "acc" and "accuracy" are treated as
        aliases: if the requested one is missing, the other is used, so the
        call works whichever of the two names the history was recorded under.

    Raises
    ------
    KeyError
        If neither the metric nor its alias is present in `history.history`.
    """
    aliases = {'acc': 'accuracy', 'accuracy': 'acc'}
    key = string
    if key not in history.history and aliases.get(key) in history.history:
        key = aliases[key]
    plt.plot(history.history[key])
    plt.plot(history.history['val_' + key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
  
# Training vs. validation curves for accuracy and for loss.
# NOTE(review): the metric is registered as 'accuracy' in model.compile; confirm the history key matches "acc".
plot_graphs(history, "acc")
plot_graphs(history, "loss")