In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data import and first analysis
 
Let's start by importing the data and taking a first look at it.

In [None]:
import pandas as pd



# Read the CSV and discard its 'Unnamed: 0' column in a single chained step.
df = pd.read_csv('/kaggle/input/marvel-vs-dc-imdb-dataset/Marvel_DC_imdb.csv').drop(columns=['Unnamed: 0'])

# Preview the first rows, then end the cell with the numeric summary.
print(df.head())
df.describe()

There are a lot of NaNs (missing values). We can visualize them easily.

In [None]:
import seaborn as sns

# Heatmap of the missing-value mask: one row per record, one column per field.
sns.heatmap(df.isna())

The only numeric features are the IMDB scores and the metascores. I will create another one to distinguish Marvel from DC with a boolean flag.

In [None]:
# Boolean flag: True where Category is 'Marvel', False for every other row.
df['MvsDC'] = df['Category'].eq('Marvel')
print(df.head())

In [None]:
# Split the frame in two using the boolean MvsDC flag.
df_marvel = df[df['MvsDC']]
df_DC = df[~df['MvsDC']]
# NOTE: this cell used to call .dropna() on each subset's IMDB_Score column and
# discard the result, which changed nothing. Those no-op calls are removed;
# pandas summary statistics skip NaN values on their own.
print(df_DC.head())

In [None]:
# Column dtypes and non-null counts for the Marvel subset.
df_marvel.info()

In [None]:
# Column dtypes and non-null counts for the rows whose Category is not 'Marvel'.
df_DC.info()

In [None]:
# Print the numeric summary of the Marvel subset first, then of the DC subset.
for summary_frame in (df_marvel, df_DC):
    print(summary_frame.describe())

We can see the averages of the two numerical features of the movies. In particular, the mean IMDB score is higher for DC than for Marvel, while the opposite holds for the metascore.

# Score classification by description

We go for a kind of crazy idea: we want to understand whether a superhero movie is good from its description alone. We will use the spaCy library for this, with the description column as the feature and the IMDB score as the target.

In [None]:
import spacy 
from math import floor

# Load spaCy's "en_core_web_sm" English pipeline; `nlp` is used later to parse the descriptions.
nlp = spacy.load("en_core_web_sm")

We need to provide the data in the correct format. For this multi-class classification problem I will use ten classes, from 0 to 9, which are the floor values of the IMDB scores. They are encoded as one-hot dictionaries in `votes`.

In [None]:
# Keep only rows that have both a score to learn from and a description to parse
# (nlp() cannot process a NaN description).
df = df.dropna(subset=['IMDB_Score', 'Description'])

# Run every description through the spaCy pipeline once.
descr = [nlp(doc) for doc in df['Description']]

# One class per integer part of the score: '0' ... '9'.
# (The previous hand-written list skipped '4'.)
score_keys = [str(score) for score in range(10)]

# One-hot encode floor(score) as a {label: bool} dictionary and wrap it in the
# {"cats": ...} structure used as the training annotation.
votes = [
    {"cats": {key: key == str(floor(imdb)) for key in score_keys}}
    for imdb in df['IMDB_Score']
]

# Sanity check on one example: named entities found in the description and its labels.
for ent in descr[30].ents:
    print(ent.text)

print(votes[30])

Here we arrange the data in the format spaCy expects: a list of (document, labels) tuples.

In [None]:
from sklearn.model_selection import train_test_split


X = descr
ylabels = votes

# Hold out 20% of the examples for evaluation. The previous test_size=0.001
# left only one or two examples in the test set, which makes any accuracy
# computed on it meaningless. A fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.2, random_state=1
)

# Pair each training document with its label dictionary: [(doc, {"cats": ...}), ...]
X_sp = list(zip(X_train, y_train))
print(X_sp[:1])

We prepare the model for training.

In [None]:
from spacy.util import minibatch


# Build (or reuse) an exclusive-class text classifier with one label per score bucket.
# Reusing an existing 'textcat' component keeps this cell safe to re-run:
# nlp.add_pipe() raises if a component with the same name is already in the pipeline.
if nlp.has_pipe('textcat'):
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat',
                  config={
                    "exclusive_classes": True,
                    "architecture": "ensemble"})
    # Labels are the integer part of the IMDB score, "0" through "9".
    for score_label in range(10):
        textcat.add_label(str(score_label))
    nlp.add_pipe(textcat)

textcat.labels


We define the training function. 

In [None]:
import random

def train(model, train_data, optimizer):
    """Run one pass (epoch) over ``train_data`` and return the accumulated losses.

    Args:
        model: Object exposing ``update(texts, labels, sgd=..., losses=...)``
            (in this notebook, the spaCy ``nlp`` pipeline).
        train_data: List of ``(text_or_doc, label_dict)`` tuples. It is shuffled
            **in place**, so successive calls start from the previous order.
        optimizer: Optimizer forwarded to ``model.update`` as ``sgd``.

    Returns:
        dict: The ``losses`` dictionary filled in by ``model.update``.
    """
    losses = {}
    # Shuffle with a private fixed-seed generator. This yields the same
    # permutation as seeding the module-level RNG with 1, but does not reset
    # the global random state for the rest of the notebook.
    random.Random(1).shuffle(train_data)

    for batch in minibatch(train_data, size=8):
        # Each batch is a list of (text, label) tuples; split it into two parallel tuples.
        texts, labels = zip(*batch)
        model.update(texts, labels, sgd=optimizer, losses=losses)

    return losses

In [None]:
# Only the text classifier should be initialised and updated. With the other
# pretrained components left enabled, begin_training() would reinitialise them
# and every update() call would also try to train them.
other_pipes = [name for name in nlp.pipe_names if name != 'textcat']
train_data = X_sp
n_iter = 5
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = train(nlp, train_data, optimizer)
        # Accumulated text-classifier loss for this epoch.
        print(losses['textcat'])

We have a trained model capable of guessing the votes of a superhero movie given its description. lol

In [None]:
def predict(nlp, docs):
    """Return the index of the highest-scoring class for each doc.

    The 'textcat' component is fetched from ``nlp`` and applied directly to
    ``docs``; the raw score matrix is printed before the arg-max is taken
    along axis 1.
    """
    classifier = nlp.get_pipe('textcat')
    class_scores, _unused = classifier.predict(docs)
    print(class_scores)

    return class_scores.argmax(axis=1)

def find_key(input_dict, value):
    """Return the first key of ``input_dict`` whose value equals ``value``, or None if there is none."""
    for key, candidate in input_dict.items():
        if candidate == value:
            return key
    return None

texts = X_test

# Recover the integer class of each held-out example from its one-hot label dict.
true_sc = [int(find_key(gold['cats'], True)) for gold in y_test]

predictions = predict(nlp, texts)
# Element-wise hit/miss flags; the fraction of hits is the accuracy.
true_pred = np.array(predictions == true_sc)
acc = sum(true_pred) / len(true_sc)

# Show the predicted label, the document and the true class side by side.
for pred_idx, doc, actual in zip(predictions, texts, true_sc):
    print(f"{textcat.labels[pred_idx]}: {doc} , true scores: {actual}  \n")

print('Accuracy = ', acc)

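The model seems to work very well! Keep in mind, though, that this accuracy is computed only on the held-out test split, so how trustworthy it is depends on how many descriptions that split contains.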

An alternative would be to use vector embeddings, which we have already thanks to spacy processing, and use sklearn with a support vector machine.

Now we are ready to give votes to invented superhero movie descriptions.

In [None]:
# A made-up description to score with the trained classifier.
desc_inv = 'Green lantern spends the whole day doing taxes calculations.'

# Only the tokenizer is applied here (not the full pipeline); predict() takes a list of docs.
docs_inv = [nlp.tokenizer(desc_inv)]

print(docs_inv)

# predict() returns the index of the highest-scoring class for each doc.
pred_inv= predict(nlp,docs_inv)

print(pred_inv)