In [None]:
import numpy as np 
import pandas as pd 

# Load the superheroes NLP dataset from the Kaggle input directory and preview it.
DATASET_PATH = '../input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head(n=3)

In [None]:
# Keep only the label column (creator) and the text column (history_text).
selected_columns = ['creator', 'history_text']
cdf = df[selected_columns]
cdf.head(n=3)

In [None]:
# Report missing values per column, followed by the number of rows per creator.
null_counts = cdf.isnull().sum()
creator_counts = cdf['creator'].value_counts()
print('Null Values', null_counts, '________________________', creator_counts, sep='\n')

In [None]:
# Drop every row that is missing the creator or the history text, then re-check.
cdf = cdf.dropna(axis=0, how='any')
for report in (cdf.isnull().sum(), '--------------------------------', cdf['creator'].value_counts()):
    print(report)

In [None]:
# Restrict the data to the two publishers being compared.
# Marvel rows come first, then DC rows; each group keeps its original row order.
mask1 = cdf[cdf['creator'].eq('Marvel Comics')]
mask2 = cdf[cdf['creator'].eq('DC Comics')]
cdf = pd.concat([mask1, mask2])
cdf

In [None]:
# Re-check missing values and the class counts after filtering to two creators.
print(
    cdf.isnull().sum(),
    '--------------------------------',
    cdf['creator'].value_counts(),
    sep='\n',
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many characters each creator contributes (class balance).
# The column is passed by keyword (`x=`): recent seaborn releases no longer
# accept the data vector as a positional argument.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=cdf['creator'], palette="plasma", ax=ax)
# Title and axis labels describe what is plotted, so the figure stands alone.
ax.set_title('Number of characters per creator')
ax.set_xlabel('Creator')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Containers for the model inputs (texts) and their labels (creators).
features, target = [], []

In [None]:
# Rebuild both lists from the current frame instead of appending to them, so
# re-running this cell cannot duplicate samples (duplicates would later end up
# on both sides of the train/test split).
# The lists are row-aligned: features[k] is the history text whose creator
# label is target[k].
features = cdf['history_text'].tolist()
target = cdf['creator'].tolist()
    

In [None]:
# Build one long text per creator for the word clouds.
# The histories are joined with a single space. Summing a Series of strings
# concatenates them with no separator, which glues the last word of one
# history to the first word of the next and creates bogus tokens.
Cloud_Marvel = ' '.join(cdf.loc[cdf['creator'] == 'Marvel Comics', 'history_text'])
Cloud_DC = ' '.join(cdf.loc[cdf['creator'] == 'DC Comics', 'history_text'])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud
# Word cloud built from the concatenated Marvel histories.
cloud_builder = WordCloud(background_color="black", max_font_size=300)
wordcloud = cloud_builder.generate(Cloud_Marvel)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud
# Word cloud built from the concatenated DC histories.
cloud_builder = WordCloud(background_color="black", max_font_size=300)
wordcloud = cloud_builder.generate(Cloud_DC)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()

# Modeling

> The problem is simple, so I'll use simple models.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Stopwords

In [None]:
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a sorted, de-duplicated list.
# - The corpus reader is imported under its own alias so that the `stopwords`
#   variable does not overwrite it. Binding both to the same name makes this
#   cell fail when re-run, because a plain collection has no `.words()`.
# - scikit-learn vectorizers expect `stop_words` to be a list (or 'english' /
#   None); recent releases reject a set during parameter validation.
stopwords = sorted(set(nltk_stopwords.words('english')))
stopwords

## Stemmer

In [None]:
from nltk.stem import PorterStemmer

# Shared Porter stemmer and the word analyzer of a default-configured CountVectorizer.
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the Porter-stemmed tokens of ``doc``.

    ``doc`` is split into tokens by the module-level ``analyzer`` and each
    token is passed through ``stemmer.stem``.
    """
    return map(stemmer.stem, analyzer(doc))


## Vectorizer

In [None]:

# Twelve feature sets are built from the same texts:
#   1-2   raw counts                (unigram / unigram+bigram)
#   3-4   term frequency, L1 norm   (unigram / unigram+bigram)
#   5-6   TF-IDF, L1 norm           (unigram / unigram+bigram)
#   7-12  the same six variants on Porter-stemmed tokens
#
# NOTE(review): every vectorizer is fitted on the full corpus, before the
# train/test split done in a later cell, so the vocabulary and IDF weights
# also see the test documents. Fit on the training texts only if an unbiased
# accuracy estimate is required.

# scikit-learn validates `stop_words` as a list (or 'english' / None), so the
# stop-word collection is normalized to a list once and reused below.
stop_word_list = sorted(stopwords)


def make_stemmed_analyzer(ngram_range=(1, 1)):
    """Build an analyzer that tokenizes, drops stop words, stems and emits n-grams.

    A callable passed as ``analyzer=`` replaces a vectorizer's whole analysis
    pipeline, so the vectorizer's own ``stop_words`` and ``ngram_range`` are
    not applied to it. Both steps are therefore performed here explicitly.

    Parameters
    ----------
    ngram_range : tuple (min_n, max_n)
        Inclusive range of n-gram sizes to emit.

    Returns
    -------
    callable
        Maps a document string to a list of space-joined stemmed n-grams.
    """
    base = CountVectorizer()
    preprocess = base.build_preprocessor()
    tokenize = base.build_tokenizer()
    stop_set = frozenset(stop_word_list)
    min_n, max_n = ngram_range

    def analyze(doc):
        # Stop words are filtered before stemming so they are compared in
        # their original (unstemmed) form.
        stems = [
            stemmer.stem(token)
            for token in tokenize(preprocess(doc))
            if token not in stop_set
        ]
        return [
            ' '.join(stems[start:start + size])
            for size in range(min_n, max_n + 1)
            for start in range(len(stems) - size + 1)
        ]

    return analyze


stem_unigrams = make_stemmed_analyzer((1, 1))
stem_bigrams = make_stemmed_analyzer((1, 2))

# Unigram and Bigram, raw counts: 1 & 2

vect1 = CountVectorizer(stop_words=stop_word_list)
vect2 = CountVectorizer(stop_words=stop_word_list, ngram_range=(1, 2))

X_t1 = vect1.fit_transform(features)
X_t2 = vect2.fit_transform(features)

# Unigram and Bigram with term frequency: 3 & 4

tf1 = TfidfVectorizer(use_idf=False, norm="l1", stop_words=stop_word_list)
tf2 = TfidfVectorizer(use_idf=False, norm="l1", stop_words=stop_word_list, ngram_range=(1, 2))

X_t3 = tf1.fit_transform(features)
X_t4 = tf2.fit_transform(features)

# Unigram and Bigram with TF-IDF: 5 & 6

tf_idf1 = TfidfVectorizer(norm="l1", stop_words=stop_word_list)
tf_idf2 = TfidfVectorizer(norm="l1", stop_words=stop_word_list, ngram_range=(1, 2))

X_t5 = tf_idf1.fit_transform(features)
X_t6 = tf_idf2.fit_transform(features)

# Unigram and Bigram stemmed, raw counts: 7 & 8

stem_vect1 = CountVectorizer(analyzer=stem_unigrams)
stem_vect2 = CountVectorizer(analyzer=stem_bigrams)

X_t7 = stem_vect1.fit_transform(features)
X_t8 = stem_vect2.fit_transform(features)

# Unigram and Bigram stemmed with term frequency: 9 & 10

stem_tf1 = TfidfVectorizer(analyzer=stem_unigrams, use_idf=False, norm="l1")
stem_tf2 = TfidfVectorizer(analyzer=stem_bigrams, use_idf=False, norm="l1")

X_t9 = stem_tf1.fit_transform(features)
X_t10 = stem_tf2.fit_transform(features)

# Unigram and Bigram stemmed with TF-IDF: 11 & 12

stem_tf_idf1 = TfidfVectorizer(analyzer=stem_unigrams, norm="l1")
stem_tf_idf2 = TfidfVectorizer(analyzer=stem_bigrams, norm="l1")

X_t11 = stem_tf_idf1.fit_transform(features)
X_t12 = stem_tf_idf2.fit_transform(features)

In [None]:
# Split every feature matrix against the same labels, using the same seed and
# the same 20% test fraction for all twelve variants.
feature_sets = [X_t1, X_t2, X_t3, X_t4, X_t5, X_t6, X_t7, X_t8, X_t9, X_t10, X_t11, X_t12]
splits = [
    train_test_split(matrix, target, test_size=.2, random_state=42)
    for matrix in feature_sets
]

# Regroup into four parallel lists; position k holds the split of feature set k + 1.
X_train, X_test, y_train, y_test = (list(group) for group in zip(*splits))

# Keep the individually numbered names available as well.
(X_train1, X_train2, X_train3, X_train4, X_train5, X_train6,
 X_train7, X_train8, X_train9, X_train10, X_train11, X_train12) = X_train
(X_test1, X_test2, X_test3, X_test4, X_test5, X_test6,
 X_test7, X_test8, X_test9, X_test10, X_test11, X_test12) = X_test
(y_train1, y_train2, y_train3, y_train4, y_train5, y_train6,
 y_train7, y_train8, y_train9, y_train10, y_train11, y_train12) = y_train
(y_test1, y_test2, y_test3, y_test4, y_test5, y_test6,
 y_test7, y_test8, y_test9, y_test10, y_test11, y_test12) = y_test

## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# One Multinomial Naive Bayes model per feature set; NB[k] is fitted on X_train[k].
NB = [
    MultinomialNB().fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each Naive Bayes model on its matching held-out split.
NB_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(NB, X_test, y_test)
]

print(NB_acc)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# One logistic regression (liblinear solver, fixed seed) per feature set.
LR = [
    LogisticRegression(solver='liblinear', random_state=42).fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each logistic regression model on its matching held-out split.
LR_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(LR, X_test, y_test)
]

print(LR_acc)

## XGBoost

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


class _LabelDecodingXGB:
    """XGBClassifier wrapper that accepts and returns the original string labels.

    Recent XGBoost releases require class labels to be integers in
    ``0..n_classes-1`` and reject string targets such as ``'Marvel Comics'``.
    This wrapper encodes the labels before fitting and decodes the predictions,
    so callers can keep passing and comparing the string labels.
    """

    def __init__(self, **xgb_params):
        # Keyword arguments are forwarded unchanged to xgb.XGBClassifier.
        self._encoder = LabelEncoder()
        self._model = xgb.XGBClassifier(**xgb_params)

    def fit(self, X, y):
        """Fit on feature matrix ``X`` and labels ``y``; returns ``self``."""
        self._model.fit(X, self._encoder.fit_transform(y))
        return self

    def predict(self, X):
        """Predict labels for ``X`` in the original label space."""
        return self._encoder.inverse_transform(self._model.predict(X))


# One gradient-boosted classifier (fixed seed) per feature set.
XGBoost = [
    _LabelDecodingXGB(objective="binary:logistic", random_state=42).fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each XGBoost model on its matching held-out split.
XGBoost_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(XGBoost, X_test, y_test)
]

print(XGBoost_acc)

## Support Vector Machine

In [None]:
from sklearn.svm import SVC
# One support vector classifier (library defaults) per feature set.
SVM = [
    SVC().fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each SVM on its matching held-out split.
SVM_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(SVM, X_test, y_test)
]

print(SVM_acc)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# One shallow random forest (max depth 2, fixed seed) per feature set.
Forest = [
    RandomForestClassifier(max_depth=2, random_state=42).fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each random forest on its matching held-out split.
Forest_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(Forest, X_test, y_test)
]

print(Forest_acc)

## Decision Tree

In [None]:
from sklearn import tree as tree_model 
# One decision tree per feature set.
# The seed is fixed, like the other seeded models in this notebook, so that the
# reported accuracies are reproducible; an unseeded tree may choose different
# splits from one run to the next.
Tree = [
    tree_model.DecisionTreeClassifier(random_state=42).fit(train_X, train_y)
    for train_X, train_y in zip(X_train, y_train)
]

In [None]:
# Test accuracy of each decision tree on its matching held-out split.
Tree_acc = [
    accuracy_score(true_y, model.predict(test_X))
    for model, test_X, true_y in zip(Tree, X_test, y_test)
]

print(Tree_acc)


# The Best

In [None]:
# Collect the accuracy lists into one table:
# one column per model, one row per feature set (row k = feature set k + 1).
d = dict([
    ('MultinomialNB', NB_acc),
    ('Logistic Regression', LR_acc),
    ('XGBoost', XGBoost_acc),
    ('SVM', SVM_acc),
    ('Random Forest', Forest_acc),
    ('Decision Tree', Tree_acc),
])

best_acc = pd.DataFrame(d)
best_acc.head(12)

In [None]:
import seaborn as sns
# Annotated heatmap of the accuracy table: columns are models, rows are the
# feature sets in the order they were built (row k = feature set k + 1).
ax = sns.heatmap(best_acc, annot=True)
# Title and axis labels let the figure be read without the surrounding cells.
ax.set_title('Test accuracy by model and feature set')
ax.set_xlabel('Model')
ax.set_ylabel('Feature set index (row k = set k + 1)')

# Some tests

In [None]:
# Hand-written character descriptions used as ad-hoc inputs for the trained models.
# Presumably each entry in this list should be classified as 'DC Comics' — the
# expected labels are implied by the variable name only; nothing here checks them.
# The texts are passed to the vectorizers verbatim, so their wording (typos
# included) is part of the experiment and is left untouched.
DC = ["A boy lost your parents with eight years old, he was train and later became a vigilant in his city!",
           
"A baby falls down in a farm, he was born on the planet Krypton and was given the name Kal-El at birth. As a baby, his parents sent him to Earth in a small spaceshipmoments before his planet was destroyed in a natural cataclysm.Now he resides in the fictional American city of Metropolis, where he works as a journalist for the Daily Planet.",

"He is a vigilant in Star City, His main weapon is a bow and arrow, his favorite color is Green",
           
"Princess Diana of an all-female Amazonian race rescues US pilot Steve. Upon learning of a war, she ventures into the world of men to stop Ares, the god of war, from destroying mankind.",
      
"He is the fastest man alive, sometimes actually, when he was a kid, his mom was murdered from a yellow man, a yellow thing"          
          ]

# Same idea for the other class: presumably each entry should be classified as
# 'Marvel Comics'. Note that the second source line below holds two separate samples.
Marvel = ["A wealthy American business magnate, playboy, and ingenious scientist, he suffers a severe chest injury during a kidnapping. When his captors attempt to force him to build a weapon of mass destruction, he instead creates a mechanized suit of armor to save his life and escape captivity.",
 "The history is simple, He is the Wakanda King","His Father is Odin",
           
"He was a normal scientist falling in love for a beautiful scientist, but now when he is Angry, he become a green monster, and everybody call him HULK",
"He is a good person, a good hero, an old hero, he is a captain, a leader, currently he is an avenger."
     ]

In [None]:

# Vectorize the hand-written samples with each fitted vectorizer.
# Position k in this list corresponds to feature set k + 1.
fitted_vectorizers = [
    vect1, vect2,
    tf1, tf2,
    tf_idf1, tf_idf2,
    stem_vect1, stem_vect2,
    stem_tf1, stem_tf2,
    stem_tf_idf1, stem_tf_idf2,
]

(Marvel1, Marvel2, Marvel3, Marvel4, Marvel5, Marvel6,
 Marvel7, Marvel8, Marvel9, Marvel10, Marvel11, Marvel12) = [
    vectorizer.transform(Marvel) for vectorizer in fitted_vectorizers
]

(DC1, DC2, DC3, DC4, DC5, DC6,
 DC7, DC8, DC9, DC10, DC11, DC12) = [
    vectorizer.transform(DC) for vectorizer in fitted_vectorizers
]

## The best model
Testing with the best model we have: MultinomialNB V2 (count vectorizer with unigrams and bigrams).

In [None]:
# Predictions of the Naive Bayes model for feature set 2 (index 1):
# the DC samples first, then the Marvel samples.
nb_v2 = NB[1]
print(nb_v2.predict(DC2), '-' * 20, nb_v2.predict(Marvel2), sep='\n')

## Other tests

In [None]:
# Predictions of the logistic regression model for feature set 2 (index 1):
# the DC samples first, then the Marvel samples.
lr_v2 = LR[1]
print(lr_v2.predict(DC2), '-' * 20, lr_v2.predict(Marvel2), sep='\n')

In [None]:
# Predictions of the SVM for feature set 12 (index 11):
# the DC samples first, then the Marvel samples.
svm_v12 = SVM[11]
print(svm_v12.predict(DC12), '-' * 20, svm_v12.predict(Marvel12), sep='\n')

# Conclusions

The best model across all tests is MultinomialNB V2, trained on the unigram + bigram count features.

In this dataset, some words appear only in histories from DC Comics, such as city names. "Gotham", for example, is a powerful keyword that does not occur in the Marvel texts.



