In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [None]:
# Load the superheroes NLP dataset into a DataFrame.
superheroes = pd.read_csv(
    '../input/superheroes-nlp-dataset/superheroes_nlp_dataset.csv'
)

In [None]:
# Show the first five rows of the raw frame.
superheroes.iloc[:5]

In [None]:
# Show the last five rows of the raw frame.
superheroes.iloc[-5:]

In [None]:
# Keep only the rows whose intelligence_score is 100 or 75 (the two classes
# modelled further down), then preview the first ten of them.
superheroes_evaluations = superheroes[superheroes.intelligence_score.isin([100, 75])]
superheroes_evaluations.head(10)

In [None]:
# Look up the powers_text of the row labelled 30 among the score-75 rows.
superheroes_evaluations.loc[
    superheroes_evaluations.intelligence_score == 75, 'powers_text'
][30]

In [None]:
# Feature: the free-text powers description. Target: the intelligence score.
X = superheroes_evaluations['powers_text']
y = superheroes_evaluations['intelligence_score']

# Class balance of the target.
print(y.value_counts())

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=75,
)

In [None]:
# Tiny toy corpus (one term per document) used to illustrate vectorization.
simple_train = [
    'heat',
    'electricity',
    'concussive_force ',
    'magnetism',
]

In [None]:
# Learn the vocabulary of the toy corpus and show its document-term matrix.
vect = CountVectorizer()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
tf = pd.DataFrame(
    vect.fit_transform(simple_train).toarray(),
    columns=vect.get_feature_names_out(),
)
tf

In [None]:
# Encode a new document with the vocabulary already learned by `vect`.
# get_feature_names() was removed in scikit-learn 1.2, so the column labels
# come from get_feature_names_out() instead.
pd.DataFrame(
    vect.transform(['heat, concussive force, magnetism, electricity, etc.']).toarray(),
    columns=vect.get_feature_names_out(),
)

In [None]:
# Fit the vocabulary on the training text only, then encode both splits with it.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert the slice to a plain list for printing.
print(vect.get_feature_names_out()[:50].tolist())

In [None]:
# Display a freshly constructed CountVectorizer (the cell output is its repr).
CountVectorizer()

In [None]:
# Rebuild the training document-term matrix with lowercasing disabled and
# display its shape, so it can be compared with the shape printed earlier.
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

In [None]:
# Include both unigrams and bigrams in the vocabulary.
vect = CountVectorizer(ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train)
# A bare expression in the middle of a cell is never displayed, so print it.
print(X_train_dtm.shape)
# get_feature_names() was removed in scikit-learn 1.2; show the last ten
# vocabulary entries as a plain list.
print(vect.get_feature_names_out()[-10:].tolist())

In [None]:
# Reset `vect` to a CountVectorizer with default settings for the model below.
vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training text and encode it, then encode the
# test text with that same vocabulary (transform only, no refit).
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [None]:
# Train a logistic regression on the training matrix (fit returns the
# estimator itself) and predict a class for every test document.
logreg = LogisticRegression().fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# Fraction of test documents whose predicted class matches the true class.
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

In [None]:
# Null accuracy: the accuracy obtained by always predicting the more frequent
# class. The indicator must be 1/0 so that its mean is the share of score-75
# rows; encoding the class as 25 made the mean (and the result) meaningless.
y_test_binary = np.where(y_test == 75, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

In [None]:
def tokenize_test(vect):
    """Report vocabulary size and 5-fold CV accuracy for a vectorizer.

    Fits ``vect`` on the notebook-level ``X``, prints the number of features
    it produced, then prints the mean accuracy of a default
    ``LogisticRegression`` cross-validated (cv=5) against the notebook-level
    ``y``.

    Note: the vectorizer is fitted on all of ``X`` before cross-validation,
    so each fold's vocabulary includes its validation documents.

    Parameters
    ----------
    vect : vectorizer exposing ``fit_transform``; it is fitted in place.
    """
    X_dtm = vect.fit_transform(X)
    print ('Features: ', X_dtm.shape[1])
    fold_scores = cross_val_score(LogisticRegression(), X_dtm, y, cv=5, scoring='accuracy')
    print ('Accuracy: ', fold_scores.mean())

In [None]:
# ngram_range is (min_n, max_n) and requires min_n <= max_n; the previous
# (90, 50) made scikit-learn raise ValueError when fitting. Use unigrams plus
# bigrams instead.
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

# Same n-gram range, additionally dropping the built-in English stop words.
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
tokenize_test(vect)

# The stop-word set that was applied.
print(vect.get_stop_words())

In [None]:
# Drop English stop words and cap the vocabulary at 100 features.
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
# get_feature_names() was removed in scikit-learn 1.2; print the retained
# vocabulary as a plain list.
print(vect.get_feature_names_out().tolist())

In [None]:
# Print the powers description of the row labelled 30.
print(superheroes_evaluations['powers_text'][30])

In [None]:
# Wrap the sample description in a TextBlob. The column used everywhere else
# in this notebook is `powers_text`; the previous `.text` attribute is not a
# column any other cell references.
review = TextBlob(superheroes_evaluations.powers_text[30])

# Only the last expression of a cell is displayed, so print the word list
# explicitly and leave the sentence list as the cell output.
print(review.words)
review.sentences

In [None]:
# Run TextBlob's spellcheck on the single word 'cosmic' and display the result.
Word('cosmic').spellcheck()

In [None]:
# Redefine the toy corpus as a single document; this replaces the earlier
# four-document `simple_train` for the TF / DF / TF-IDF cells that follow.
simple_train = ['heat, concussive force, magnetism, electricity, etc.']

In [None]:
# Term frequency: raw token counts per document.
vect = CountVectorizer()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
tf = pd.DataFrame(
    vect.fit_transform(simple_train).toarray(),
    columns=vect.get_feature_names_out(),
)
tf

In [None]:
# Document frequency: with binary=True each document contributes at most 1
# per term, so summing over documents counts how many documents contain it.
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
# reshape(1, -1) adapts to the vocabulary size instead of hard-coding 6, and
# get_feature_names_out() replaces the removed get_feature_names().
pd.DataFrame(df.reshape(1, -1), columns=vect.get_feature_names_out())

In [None]:
# Divide the term-count table `tf` by the per-term document counts `df`
# (both built in earlier cells); presumably a hand-computed TF/DF ratio to
# compare with the TfidfVectorizer output - confirm intent.
tf/df

In [None]:
# TF-IDF weights for the toy corpus, computed by scikit-learn.
vect = TfidfVectorizer()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(
    vect.fit_transform(simple_train).toarray(),
    columns=vect.get_feature_names_out(),
)

In [None]:
# Wrap the powers description of the row labelled 30 in a TextBlob.
review = TextBlob(superheroes_evaluations['powers_text'][30])

In [None]:
# Print the text held by the `review` TextBlob.
print (review)

In [None]:
# Display the sentiment polarity TextBlob assigns to `review`.
review.sentiment.polarity

In [None]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : the text to score; it is passed straight to ``TextBlob``.

    Returns
    -------
    The ``sentiment.polarity`` value of the resulting blob.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
# Score every powers description and store the polarity in a new
# 'sentiment' column on the full frame.
superheroes['sentiment'] = superheroes['powers_text'].apply(detect_sentiment)

In [None]:
# Box plot of the 'sentiment' column, one box per intelligence_score value.
superheroes.boxplot(column='sentiment', by='intelligence_score')

In [None]:
# Histogram of the sentiment polarity across all rows.
superheroes.sentiment.hist()

In [None]:
# Descriptions with the maximum polarity (+1). The sentiment was computed
# from `powers_text`, and no other cell references a `text` column, so show
# `powers_text` here.
superheroes[superheroes.sentiment == 1].powers_text.head()

In [None]:
# Descriptions with the minimum polarity (-1). The sentiment was computed
# from `powers_text`, and no other cell references a `text` column, so show
# `powers_text` here.
superheroes[superheroes.sentiment == -1].powers_text.head()

In [None]:
# Widen text columns so long descriptions are not truncated in the output.
pd.set_option('display.max_colwidth', 500)

# First score-75 row whose description reads as clearly negative.
superheroes.loc[
    (superheroes['intelligence_score'] == 75) & (superheroes['sentiment'] < -0.3)
].head(1)

In [None]:
# First score-75 row whose description reads as clearly positive.
superheroes.loc[
    (superheroes['intelligence_score'] == 75) & (superheroes['sentiment'] > 0.5)
].head(1)