# Import Packages

In [9]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import json
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from gensim.models.word2vec import Word2Vec

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split



# Import Cleaned Data

In [2]:
data_clean = pd.read_csv('data_clean.csv')
data_clean.sample(frac=1)
Characters_per_rank = (data_clean.Characters.value_counts()).index 
data_clean = data_clean[:20000]
data_clean.shape

(20000, 6)

In [3]:
# Select and Label Dataset for Modeling

In [4]:
#truncate dataset based on number of characters we want to verify
threshold = 6
data_thres = data_clean[data_clean.Characters.isin(Characters_per_rank[:threshold])]

# split the clean text in the rows  into list of words
tokenized_data = data_thres["Lines"].apply(lambda text: re.split(' ',text))

# Label Character
label = preprocessing.LabelEncoder()
target = label.fit_transform(data_clean['Characters'].astype(str))
data_thres['target'] = label.fit_transform(data_thres['Characters'].astype(str))
target_thres = data_thres['target'] 
my_tags = Characters_per_rank[0:threshold]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
data_thres.head()

# Feature Engineering using Bag of Words and Word2Vec Embedding

## Count Vectorizer

In [6]:
# Create a BoW with Count Vectorizer 


def count_vectorizer(data):
    vectorizer = CountVectorizer(ngram_range = (1,2), min_df=10)
    # call `fit` to build the vocabulary
    vectorizer.fit(data)

    # call `transform` to convert text to a bag of words
    count_vectorizer_result  = vectorizer.transform(data)

    #convert to a numpy array to visualize as dataframe
    count_vectorizer_result = count_vectorizer_result.toarray()
    count_vectorizer_features = pd.DataFrame(count_vectorizer_result , columns = vectorizer.get_feature_names())
    count_vectorizer_features.index = data.index
    return count_vectorizer_features

count_vectorizer_df = count_vectorizer(data_thres['Lines'])
count_vectorizer_df.shape

(16889, 1749)

## TFIDF

In [7]:
# Create a BoW with TF-IDF Scheme 

def tfidf(data):
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))

    tfidf_result = tfidf.fit_transform(data).toarray()
    tfidf_features = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
    tfidf_features.index = data.index
    return tfidf_features


tfidf_df = tfidf(data_thres['Lines'])
tfidf_df.shape

(16889, 3887)

## Word2Vec with TFIDF weighting scheme

In [None]:
# Define a class to impolement word2vec averaged with tf-idf weighting scheme

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y = None):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec.wv[w] * self.word2weight[w]
                         for w in words if w in self.word2vec.wv] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:

# let X be a list of tokenized texts (i.e. list of lists of tokens)
model = Word2Vec(tokenized_data,min_count=2,size=100,window=5,iter=100)

tfidf_word2vec = TfidfEmbeddingVectorizer(model)
tfidf_word2vec.fit(tokenized_data)
tfidf_doc_vec = tfidf_word2vec.transform(tokenized_data)

In [None]:
model.similarity('dream','realize')

## Word2Vec with Simple Averaging Scheme

In [None]:
# Define a class to impolement word2vec with simple averaging scheme
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
word_model = Word2Vec(tokenized_data,min_count=2,size=100,window=5,iter=100)
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform((tokenized_data))

# ML Modeling Performance with Different BoW\ Word Embedding Schemes

In [None]:


Eval_score_embedding = {'Count Vectorizer': [], 'TF-IDF':[], 'Word2Vec':[], 'Word2Vec with TF-IDF Weighting':[] }
name_embedded_dataframes = {'Count Vectorizer': count_vectorizer_df,'TF-IDF': tfidf_df, 'Word2Vec': doc_vec, 'Word2Vec with TF-IDF Weighting':tfidf_doc_vec}

def word_embedding_performance_eval(clf,dataframe):
    Eval_score =  cross_val_score(clf,dataframe, target_thres, cv=5)
    return  Eval_score.mean() 
    
model =  LogisticRegression(max_iter = 1e6)
for dataframe in name_embedded_dataframes.keys():
    Eval_score_embedding [dataframe] = word_embedding_performance_eval(model,name_embedded_dataframes[dataframe])

In [None]:
Embedding_Results = pd.DataFrame.from_dict(Eval_score_embedding,orient='index')
Embedding_Results.index.name = 'BoW/Word Embedding Method'
Embedding_Results.columns = ['Accuracy'] 
Embedding_Results.sort_values(by = "Accuracy", ascending =False)

## Optimize Logistic Regression for Best BOW/ Word Embedding Approach

In [None]:

model =  LogisticRegression(max_iter = 1e3)
params_log = { "C": [1e-2,1, 10, 1e2,1e6]}

#grid_log = GridSearchCV(model, param_grid=params_log, cv=5)
#grid_log.fit(tfidf_df,target_thres)

#print(grid_log.best_score_)
#print(grid_log.best_params_)

# Prediction Accuracy of Other ML Models with TF-IDF

## Naive Bayes with Count Vectorization and TF-IDF

- Accuracy is low when number of characters considered are higher (data is imbalanced, boot strapping may help)

In [None]:

model = MultinomialNB()
print(cross_val_score(model,count_vectorizer_df, target_thres, cv=5).mean())
print(cross_val_score(model,tfidf_df, target_thres, cv=5).mean())

In [None]:
# GridSearch with Naive Bayes
def gridsearchNB(clf, X, y):

    #the grid of parameters to search over
    alphas = [0.001,0.01,.1, 1, 5, 10, 50]

    #Find the best value for alpha and min_df, and the best classifier
    best_alpha = None
    maxscore=-np.inf
    for alpha in alphas:        
        clf = MultinomialNB(alpha=alpha)
        cvscore = max(cross_val_score(clf, X,y,cv = 5))
        if cvscore > maxscore:
                maxscore = cvscore 
                best_alpha = alpha
    return  best_alpha, maxscore

print("Best Case with Count Vectorizer: ", gridsearchNB(model,count_vectorizer_df, target_thres))
print("Best Case with TF-IDF: ", gridsearchNB(model,tfidf_df,target_thres))

## Support Vector Machine: 

In [None]:
model_svc =SVC()
print('Accuracy of Support Vector Machine without Gridsearch:', cross_val_score(model_svc,tfidf_df, target_thres, cv=5).mean())

In [None]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
#kernels =  ['rbf','linear']
params_svc = {'C': Cs, 'gamma' : gammas}
grid_svc = GridSearchCV(SVC(), param_grid = params_svc, cv=5)
grid_svc.fit(tfidf_df,target_thres)
print('Optimized Modeling Paramters for Support vector Machine', grid_svm.best_params_)
print('Accuracy of Support Vector Machine with Gridsearch:', grid_svm.best_score_)

# Overall ROC_AUC for Different Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def roc_auc_classifier(x,y,model):
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2,random_state =42)
    model.fit(x_train,y_train)
    y_prob = model.predict_proba(x_test)
    
    macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovr",average="macro")
    
    return  macro_roc_auc_ovo 
    
models = {'Classifier': [SVC( probability = True),MultinomialNB(alpha=1), LogisticRegression(max_iter = 1e6)], 'ROC_AUC_Score':[],'Classifier_Name': ['Support Vector Machines','Naive_Bayes','Logistic_Regression']}


for model in models['Classifier']:
    roc_auc_score  = roc_auc_classifier(tfidf_df, target_thres,model)
    models['ROC_AUC_Score'].append(  roc_auc_score  )
print('Classifiers:', models['Classifier_Name'])
print('ROC_AUC Scores:', models['ROC_AUC_Score'])

# Class Specific ROC_AUC for Imabalanced Data

In [None]:
 %%time

import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

y= label_binarize(target_thres, classes = list(set(target_thres)))
n_classes = y.shape[1]
# split training and test sets
x_train, x_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=.5,
                                                    random_state=0)

In [None]:
 %%time

import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

y= label_binarize(target_thres, classes = list(set(target_thres)))
n_classes = y.shape[1]
# split training and test sets
x_train, x_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=0))
y_score = classifier.fit(x_train, y_train).decision_function(x_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()


for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))


In [None]:

plt.figure(figsize = (16,6))
lw=2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue','deeppink', 'red', 'green','gold','purple','olivedrab','mediumslateblue','gray','lawngreen'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of {0} (area = {1:0.2f})'
                 ''.format(my_tags[i], roc_auc[i]))
        
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate',FontSize = 16)
plt.ylabel('True Positive Rate',FontSize = 16)
plt.title('Some extension of Receiver operating characteristic to multi-class',FontSize = 16)
plt.legend(loc="lower right")
plt.show()

# Class Specific ROC_AUC for Oversampled Data

In [None]:
%%time

import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import RandomOverSampler

Sampling =  RandomOverSampler(random_state=0)

y= label_binarize(target_thres, classes = list(set(target_thres)))
n_classes = y.shape[1]
# split training and test sets
x_train, x_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=.5,
                                                    random_state=0)
x_train, y_train = Sampling.fit_sample(x_train,  y_train)
print (x_train.shape)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=0))
y_score = classifier.fit(x_train, y_train).decision_function(x_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))


In [None]:
plt.figure(figsize = (16,6))
lw=2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue','deeppink', 'red', 'green','gold','purple','olivedrab','mediumslateblue','gray','lawngreen'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of {0} (area = {1:0.2f})'
                 ''.format(my_tags[i], roc_auc[i]))
        
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate',FontSize = 16)
plt.ylabel('True Positive Rate',FontSize = 16)
plt.title('Some extension of Receiver operating characteristic to multi-class',FontSize = 16)
plt.legend(loc="lower right")
plt.show()