In [None]:
#.......... for data .................
import pandas as pd
import numpy as np
import pickle
import string

#.......... for plotting ..............
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import wordcloud
from wordcloud import WordCloud, STOPWORDS

#.......... for text processing .......
import nltk
import html
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

#.......... for sentiment ..............
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#.......... for vectorizer ..............
import sklearn
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV 

#.............. EDA .......................
from collections import Counter

#.............. Imbalanced dataset ........
from imblearn.under_sampling import RandomUnderSampler 



# Dataset

In [None]:
# Load the pre-cleaned tweets.
# NOTE(review): unpickling can execute arbitrary code, so 'cleaned_tweets.pkl' must come
# from a trusted source.
df = pd.read_pickle('cleaned_tweets.pkl')
df # 581470

In [None]:
# Work on a copy so the frame loaded from disk is left untouched.
df_clean = df.copy()

In [None]:
# Number of distinct accounts (user.screen_name) per label.
df_clean.groupby(['label'])['user.screen_name'].nunique()

In [None]:
# Tweets per label in the full dataset (author's note: 581470 tweets in total).
tweets = (
    df_clean
    .groupby('label')['cleaned_text']
    .count()
    .to_frame(name='count')
)
tweets['percentage'] = tweets['count'].div(tweets['count'].sum()).mul(100).round(1)
print(tweets)

# Bar chart of the counts; each bar is annotated with its share of all tweets.
fig = px.bar(
    tweets,
    y="count",
    text="percentage",
    labels={"label": "Extremist groups", "count": ""},
)
fig.update_layout(title="Percentage of tweets per group", title_x=0.5, showlegend=False)
fig.show()

# Shares noted by the author: 23.8% - 39.9% - 36.3%

## Data Imbalance

In [None]:
# Feature columns and target that are handed to the undersampler below.
X = df_clean[['id', 'user.screen_name','text','cleaned_text_punc', 'cleaned_text']]
y = df_clean['label']

In [None]:
# Instantiate the random undersampler. It picks the rows to keep at random, so it is
# seeded to make the balanced dataset (and every score derived from it) reproducible.
rus = RandomUnderSampler(random_state=100)

# resampling X, y
X_rus, y_rus = rus.fit_resample(X, y)

# new class distribution
print(Counter(y_rus))

In [None]:
# Re-attach the labels to the resampled features. Both indexes are reset first, so the
# column-wise concat pairs the rows up by position.
df_balanced = pd.concat([X_rus.reset_index(drop=True), y_rus.reset_index(drop=True)], axis=1)

In [None]:
# Tweets per label after undersampling (author's note: 418938 tweets in total).
tweets = (
    df_balanced
    .groupby('label')['cleaned_text']
    .count()
    .to_frame(name='count')
)
tweets['percentage'] = tweets['count'].div(tweets['count'].sum()).mul(100).round(1)
print(tweets)

# Bar chart of the counts; each bar is annotated with its share of all tweets.
fig = px.bar(
    tweets,
    y="count",
    text="percentage",
    labels={"label": "Extremist groups", "count": ""},
)
fig.update_layout(title="Percentage of tweets per group", title_x=0.5, showlegend=False)
fig.show()


In [None]:
# Number of distinct accounts per label that remain after undersampling.
df_balanced.groupby(['label'])['user.screen_name'].nunique()

In [None]:
# (rows, columns) of the balanced frame.
df_balanced.shape

# Sentiment

In [None]:
# Separate copy for the sentiment columns, so df_balanced keeps only the original columns.
df_sentiment = df_balanced.copy()

In [None]:
# load VADER
analyzer = SentimentIntensityAnalyzer()

# Add VADER metrics to the dataframe. Every tweet is scored exactly once and all four
# columns are read from that single result, instead of re-scoring the text per column.
vader_columns = ['compound', 'neg', 'neu', 'pos']
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df_sentiment['cleaned_text']],
    index=df_sentiment.index,   # keep the scores aligned with their tweets
    columns=vader_columns,      # also keeps the frame well-formed when there are no rows
)
for column in vader_columns:
    df_sentiment[column] = vader_scores[column]

df_sentiment.head()

In [None]:
def _compound_to_sentiment(score):
    """Bucket a compound score: <= -0.05 is 'negative', >= 0.05 is 'positive', otherwise 'neutral'."""
    if score <= -0.05:
        return 'negative'
    if score >= 0.05:
        return 'positive'
    return 'neutral'


df_sentiment['compound_sentiment'] = [
    _compound_to_sentiment(score) for score in df_sentiment['compound']
]

df_sentiment

In [None]:
# Count of negative / neutral / positive tweets within each label.
df_sentiment.groupby(['label'])['compound_sentiment'].value_counts()

In [None]:
# Total: 612171 tweets
# NOTE(review): this total does not match the 418938 tweets noted for df_balanced, which
# df_sentiment is copied from — confirm which figure is current.

tweets = df_sentiment.groupby('label')['compound_sentiment'].value_counts().to_frame(name='count').reset_index()
tweets['percentage'] = ((tweets['count'] / tweets['count'].sum()) * 100).round(1)
print(tweets)
total = tweets['count'].sum()

sns.set_style("whitegrid")

ax = sns.barplot(x='label', y='count', hue='compound_sentiment', data=tweets,
                 palette="Blues_d")

plt.title("Distribution Sentiment scores per group")
# Annotate every bar with its share of all tweets. The loop uses its own names so it does
# not overwrite the notebook-level `y` (the label Series) with a bar height.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_right_edge = bar.get_x() + bar.get_width()
    ax.annotate('{:.1f}%'.format(100 * bar_height / total), (bar_right_edge, bar_height), ha='right')

ax.legend(title='Sentiment', shadow=True)
ax.set(xlabel='Extremists', ylabel='Sentiment scores')
plt.show()

In [None]:
## Sentiment Classifier

In [None]:
X = df_sentiment[['cleaned_text', 'compound']]
y = df_sentiment['label']

# Stratify on the label so train and test keep the same class proportions, consistent
# with the split used for the text classifiers in the 'NLP Classifiers' section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=100)


In [None]:
# Confusion Matrix
# Code from: https://github.com/DTrimarchi10/confusion_matrix/blob/master/cf_matrix.py
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """
    Draw a confusion matrix as an annotated seaborn heatmap on a new matplotlib figure.

    Parameters
    ----------
    cf : 2-D array-like
        Confusion matrix. The y-axis is labelled 'True label' and the x-axis
        'Predicted label', so rows are expected to be the true classes.
    group_names : sequence of str, optional
        One name per cell in row-major order; used only when its length equals the
        number of cells.
    categories : list or 'auto'
        Tick labels passed to the heatmap for both axes.
    count : bool
        Show the raw count in each cell.
    percent : bool
        Show each cell as a percentage of all observations.
    cbar : bool
        Draw the colour bar.
    xyticks : bool
        When falsy, tick labels are hidden regardless of `categories`.
    xyplotlabels : bool
        Label the axes 'True label' / 'Predicted label'.
    sum_stats : bool
        Append accuracy (plus precision, recall and F1 for a 2x2 matrix) below the plot.
    figsize : tuple, optional
        Figure size; defaults to matplotlib's rcParams 'figure.figsize'.
    cmap : str
        Colour map name passed to the heatmap.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The heatmap is drawn as a side effect.
    """
    cf = np.asarray(cf)  # also accepts nested lists
    total = np.sum(cf)

    def _ratio(numerator, denominator):
        # An empty row/column (or an all-zero matrix) would otherwise yield nan/inf and a
        # RuntimeWarning; report 0.0 for such undefined ratios instead.
        return numerator / denominator if denominator else 0.0

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    # `is not None` instead of truthiness so a NumPy array of names is accepted as well
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _ratio(np.trace(cf), float(total))

        # if it is a binary confusion matrix, show some more stats
        if cf.shape == (2, 2):
            # Metrics for Binary Confusion Matrices (class at index 1 is the positive class)
            precision = _ratio(cf[1, 1], np.sum(cf[:, 1]))
            recall = _ratio(cf[1, 1], np.sum(cf[1, :]))
            f1 = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

In [None]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data

    The column is returned via ``X[key]`` (single brackets); for a pandas DataFrame that
    is a 1-D Series.
    """
    def __init__(self, key):
        # key: name of the column to select
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the selector can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return the selected column ``X[self.key]``."""
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data

    The column is returned via ``X[[key]]`` (double brackets); for a pandas DataFrame
    that is a one-column, 2-D frame — presumably because scalers expect 2-D input.
    """
    def __init__(self, key):
        # key: name of the column to select
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the selector can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return the selected column as ``X[[self.key]]``."""
        return X[[self.key]]

In [None]:
# Feature pipeline for the sentiment-only classifiers: select the 'compound' column and
# rescale it. Note that the step is named 'standard' but applies a MinMaxScaler.
# NOTE(review): min-max scaling presumably also keeps the feature non-negative for the
# MultinomialNB used in the next cell — confirm.
senti_feats =  Pipeline([
                ('selector', NumberSelector(key='compound')),
                ('standard', MinMaxScaler()),
            ])


In [None]:
# Naive Bayes trained on the scaled compound score alone.
pipeline = Pipeline([
    ('senti_feats', senti_feats),
    ('clf', MultinomialNB())
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
print('accuracy_score: ', np.mean(preds == y_test))
# classification_report expects (y_true, y_pred); with the arguments swapped the
# per-class precision and recall columns trade places.
print(classification_report(y_test, preds)) # 0.33

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are then the true classes, matching the
# 'True label' / 'Predicted label' axes drawn by make_confusion_matrix.
matrix = confusion_matrix(y_test, preds)
# NOTE(review): assumes these are the class labels in sorted order — confirm.
categories = ['LWE', 'NE', 'RWE']
make_confusion_matrix(matrix, categories=categories)

In [None]:
# Linear classifier (SGD with default settings) trained on the scaled compound score alone.
pipeline = Pipeline([
    ('senti_feats', senti_feats),
    ('clf', SGDClassifier())
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
print('accuracy_score: ', np.mean(preds == y_test))
# classification_report expects (y_true, y_pred); with the arguments swapped the
# per-class precision and recall columns trade places.
print(classification_report(y_test, preds)) # 0.42

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are then the true classes, matching the
# 'True label' / 'Predicted label' axes drawn by make_confusion_matrix.
matrix = confusion_matrix(y_test, preds)
# NOTE(review): assumes these are the class labels in sorted order — confirm.
categories = ['LWE', 'NE', 'RWE']
make_confusion_matrix(matrix, categories=categories)

# NLP Classifiers

In [None]:
# Text-only features for the NLP classifiers; 80/20 split, stratified on the label,
# with a fixed seed.
X = df_balanced['cleaned_text']
y = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify= y, random_state=100)



In [None]:
# Sanity check: sizes of the train / test features and targets.
print(X_train.shape) # 335150 - training_features
print(X_test.shape) # 83788 - test features


#......................................................
print(y_train.shape) # training target
print(y_test.shape) # test target


In [None]:
# Train / test frames with text and label side by side. Indexes are reset on both parts
# so the column-wise concat pairs rows by position.
df_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
df_test = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)


In [None]:
# label encode the target variable
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Refitting on the test labels could
# assign different integers if the two sets ever contained different classes.
y_test = encoder.transform(y_test)

In [None]:
# Confusion Matrix: https://github.com/DTrimarchi10/confusion_matrix/blob/master/cf_matrix.py
# NOTE(review): make_confusion_matrix is already defined in the 'Sentiment Classifier'
# section above; this cell re-defines it and silently replaces that definition.
# Consider keeping a single copy.
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """
    Draw a confusion matrix as an annotated seaborn heatmap on a new matplotlib figure.

    Parameters
    ----------
    cf : 2-D array-like
        Confusion matrix. The y-axis is labelled 'True label' and the x-axis
        'Predicted label', so rows are expected to be the true classes.
    group_names : sequence of str, optional
        One name per cell in row-major order; used only when its length equals the
        number of cells.
    categories : list or 'auto'
        Tick labels passed to the heatmap for both axes.
    count : bool
        Show the raw count in each cell.
    percent : bool
        Show each cell as a percentage of all observations.
    cbar : bool
        Draw the colour bar.
    xyticks : bool
        When falsy, tick labels are hidden regardless of `categories`.
    xyplotlabels : bool
        Label the axes 'True label' / 'Predicted label'.
    sum_stats : bool
        Append accuracy (plus precision, recall and F1 for a 2x2 matrix) below the plot.
    figsize : tuple, optional
        Figure size; defaults to matplotlib's rcParams 'figure.figsize'.
    cmap : str
        Colour map name passed to the heatmap.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The heatmap is drawn as a side effect.
    """
    cf = np.asarray(cf)  # also accepts nested lists
    total = np.sum(cf)

    def _ratio(numerator, denominator):
        # An empty row/column (or an all-zero matrix) would otherwise yield nan/inf and a
        # RuntimeWarning; report 0.0 for such undefined ratios instead.
        return numerator / denominator if denominator else 0.0

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    # `is not None` instead of truthiness so a NumPy array of names is accepted as well
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _ratio(np.trace(cf), float(total))

        # if it is a binary confusion matrix, show some more stats
        if cf.shape == (2, 2):
            # Metrics for Binary Confusion Matrices (class at index 1 is the positive class)
            precision = _ratio(cf[1, 1], np.sum(cf[:, 1]))
            recall = _ratio(cf[1, 1], np.sum(cf[1, :]))
            f1 = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

## Baseline Model

In [None]:
# create a count vectorizer
count_vect = CountVectorizer(max_features=1000)

# fit and transform train set. Using fit: learning vocabulary dictionary
xtrain_count = count_vect.fit_transform(X_train)

# transform test set
xtest_count = count_vect.transform(X_test)

# fit the training dataset on the NB classifier
dummy = MultinomialNB()
dummy.fit(xtrain_count, y_train)

# predict the labels on validation dataset
pred_dummy = dummy.predict(xtest_count)
accuracy_dummy = accuracy_score(y_test, pred_dummy)

# Use accuracy_score function to get the accuracy and print classification report.
# Both metrics take (y_true, y_pred); swapping them exchanges precision and recall in
# the report.
print('accuracy %s' % accuracy_dummy)
print(classification_report(y_test, pred_dummy))

## Naive Bayes + BOW

In [None]:
X, y = df_balanced['cleaned_text'], df_balanced['label']
# Per-fold accuracies. Deliberately not called `metrics`, which would shadow the
# sklearn `metrics` module imported at the top of the notebook.
fold_accuracies = []

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows by position
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    nb_bow = Pipeline([('vect', CountVectorizer(analyzer='word', max_features=5000, stop_words='english')),
                       ('clf_nb', MultinomialNB()),
                      ])

    nb_bow.fit(X_train, y_train)
    pred_nb_bow = nb_bow.predict(X_test)

    fold_accuracies.append(accuracy_score(y_test, pred_nb_bow))

accuracy_nb_bow = np.mean(fold_accuracies).round(3)

print('Mean accuracy: ', accuracy_nb_bow)
# The report below covers the last fold only (the loop variables still hold that fold).
# classification_report expects (y_true, y_pred).
print(classification_report(y_test, pred_nb_bow))

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_nb_bow = confusion_matrix(y_test, pred_nb_bow)
matrix_nb_bow

In [None]:
# Heatmap of the matrix computed in the previous cell.
# NOTE(review): assumes these are the class labels in sorted order — confirm.
categories = ['LWE', 'NE', 'RWE']
make_confusion_matrix(matrix_nb_bow, categories=categories)


## Naive Bayes + BOW - Optimization

In [None]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)], # choice whether unigram or bigram
              'vect__max_features': (None, 5000, 10000, 50000),
              'vect__max_df': (0.5, 0.75, 1.0),
              'clf_nb__alpha': (1e-2, 1e-3),
              'clf_nb__fit_prior': (True, False),
             }

# create instance of grid search by passing classifier
# NOTE(review): X_train / y_train are whatever the previous cell left behind, i.e. the
# training part of its last CV fold.
gs_clf = GridSearchCV(nb_bow, parameters, n_jobs=-1, scoring='accuracy')
gs_clf = gs_clf.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so print the score explicitly.
print('Best CV accuracy:', gs_clf.best_score_)
gs_clf.best_params_ # 1,2, Fit_prior=False meaning uniform prior used, 0.01

In [None]:
#%%time
y_pred = gs_clf.predict(X_test)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain float.
accuracy_nb_bow_grid = round(accuracy_score(y_test, y_pred), 3)
# classification_report expects (y_true, y_pred); swapping them exchanges precision and recall.
df_class_report_nb_bow_grid = classification_report(y_test, y_pred, output_dict=True, digits=3)
df_class_report_nb_bow_grid = pd.DataFrame(df_class_report_nb_bow_grid).transpose()
print('accuracy %s' % accuracy_nb_bow_grid)


In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_nb_bow = confusion_matrix(y_test, y_pred)
matrix_nb_bow

In [None]:
# Heatmap for the grid-searched model.
# make_confusion_matrix has no return statement, so matrix_nb_bow_grid is None and the
# last line displays nothing; the plot itself is drawn as a side effect of the call.
categories = ['LWE', 'NE', 'RWE']
matrix_nb_bow_grid = make_confusion_matrix(matrix_nb_bow, categories=categories)
matrix_nb_bow_grid

## Naive Bayes + TFIDF

In [None]:
X, y = df_balanced['cleaned_text'], df_balanced['label']
# Per-fold accuracies. Deliberately not called `metrics`, which would shadow the
# sklearn `metrics` module imported at the top of the notebook.
fold_accuracies = []

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows by position
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    nb_tfidf = Pipeline([('vect', CountVectorizer(analyzer='word', max_features=5000, stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf_nb', MultinomialNB()),
                        ])

    nb_tfidf.fit(X_train, y_train)
    pred_nb_tfidf = nb_tfidf.predict(X_test)

    fold_accuracies.append(accuracy_score(y_test, pred_nb_tfidf))

accuracy_nb_tfidf = np.mean(fold_accuracies).round(3)

print('Mean accuracy: ', accuracy_nb_tfidf)
# The report below covers the last fold only (the loop variables still hold that fold).
# classification_report expects (y_true, y_pred).
print(classification_report(y_test, pred_nb_tfidf))

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_nb_tfidf = confusion_matrix(y_test, pred_nb_tfidf)
matrix_nb_tfidf


In [None]:
# Heatmap of the matrix computed in the previous cell.
# NOTE(review): assumes these are the class labels in sorted order — confirm.
categories = ['LWE', 'NE', 'RWE']
make_confusion_matrix(matrix_nb_tfidf, categories=categories)

## Naive Bayes + TFIDF - Optimization

In [None]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)], # choice whether unigram or bigram
              'vect__max_features': (None, 5000, 10000, 50000),
              'vect__max_df': (0.5, 0.75, 1.0),
              'tfidf__use_idf': (True, False),
              'clf_nb__alpha': (1e-2, 1e-3),
              'clf_nb__fit_prior': (True, False),
             }

# create instance of grid search by passing classifier
# NOTE(review): X_train / y_train are whatever the previous cell left behind, i.e. the
# training part of its last CV fold.
gs_clf = GridSearchCV(nb_tfidf, parameters, n_jobs=-1, scoring='accuracy')
gs_clf = gs_clf.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so print the score explicitly.
print('Best CV accuracy:', gs_clf.best_score_)
gs_clf.best_params_ # 1,2, None, 0.5, False, 0.01


In [None]:
y_pred = gs_clf.predict(X_test)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain float.
accuracy_nb_tfidf_grid = round(accuracy_score(y_test, y_pred), 3)
# classification_report expects (y_true, y_pred); swapping them exchanges precision and recall.
df_class_report_nb_tfidf_grid = classification_report(y_test, y_pred, output_dict=True, digits=3)
df_class_report_nb_tfidf_grid = pd.DataFrame(df_class_report_nb_tfidf_grid).transpose()
print('accuracy %s' % accuracy_nb_tfidf_grid)

In [None]:
# Persist the per-class report of the grid-searched NB + TF-IDF model (relative path,
# i.e. the current working directory) and show it.
df_class_report_nb_tfidf_grid.to_pickle("df_class_report_nb_tfidf_grid.pkl")
print(df_class_report_nb_tfidf_grid) # 

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_nb_tfidf = confusion_matrix(y_test, y_pred)
matrix_nb_tfidf


In [None]:
# Heatmap for the grid-searched model.
# make_confusion_matrix has no return statement, so matrix_nb_tfidf_grid is None and the
# last line displays nothing; the plot itself is drawn as a side effect of the call.
categories = ['LWE', 'NE', 'RWE']
matrix_nb_tfidf_grid = make_confusion_matrix(matrix_nb_tfidf, categories=categories)
matrix_nb_tfidf_grid

## SVM + BOW

In [None]:
X, y = df_balanced['cleaned_text'], df_balanced['label']
# Per-fold accuracies. Deliberately not called `metrics`, which would shadow the
# sklearn `metrics` module imported at the top of the notebook.
fold_accuracies = []

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows by position
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    sgd_bow = Pipeline([('vect', CountVectorizer(analyzer='word', max_features=5000, stop_words='english')),
                        ('clf_svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)),
                       ])

    sgd_bow.fit(X_train, y_train)
    pred_svm_bow = sgd_bow.predict(X_test)

    fold_accuracies.append(accuracy_score(y_test, pred_svm_bow))

accuracy_svm_bow = np.mean(fold_accuracies).round(3)

print('Mean accuracy: ', accuracy_svm_bow)
# The report below covers the last fold only (the loop variables still hold that fold).
# classification_report expects (y_true, y_pred).
print(classification_report(y_test, pred_svm_bow))

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_svm_bow = confusion_matrix(y_test, pred_svm_bow)
matrix_svm_bow


In [None]:
# Heatmap of the matrix computed in the previous cell.
# make_confusion_matrix has no return statement, so `fig` is None here; the plot is
# drawn as a side effect of the call.
categories = ['LWE', 'NE', 'RWE']
fig = make_confusion_matrix(matrix_svm_bow, categories=categories)

## SVM + BOW - Optimization

In [None]:
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)],
                  'vect__max_features': (None, 5000, 10000, 50000),
                  'vect__max_df': (0.5, 0.75, 1.0),
                  'clf_svm__alpha': (1e-2, 1e-3),
                 }

# NOTE(review): X_train / y_train are whatever the previous cell left behind, i.e. the
# training part of its last CV fold.
gs_clf_svm = GridSearchCV(sgd_bow, parameters_svm, n_jobs=-1, scoring='accuracy')
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so print the score explicitly.
print('Best CV accuracy:', gs_clf_svm.best_score_)
gs_clf_svm.best_params_ # 1.2, None, 1.0, 0.001

In [None]:
#%%time
y_pred = gs_clf_svm.predict(X_test)
accuracy_svm_bow_grid = accuracy_score(y_pred, y_test).round(3)
df_class_report_svm_bow_grid = classification_report(y_pred, y_test, output_dict=True, digits=3)
df_class_report_svm_bow_grid = pd.DataFrame(df_class_report_svm_bow_grid).transpose()

print('accuracy %s' % accuracy_svm_bow_grid) 


In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_svm_bow = confusion_matrix(y_test, y_pred)
matrix_svm_bow


In [None]:
# Heatmap for the grid-searched model.
# make_confusion_matrix has no return statement, so matrix_svm_bow_grid is None and the
# last line displays nothing; the plot itself is drawn as a side effect of the call.
categories = ['LWE', 'NE', 'RWE']
matrix_svm_bow_grid = make_confusion_matrix(matrix_svm_bow, categories=categories)
matrix_svm_bow_grid

## SVM + TFIDF

In [None]:
X, y = df_balanced['cleaned_text'], df_balanced['label']
# Per-fold accuracies. Deliberately not called `metrics`, which would shadow the
# sklearn `metrics` module imported at the top of the notebook.
fold_accuracies = []

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows by position
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # `n_iter` is no longer accepted by SGDClassifier; max_iter=5 with tol=None runs the
    # same fixed five passes over the training data.
    sgd_tfidf = Pipeline([('vect', CountVectorizer(analyzer='word', max_features=5000, stop_words='english')),
                          ('tfidf', TfidfTransformer()),
                          ('clf_svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                                    max_iter=5, tol=None, random_state=42)),
                         ])

    sgd_tfidf.fit(X_train, y_train)
    pred_svm_tfidf = sgd_tfidf.predict(X_test)

    fold_accuracies.append(accuracy_score(y_test, pred_svm_tfidf))

accuracy_svm_tfidf = np.mean(fold_accuracies).round(3)

print('Mean accuracy: ', accuracy_svm_tfidf)
# The report below covers the last fold only (the loop variables still hold that fold).
# classification_report expects (y_true, y_pred).
print(classification_report(y_test, pred_svm_tfidf))

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_svm_tfidf = confusion_matrix(y_test, pred_svm_tfidf)
matrix_svm_tfidf


In [None]:
# Heatmap of the matrix computed in the previous cell.
# NOTE(review): assumes these are the class labels in sorted order — confirm.
categories = ['LWE', 'NE', 'RWE']
make_confusion_matrix(matrix_svm_tfidf, categories=categories)

## SVM + TFIDF - Optimization

In [None]:
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)],
                  'vect__max_features': (None, 5000, 10000, 50000),
                  'vect__max_df': (0.5, 0.75, 1.0),
                  'tfidf__use_idf': (True, False),
                  'clf_svm__alpha': (1e-2, 1e-3),
                 }

# NOTE(review): X_train / y_train are whatever the previous cell left behind, i.e. the
# training part of its last CV fold.
gs_clf_svm = GridSearchCV(sgd_tfidf, parameters_svm, n_jobs=-1, scoring='accuracy')
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so print the score explicitly.
print('Best CV accuracy:', gs_clf_svm.best_score_)
gs_clf_svm.best_params_ # 2.2, None, 0.75, True, 0.01

In [None]:
#%%time
y_pred = gs_clf_svm.predict(X_test)
accuracy_svm_tfidf_grid = accuracy_score(y_pred, y_test).round(3)
df_class_report_svm_tfidf_grid = classification_report(y_pred, y_test, output_dict=True, digits=3)
df_class_report_svm_tfidf_grid = pd.DataFrame(df_class_report_svm_tfidf_grid).transpose()

print('accuracy %s' % accuracy_svm_tfidf_grid) 

In [None]:
# Persist the per-class report of the grid-searched SVM + TF-IDF model (relative path,
# i.e. the current working directory) and show it.
df_class_report_svm_tfidf_grid.to_pickle("df_class_report_svm_tfidf_grid.pkl")
print(df_class_report_svm_tfidf_grid) # 76% > 77.6%

In [None]:
# confusion_matrix expects (y_true, y_pred) so that rows are the true classes.
matrix_svm_tfidf = confusion_matrix(y_test, y_pred)
matrix_svm_tfidf


In [None]:
# Heatmap for the grid-searched model.
# make_confusion_matrix has no return statement, so matrix_svm_tfidf_grid is None and the
# last line displays nothing; the plot itself is drawn as a side effect of the call.
categories = ['LWE', 'NE', 'RWE']
matrix_svm_tfidf_grid = make_confusion_matrix(matrix_svm_tfidf, categories=categories)
matrix_svm_tfidf_grid

# Results

In [None]:
# Accuracy summary, one row per model. Left value: mean accuracy over the 5 CV folds
# with the fixed pipeline settings. Right value: accuracy of the grid-searched model on
# X_test (the test part of the last CV fold). Trailing comments are the author's
# recorded results.
print("NB Count Vectors:       ", accuracy_nb_bow, "   | ", accuracy_nb_bow_grid) # 0.73 - 0.80
print("NB Tfidf:               ", accuracy_nb_tfidf, " | ", accuracy_nb_tfidf_grid) # 0.74 - 0.81

print("--------------------------------------------------------------------------")
print("SVM Count Vectors:      ", accuracy_svm_bow, "   | ", accuracy_svm_bow_grid) # 0.77 - 0.77
print("SVM Tfidf:              ", accuracy_svm_tfidf, " | ", accuracy_svm_tfidf_grid) # 0.76 - 0.74