In [1]:
import pandas as pd
import numpy as np
import spacy

from statistics import mean, stdev
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

from sklearn.preprocessing import binarize

In [2]:
data = pd.read_csv("project_18_dataset_combined.csv")
data = data[['label', 'text']]

#set display option
pd.set_option('display.max_colwidth', None)

#make target labels boolean
data['label']=data['label'].apply(lambda x: 1 if x == 14 else 0)

In [3]:
print(data.shape)
print(data.label.value_counts())
data.head()

(19821, 2)
0    18585
1     1236
Name: label, dtype: int64


Unnamed: 0,label,text
0,0,"Serious Youth Violence crimes, lone wolves basically."
1,0,Yesterday?
2,0,"I would argue with you, but it seems pointless. If I understand you, your argument is this: the UK can do whatever it wants the UK is never wrong"
3,0,"the whole thing is pretty dumb desu, school kids could be in a class of 12 or whatever all mingling and thats fine, but if they leave school and they all go to the park then thats not not allowed"
4,0,"I fucking hope you re right, lad."


In [4]:
#Balancing data
count_label_0, count_label_1 = data.label.value_counts()

data_label_1 = data[data['label'] == 1]
data_label_0 = data[data['label'] == 0]

data_label_0_b = data_label_0.sample(count_label_1, random_state=88)
data_b = pd.concat([data_label_0_b, data_label_1])
print(data_b.shape)
print(data_b.label.value_counts())
data_b.head()

(2472, 2)
0    1236
1    1236
Name: label, dtype: int64


Unnamed: 0,label,text
17921,0,pointing out the ruse is lower IQ than the people who just sit back and enjoy the show
17572,0,I am in bed dingus I am going to sleep Do not summon me again
15394,0,boy he just has the worst opinions about everything. But is he also the anti-awoo beaner?
19111,0,DAY OF THE RAKE!
189,0,Is it Constantine filming from the hell dimension?


In [5]:
!pip3 install -U spacy



In [6]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 128 kB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
#lemmatizing
nlp = spacy.load('en_core_web_sm')
data_b['text_lemmatized'] = data_b['text'].apply(lambda x: " ".join([words.lemma_ for words in nlp(x)]))

In [8]:
#train/test splitting again
X_train, X_test, y_train, y_test = train_test_split(data_b.text_lemmatized, data_b.label, test_size=0.25, random_state=14, stratify=data_b.label)

In [9]:
#making pipeline
tfid_pipeline = Pipeline([ ('vectorizer', TfidfVectorizer()), ('classifier', MultinomialNB()) ])

In [10]:
scoring = {'Precision': 'precision', 'Recall' : 'recall', 'Accuracy' : 'accuracy', 'AUC' : 'roc_auc'}

grid = {'vectorizer__lowercase': [True, False],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'vectorizer__max_df': [1.0, 0.90, 0.80, 0.70, 0.60, 0.50, 0.40, 0.30],
        'vectorizer__min_df': [1, 2, 3, 4, 5, 10],
        'classifier__alpha': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'classifier__fit_prior': [True, False],
        'vectorizer__use_idf': [True, False],
        'vectorizer__smooth_idf': [True, False],
        'vectorizer__sublinear_tf': [True, False]}

In [11]:
grid_search = GridSearchCV(tfid_pipeline, param_grid=grid, scoring=scoring, refit='Precision', return_train_score=True, verbose=1, cv=5, n_jobs=64)

In [12]:
grid_search.fit(data_b.text_lemmatized, data_b.label)
tfid_results = grid_search.cv_results_

Fitting 5 folds for each of 149760 candidates, totalling 748800 fits


In [13]:
tfid_results

{'mean_fit_time': array([0.16105528, 0.13839927, 0.15151415, ..., 1.05229406, 1.06354675,
        1.06212482]),
 'std_fit_time': array([0.01989433, 0.0036285 , 0.00765256, ..., 0.12628642, 0.02769325,
        0.04732988]),
 'mean_score_time': array([0.06838503, 0.06579461, 0.06731892, ..., 0.19018817, 0.17153616,
        0.16820436]),
 'std_score_time': array([0.00438221, 0.00298885, 0.00412855, ..., 0.05575165, 0.04318068,
        0.03586024]),
 'param_classifier__alpha': masked_array(data=[0.01, 0.01, 0.01, ..., 1.0, 1.0, 1.0],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__fit_prior': masked_array(data=[True, True, True, ..., False, False, False],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_vectorizer__lowercase': masked_array(data=[True, True, True, ..., False, False, False],
              mask=[False, False

In [14]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7589193644447896
{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.6, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}


In [15]:
tfid_df = pd.DataFrame(grid_search.cv_results_)
tfid_report = tfid_df[['params', 'mean_test_Precision','std_test_Precision', 'mean_test_Recall', 'mean_test_Accuracy', 'mean_test_AUC']]
tfid_report.sort_values('mean_test_Precision', ascending=False)

Unnamed: 0,params,mean_test_Precision,std_test_Precision,mean_test_Recall,mean_test_Accuracy,mean_test_AUC
30045,"{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.6, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': False, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.758919,0.031609,0.715212,0.744715,0.812758
30041,"{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.6, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.758919,0.031609,0.715212,0.744715,0.812758
29753,"{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.758911,0.030434,0.714402,0.744313,0.812637
29177,"{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.9, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.758911,0.030434,0.714402,0.744313,0.812637
29757,"{'classifier__alpha': 0.05, 'classifier__fit_prior': False, 'vectorizer__lowercase': True, 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 6), 'vectorizer__smooth_idf': False, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.758911,0.030434,0.714402,0.744313,0.812637
...,...,...,...,...,...,...
141713,"{'classifier__alpha': 1.0, 'classifier__fit_prior': True, 'vectorizer__lowercase': False, 'vectorizer__max_df': 0.8, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 3), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.549201,0.011466,0.974918,0.586965,0.822660
141425,"{'classifier__alpha': 1.0, 'classifier__fit_prior': True, 'vectorizer__lowercase': False, 'vectorizer__max_df': 0.9, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 3), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.549201,0.011466,0.974918,0.586965,0.822660
142001,"{'classifier__alpha': 1.0, 'classifier__fit_prior': True, 'vectorizer__lowercase': False, 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 3), 'vectorizer__smooth_idf': True, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.549201,0.011466,0.974918,0.586965,0.822660
142005,"{'classifier__alpha': 1.0, 'classifier__fit_prior': True, 'vectorizer__lowercase': False, 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 3), 'vectorizer__smooth_idf': False, 'vectorizer__sublinear_tf': True, 'vectorizer__use_idf': False}",0.549201,0.011466,0.974918,0.586965,0.822660


In [None]:
tfid_report.to_csv('report_new.csv')