In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import TreebankWordTokenizer
nltk.download("wordnet", quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import time


In [2]:
df = pd.read_csv('clean-data-LIMPO.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
pd.set_option('display.max_colwidth', None)

In [3]:
df.head()

Unnamed: 0,comment,toxic,severe_toxic,obscene,threat,insult,identity_hate,class
0,Can you please explain \n\nhow i 'attacked' you and that Liz creep. I believe i have done nothing wrong and if anything you and that Liz creep are blackmailing me. Please after this leave me alone and stop harassing a 14 year old,0,0,0,0,0,0,0
1,"""\n I have edited numerous, probably more than a dozen, FA articles in the past. My experience on FA's is that they are far easier to edit after they have been removed from the Main Page. Once out of the """"spotlight"""", simple edits such as those I normally do aren't scrutinized so closely. Although, today with Irataba, I must admit I was extra bold—far bolder than I usually get with FAs. I am usually contented by correcting a typo or two. """,0,0,0,0,0,0,0
2,"I'd actually suggest WP:AN, I'd rather keep out of this one thanks ) TT(talk)",0,0,0,0,0,0,0
3,"I think your position is very weird. \n\n1. This is about English people, not about Americans. \n2. What do you imply that the majority of people in England descend from the 17th century?\n\nI will just left it here for other people to judge, but beleive me, your position is among the most weird ones that I have seen up to now.",0,0,0,0,0,0,0
4,"They're all around us, among us, in many cases we are the ones making promises that we never keep, unless for selfish reasons. The only end is usefulness, the means are every possibility, the stakes as high as can be, and winning is an imperative - and don't give anybody else a chance to participate. In the logic of this game, the only rule is being shrewd: no scruples, no respect for others because the last ones will remain the last if the first are inaccessible. They are many of them, arrogant with the weakest, servile with the powerful, they are replicants, they are all identical, look at them: they hide behind masks and they're indistinguishable. They climb upwards like lizards, and if they lose their tail, they buy a new one. They do what they want so that people will know what they've done: they spend money, spread money, they are what they own....",0,0,0,0,0,0,0


In [4]:
df['comment'] = [s.lower() for s in df.comment]

#### Train-test split


In [5]:
x = df["comment"]
y = df["class"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=100)

##### Passive Aggressive Classifier (default hyperparameters)


In [44]:
clf = {'PassiveAggressiveClassifier':PassiveAggressiveClassifier()}

In [45]:
def cross_val_report(clf_,X_,y_,cv_):
    '''This function gives cross-validated training accuracy for given classifier'''
    ti = time.time()
    # pipeline is created to vectorize, Tfidf transform and then classify 
    pipeline = Pipeline([('bow',CountVectorizer()), ('tfidf',TfidfTransformer()), ('clf',clf_)])
    acc_list = cross_val_score(pipeline, X_, y_, cv=cv_, scoring='accuracy')
    mean_ = round(100*acc_list.mean(),1)
    err_ = round(100*np.sqrt(acc_list.std()/(cv_-1)),1)
    accuracy_ = str(mean_)+' $\pm$ '+str(err_)
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    return accuracy_, time_

In [46]:
accuracy = []
time_taken = []

for i in range(len(clf)):
    accuracy_, time_ = \
    cross_val_report(list(clf.values())[i],X_train,y_train,10)
    accuracy.append(accuracy_)
    time_taken.append(time_)
    print(list(clf.keys())[i]+' took {} sec'.format(time_))

PassiveAggressiveClassifier took 21.0 sec


In [47]:
# dataframe for cross validation

cross_val_data = {'Train Accuracy (%)': accuracy, 'Time (sec)': time_taken}
df1 = pd.DataFrame(index=clf.keys(), data=cross_val_data, columns=cross_val_data.keys())

In [48]:
df1

Unnamed: 0,Train Accuracy (%),Time (sec)
PassiveAggressiveClassifier,88.8 $\pm$ 2.5,21.0


##### Passive Aggressive Classifier (tuned hyperparameters)


In [49]:
hp = {'PassiveAggressiveClassifier':PassiveAggressiveClassifier(C = 10.**np.arange(-3, 3),
                                 max_iter = list(range(1_000,10_100,100)))}


In [50]:
def cross_val_report(hp_,X_,y_,cv_):
    '''This function gives cross-validated training accuracy for given classifier'''
    ti = time.time()
    # pipeline is created to vectorize, Tfidf transform and then classify 
    pipeline = Pipeline([('bow',CountVectorizer()), ('tfidf',TfidfTransformer()), ('hp',hp_)])
    acc_list = cross_val_score(pipeline, X_, y_, cv=cv_, scoring='accuracy')
    mean_ = round(100*acc_list.mean(),1)
    err_ = round(100*np.sqrt(acc_list.std()/(cv_-1)),1)
    accuracy_ = str(mean_)+' $\pm$ '+str(err_)
    
    tf = time.time()
    time_ = round(tf-ti,1)
    
    return accuracy_, time_

In [51]:
accuracy = []
time_taken = []

for i in range(len(hp)):
    accuracy_, time_ = \
    cross_val_report(list(hp.values())[i],X_train,y_train,10)
    accuracy.append(accuracy_)
    time_taken.append(time_)
    print(list(hp.keys())[i]+' took {} sec'.format(time_))

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/bianca/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/bianca/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/bianca/.local/lib/python3.10/site-packages/sklearn/linear_model/_passive_aggressive.py", line 289, in fit
    self._validate_params()
  File "/home/bianca/.local/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py", line 136, in _validate_params
    if self.max_iter is not None and self.max_iter <= 0:
TypeError: '<=' not supported between instances of 'list' and 'int'


In [52]:
clf.fit(X_train. y_train)

AttributeError: 'dict' object has no attribute 'fit'