In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path)
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Logistic Regression Using Count Vectors and TFIDF-vectors

#### Split train-test data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split, and
# everything derived from it, is identical on every Restart & Run All.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

CountVectorizer can lowercase text, strip punctuation and drop stop words, but it does not LEMMATIZE or STEM on its own.

Analysing Count Vectorizer ... 

1. Feature in Bag of Words (BOW)

In [55]:
# Word-level unigram bag of words with English stop words removed; every
# setting is spelled out so it is easy to tweak one at a time.
bow_params = dict(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
)
count_vec = CountVectorizer(**bow_params)

In [56]:
# Use the comment with index label 0 as a one-document corpus. Reading it from
# `df` by label is deterministic, whereas df_train["comment_text"][0] raises
# KeyError whenever the random split sends label 0 to the test set.
text = [df.loc[0, "comment_text"]]

In [59]:
# Learn the vocabulary from the single document, then encode that document.
count_train = count_vec.fit(text)
bag_of_words = count_vec.transform(text)

# get_feature_names() was removed in scikit-learn 1.2. get_feature_names_out()
# returns a NumPy array, so .tolist() keeps the printed output a plain list.
print("Every feature:\n{}".format(count_vec.get_feature_names_out().tolist()))

Every feature:
['205', '27', '38', '89', 'closure', 'dolls', 'don', 'edits', 'explanation', 'fac', 'fan', 'gas', 'hardcore', 'just', 'metallica', 'new', 'page', 'remove', 'retired', 'reverted', 'talk', 'template', 'username', 'vandalisms', 'voted', 'weren', 'york']


Vocabulary and Vocabulary ID

In [64]:
# count_train is the fitted vectorizer; its vocabulary_ dict is keyed by term.
vocabulary = count_train.vocabulary_
print("Vocabulary Size : {} ".format(len(vocabulary)))
print("Vocabulary content:\n {}".format(vocabulary.keys()))

Vocabulary Size : 27 
Vocabulary content:
 dict_keys(['explanation', 'edits', 'username', 'hardcore', 'metallica', 'fan', 'reverted', 'weren', 'vandalisms', 'just', 'closure', 'gas', 'voted', 'new', 'york', 'dolls', 'fac', 'don', 'remove', 'template', 'talk', 'page', 'retired', '89', '205', '38', '27'])


2. N-Grams (sets of consecutive words) [N=(1,2)]

In [65]:
# Same stop-word filtering as before, now emitting unigrams and bigrams.
count_vec = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)

In [66]:
# Use the comment with index label 0 as a one-document corpus. Reading it from
# `df` by label is deterministic, whereas df_train["comment_text"][0] raises
# KeyError whenever the random split sends label 0 to the test set.
text = [df.loc[0, "comment_text"]]

In [67]:
# Fit the bigram vectorizer on the sample document, then encode it with the
# fitted object that fit() hands back.
count_train = count_vec.fit(text)
bag_of_words = count_train.transform(text)

In [68]:
# get_feature_names() was removed in scikit-learn 1.2. get_feature_names_out()
# returns a NumPy array, so .tolist() keeps the printed output a plain list.
print(count_vec.get_feature_names_out().tolist())

['205', '205 38', '27', '38', '38 27', '89', '89 205', 'closure', 'closure gas', 'dolls', 'dolls fac', 'don', 'don remove', 'edits', 'edits username', 'explanation', 'explanation edits', 'fac', 'fac don', 'fan', 'fan reverted', 'gas', 'gas voted', 'hardcore', 'hardcore metallica', 'just', 'just closure', 'metallica', 'metallica fan', 'new', 'new york', 'page', 'page retired', 'remove', 'remove template', 'retired', 'retired 89', 'reverted', 'reverted weren', 'talk', 'talk page', 'template', 'template talk', 'username', 'username hardcore', 'vandalisms', 'vandalisms just', 'voted', 'voted new', 'weren', 'weren vandalisms', 'york', 'york dolls']


3. N-Grams (sets of consecutive words) [N=(1,3)]

In [70]:
# Extend the n-gram window to cover unigrams, bigrams and trigrams.
count_vec = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
)

In [71]:
# Fit the trigram vectorizer on the sample document, then encode it with the
# fitted object that fit() hands back.
count_train = count_vec.fit(text)
bag_of_words = count_train.transform(text)

In [72]:
# get_feature_names() was removed in scikit-learn 1.2. get_feature_names_out()
# returns a NumPy array, so .tolist() keeps the printed output a plain list.
print(count_vec.get_feature_names_out().tolist())

['205', '205 38', '205 38 27', '27', '38', '38 27', '89', '89 205', '89 205 38', 'closure', 'closure gas', 'closure gas voted', 'dolls', 'dolls fac', 'dolls fac don', 'don', 'don remove', 'don remove template', 'edits', 'edits username', 'edits username hardcore', 'explanation', 'explanation edits', 'explanation edits username', 'fac', 'fac don', 'fac don remove', 'fan', 'fan reverted', 'fan reverted weren', 'gas', 'gas voted', 'gas voted new', 'hardcore', 'hardcore metallica', 'hardcore metallica fan', 'just', 'just closure', 'just closure gas', 'metallica', 'metallica fan', 'metallica fan reverted', 'new', 'new york', 'new york dolls', 'page', 'page retired', 'page retired 89', 'remove', 'remove template', 'remove template talk', 'retired', 'retired 89', 'retired 89 205', 'reverted', 'reverted weren', 'reverted weren vandalisms', 'talk', 'talk page', 'talk page retired', 'template', 'template talk', 'template talk page', 'username', 'username hardcore', 'username hardcore metallica',

4. TfidfVectorizer

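formula used (with smooth_idf=False, as in the cell below):
             tf-idf(t, d) = tf(t, d) * idf(t)

            * tf(t, d) = the term frequency: the number of times term 't' appears in document 'd'
            * idf(t) = the inverse document frequency: ln(n / df(t)) + 1, where n is the total number of documents and df(t) is the number of documents that contain term 't' (e.g. a term found in all 3 of 3 documents gets idf = ln(3/3) + 1 = 1.0)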

In [74]:
# Tiny three-document corpus: 'not' occurs in every document and is repeated
# four times in the second one.
txt1 = [
    'His smile was not perfect',
    'His smile was not not not not perfect',
    'she not sang',
]

In [75]:
# Turn off idf smoothing, sublinear tf scaling and normalisation for this demo.
tf = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    sublinear_tf=False,
)
# Fit on the toy corpus, then encode the same corpus with the fitted object.
txt_fitted = tf.fit(txt1)
txt_transformed = txt_fitted.transform(txt1)
print("The text: ", txt1)

The text:  ['His smile was not perfect', 'His smile was not not not not perfect', 'she not sang']


In [79]:
# NOTE(review): scikit-learn documents vocabulary_ as a term -> column-index mapping;
# the float values in the saved output look like idf weights — re-run to confirm.
tf.vocabulary_

{'his': 1.4054651081081644, 'not': 1.0, 'perfect': 1.4054651081081644, 'sang': 2.09861228866811, 'she': 2.09861228866811, 'smile': 1.4054651081081644, 'was': 1.4054651081081644}


In [81]:
# Keep the learned idf weights; the next cell pairs them with the feature names.
idf = tf.idf_

In [83]:
# Pair each term with its idf weight. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out().tolist() yields the same plain
# Python strings for the dict keys.
print(dict(zip(txt_fitted.get_feature_names_out().tolist(), idf)))

{'his': 1.4054651081081644, 'not': 1.0, 'perfect': 1.4054651081081644, 'sang': 2.09861228866811, 'she': 2.09861228866811, 'smile': 1.4054651081081644, 'was': 1.4054651081081644}


In [85]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# already returns a NumPy array, so it can be fancy-indexed directly.
feature_names = np.asarray(tf.get_feature_names_out())
# Positions of the terms ordered from smallest to largest idf weight.
sorted_by_idf = np.argsort(tf.idf_)

print("Features with lowest idf: \n{}".format(feature_names[sorted_by_idf[:3]]))
print("Features with highest idf: \n{}".format(feature_names[sorted_by_idf[-3:]]))

Features with lowest idf: 
['not' 'his' 'perfect']
Features with highest idf: 
['was' 'sang' 'she']
