# Libraries and packages

In [1]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import nltk
import scipy
import time

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import PorterStemmer
from multiprocessing import Process
from sklearn.linear_model import LogisticRegression
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.language import Language

# Data

In [3]:
# Build a spaCy tokenizer on top of an empty vocabulary. Only the vocab is
# passed to Tokenizer, so no prefix/suffix/infix rules are configured here.
nlp = Language(Vocab())
tokenizer = Tokenizer(nlp.vocab)
def tokenization_process(string):
    """Tokenize ``string`` and return its tokens as plain ``str`` objects.

    The module-level ``tokenizer`` does the splitting; each token it yields
    is converted with ``str`` so callers get a list of strings.
    """
    return [str(token) for token in tokenizer(string)]

In [None]:
print(type(nltk.word_tokenize(Traintable.iloc[0].comment_text)[0]))
tokenization_process(Traintable.iloc[0].comment_text)

In [34]:
Traintable = pd.read_csv('Recursos/comments_toxicos/Dados/train.csv')
Testtable = pd.read_csv('Recursos/comments_toxicos/Dados/test.csv')
subm = pd.read_csv('Recursos/comments_toxicos/Dados/sample_submission.csv')

In [5]:
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [6]:
RawX = Traintable.comment_text
RawX_test = Testtable.comment_text

In [7]:
# One label column per entry of label_cols, in the same order.
RawY = [Traintable[label] for label in label_cols]

# Data analysis

In [8]:
count = (Traintable.toxic + Traintable.severe_toxic + Traintable.insult + Traintable.threat
         + Traintable.obscene + Traintable.identity_hate)

In [9]:
Traintable['sum'] = count

In [None]:
print(Traintable[Traintable.comment_text == ''].shape[0])
print(Testtable[Testtable.comment_text == ''].shape[0])

In [10]:
# Replace missing comments with a placeholder string so the per-comment
# string processing below always receives text.
# NOTE(review): this calls fillna(inplace=True) on a column selected from the
# frame, which only updates the frame when that selection is not a copy --
# confirm this holds for the pandas version in use.
Traintable['comment_text'].fillna('Text Missing', inplace=True)
Testtable['comment_text'].fillna('Text Missing', inplace=True)

#### *Uppercase Count*

In [58]:
def count_upper_norm(X):
    """Return, for each comment in ``X``, the fraction of characters that are uppercase.

    Parameters
    ----------
    X : iterable of str
        The comments, e.g. a pandas Series or a list. Values are visited in
        iteration order, so the index labels of a Series do not matter.

    Returns
    -------
    list of float
        One ratio per comment, in order. An empty comment yields 0.0.
    """
    ratios = []
    for comment in X:
        if not comment:
            # An empty comment has no characters to divide by.
            ratios.append(0.0)
            continue
        upper = sum(1 for character in comment if character.isupper())
        ratios.append(upper / len(comment))
    return ratios

In [59]:
Traintable['uppercase_count_norm'] = count_upper_norm(Traintable.comment_text)

In [60]:
Testtable['uppercase_count_norm'] = count_upper_norm(Testtable.comment_text)

#### *Swear Count*

In [61]:
# Load the banned-word list. Every cell of every CSV row is one entry; entries
# are stripped of surrounding whitespace and lower-cased.
# The names `f` and `ps` stay bound after this cell because
# swear_count_norm refers to module-level `f` and `ps`.
with open('Recursos/comments_toxicos/full-list-of-bad-words-banned-by-google.csv') as f:
    reader = csv.reader(f)
    swearsBoW = [cell.strip().lower() for row in reader for cell in row]
ps = PorterStemmer()
# Stemmed form of each entry, aligned index-by-index with swearsBoW.
swearsBoW_stemm = [ps.stem(word) for word in swearsBoW]

In [62]:
def swear_count_norm(comments):
    """Return, for each comment, the fraction of its tokens that are swear words.

    A token counts when its text is in ``swearsBoW`` or its Porter stem is in
    ``swearsBoW_stemm``. Comments are lower-cased before tokenization.

    Parameters
    ----------
    comments : iterable of str
        The comments to score.

    Returns
    -------
    list of float
        One ratio per comment, in order; 0.0 for a comment with no tokens.

    Notes
    -----
    Relies on the module-level names ``tokenizer``, ``ps``, ``swearsBoW``,
    ``swearsBoW_stemm`` and ``f``. ``f.value`` is reset to 0 and incremented
    once per processed comment, so it tracks how many comments were scored.
    """
    # Build sets once so each membership test is O(1), not a list scan.
    swear_words = set(swearsBoW)
    swear_stems = set(swearsBoW_stemm)

    tokenized = [tokenizer(comment.lower()) for comment in comments]
    print('tokenized')
    f.value = 0
    swear_count = []
    for sentence in tokenized:
        try:
            hits = sum(
                1 for word in sentence
                if str(word) in swear_words or ps.stem(str(word)) in swear_stems
            )
        except Exception:
            # Best effort: if matching fails for a comment, score it as
            # containing no swear words and keep going.
            hits = 0
        n_tokens = len(sentence)
        # A comment without tokens has nothing to normalise by.
        swear_count.append(hits / n_tokens if n_tokens else 0.0)
        f.value += 1
    return swear_count

In [63]:
Traintable['swear_count_norm'] = swear_count_norm(Traintable.comment_text)

tokenized


In [64]:
Testtable['swear_count_norm'] = swear_count_norm(Testtable.comment_text)

tokenized


In [None]:
Traintable[['uppercase_count_norm','sum']].groupby('sum').mean()

In [None]:
Traintable[['sum','swear_count_norm']].groupby('sum').mean()

In [18]:
TFIDF = TfidfVectorizer(max_df= 0.8 , min_df= 2, tokenizer = tokenization_process, ngram_range= (1,2),
                        stop_words = 'english')
TF = CountVectorizer(max_df= 0.8 , min_df= 2, tokenizer = tokenization_process, ngram_range= (1,2),
                     stop_words = 'english')

In [36]:
X= TFIDF.fit_transform(RawX)
X_test = TFIDF.transform(RawX_test)

In [20]:
def featureselect(Model, X, Y):
    """Fit ``Model`` on ``(X, Y)`` and return ``X`` reduced to the features it selects.

    Parameters
    ----------
    Model : estimator
        Fitted in place by this call, then wrapped in ``SelectFromModel``
        with ``prefit=True``.
    X : feature matrix
    Y : target values

    Returns
    -------
    The result of ``SelectFromModel.transform(X)``.
    """
    # SelectFromModel is not imported at the top of the file, so bring it
    # into scope here.
    from sklearn.feature_selection import SelectFromModel

    Model.fit(X, Y)
    print('Model Fited')
    featureselectmodel = SelectFromModel(Model, prefit = True)
    return featureselectmodel.transform(X)

In [21]:
def join_sparce_series(sparse, series):
    """Append ``series`` to ``sparse`` as one extra column on the right.

    ``series`` is turned into a sparse matrix, transposed, and stacked
    horizontally after the columns of ``sparse``. The stacked matrix from
    ``scipy.sparse.hstack`` is returned as is.
    """
    extra_column = csr_matrix(series).transpose()
    return scipy.sparse.hstack([sparse, extra_column])

In [65]:
final_train_X = join_sparce_series(X,Traintable.uppercase_count_norm)
final_train_X = join_sparce_series(final_train_X,Traintable.swear_count_norm)
final_test_X = join_sparce_series(X_test,Testtable.uppercase_count_norm)
final_test_X = join_sparce_series(final_test_X,Testtable.swear_count_norm)

In [66]:
# Re-wrap the stacked matrices as CSR. `pr` below selects rows of
# final_train_X with a boolean mask -- presumably the reason for the
# conversion; confirm.
final_train_X = scipy.sparse.csr_matrix(final_train_X)
final_test_X = scipy.sparse.csr_matrix(final_test_X)

In [45]:
# dual=True is only implemented by the liblinear solver, so request it
# explicitly instead of relying on the library's default solver.
logisticmodel = LogisticRegression(C=4, dual=True, solver='liblinear')

In [46]:
# NOTE(review): this empty frame is never filled here; Prediction is rebound
# when the submission frame is assembled in the next cell.
Prediction = pd.DataFrame()
# One column per label, in label_cols order.
preds = np.zeros((len(Testtable), len(label_cols)))
for i,j in enumerate(label_cols):
    # The same estimator object is refit for every label.
    logisticmodel.fit(final_train_X, Traintable[j])
    # Keep column 1 of predict_proba (presumably the positive class -- confirm).
    preds[:,i] = logisticmodel.predict_proba(final_test_X)[:,1]

In [47]:
Prediction = pd.DataFrame(Testtable['id'])
Prediction = pd.concat([Prediction, pd.DataFrame(preds, columns = label_cols)], axis= 1)
Prediction.to_csv('submission.csv', index = False)

In [None]:
final_train_X.shape

In [None]:
print(final_train_X.shape)
print(final_test_X.shape)

In [None]:
model = MultinomialNB()

In [None]:
model.fit(final_train_X,RawY[0])

In [None]:
model.feature_log_prob_

### Jeremy's NBSVM

In [67]:
def pr(y_i, y):
    """Return the smoothed per-feature sum over the rows whose label equals ``y_i``.

    Column sums of the module-level ``final_train_X`` are taken over the rows
    where ``y == y_i``; 1 is added to both the sums and the row count before
    dividing.
    """
    selected = y == y_i
    feature_totals = final_train_X[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

In [68]:
def get_mdl(y):
    """Fit a logistic regression for one label on ratio-scaled features.

    Parameters
    ----------
    y : pandas Series (anything exposing ``.values``)
        Label column aligned with the rows of the module-level
        ``final_train_X``.

    Returns
    -------
    tuple
        ``(fitted_model, r)`` where ``r`` is ``log(pr(1, y) / pr(0, y))``.
        Callers must multiply their feature matrix by the same ``r`` before
        predicting with the model.
    """
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    print(r.shape)
    # Debug output: this prints the shape of the module-level X, which is not
    # the matrix being fitted (final_train_X).
    print(X.shape)
    # dual=True is only implemented by the liblinear solver, so request it
    # explicitly instead of relying on the library's default solver.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = final_train_X.multiply(r)
    return m.fit(x_nb, y), r

In [69]:
# One column per label, in label_cols order.
preds = np.zeros((len(Testtable), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    # A separate model is fitted per label; r is the per-feature log ratio
    # returned alongside it by get_mdl.
    m,r = get_mdl(Traintable[j])
    # Scale the test features by the same r used for training, then keep
    # column 1 of predict_proba (presumably the positive class -- confirm).
    preds[:,i] = m.predict_proba(final_test_X.multiply(r))[:,1]

fit toxic
(1, 426007)
(159571, 426005)
fit severe_toxic
(1, 426007)
(159571, 426005)
fit obscene
(1, 426007)
(159571, 426005)
fit threat
(1, 426007)
(159571, 426005)
fit insult
(1, 426007)
(159571, 426005)
fit identity_hate
(1, 426007)
(159571, 426005)


In [70]:
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = label_cols)], axis=1)
submission.to_csv('submission.csv', index=False)

In [None]:
Var = final_train_X.multiply(np.log(pr(1,RawY[0])/pr(0,RawY[0])))

In [None]:
print(Var)
print(model.class_prior)

In [None]:
scipy.sparse.csr_matrix(final_train_X)

In [None]:
type(final_train_X)

In [None]:
a = (1,2,3)
b = [1,2,3]

In [None]:
b[0] = 25

In [None]:
b

In [None]:
x = [ 1 for i in range(1,3)]
sum(x)

In [None]:
len(x)

###### Small test table

In [None]:
Smalltable = Traintable.iloc[range(0,int(Traintable.shape[0]/100))]

In [None]:
swear_count_norm(Smalltable.comment_text)

In [None]:
Smalltable.iloc[295].comment_text

In [None]:
Smalltable['swear_count_norm'].describe()

In [None]:
Swearscount = swear_count_norm(Smalltable.comment_text)

In [None]:
Smalltable.shape
Smalltable['comment_text']
type(Smalltable['comment_text'])
i = 0
for comment in Smalltable['comment_text']:
    if i == 1:
        break
    i = 1
    print(type(comment))


In [None]:
Smalltable['swear_count_norm'] = Swearscount

In [None]:
Traintable['swear_count_norm'] = swear_count_norm(Traintable.comment_text)

In [None]:
type(f.value)

In [None]:
x = [Traintable.iloc[int(f.value)].comment_text, Traintable.iloc[1].comment_text]

In [None]:
swear_count_norm(x)

In [None]:
Tokenizer?

In [None]:
 tokenizer = Tokenizer(Vocab)

In [None]:
import nlp

In [None]:
Tokenizer?

In [None]:
from spacy.vocab import Vocab

In [None]:
Vocab?

In [None]:
Comments = Smalltable.comment_text
time0 = time.time()
wordlist = [tokenizer(comment) for comment in Comments]
time1 = time.time()
print(time1-time0)
wordlist = [nltk.word_tokenize(comment) for comment in Comments]
time2 = time.time()
print(time2-time1)

In [None]:
Traintable.swear_count_norm.describe()

In [None]:
def CBTW_vec(Term_Frequency_Matrix, Y):
    """Compute one weight per column: ``log(1 + (A/B) * (A/C))``.

    For each column of ``Term_Frequency_Matrix``, only the rows where that
    column is non-zero are considered:

    * ``A`` -- sum of ``Y`` over those rows, plus 1
    * ``B`` -- number of those rows whose ``Y`` is zero, plus 1
    * ``C`` -- the same quantity as ``A``

    The counts are obtained with sparse matrix products, so no column is
    ever converted to a dense array.

    Parameters
    ----------
    Term_Frequency_Matrix : scipy sparse matrix
        Rows must align with the entries of ``Y``.
    Y : pandas Series (anything exposing ``.values``)
        Numeric labels, one per row.

    Returns
    -------
    numpy.ndarray
        1-D array with one weight per column.

    NOTE(review): C duplicates A, so A/C is always 1 and the result reduces
    to log(1 + A/B). This looks unintended -- confirm the intended formula
    before relying on these weights.
    """
    labels = np.asarray(Y.values)
    # 0/1 indicator of "column is non-zero in this row".
    presence = (Term_Frequency_Matrix != 0).astype(np.int64)
    presence_t = presence.T

    # Sum of labels over the rows where each column is present.
    A = np.asarray(presence_t.dot(labels)).ravel() + 1
    # Rows where the column is present, minus those with a non-zero label.
    rows_with_term = np.asarray(presence.sum(axis=0)).ravel()
    nonzero_label_rows = np.asarray(
        presence_t.dot((labels != 0).astype(np.int64))
    ).ravel()
    B = rows_with_term - nonzero_label_rows + 1
    C = A
    return np.log(1+(A/B)*(A/C))

In [None]:
TermFrequency = TF.fit_transform(RawX)

In [None]:
TermFrequency.shape

In [None]:
CBTW_vec(TermFrequency, RawY[0])

In [None]:
from ipywidgets import FloatProgress
from IPython.display import display
f = FloatProgress(min=0, max=620924)
display(f)

In [None]:
f.value

In [None]:
np.apply_over_axes

In [None]:
A = np.array([[2,3,4,5],[6,7,8,9]])
np.sum(A, axis = 1)

In [None]:
x = time.time()

In [None]:
x

In [None]:
y = time.time()
y

In [None]:
print(y-x)

In [None]:
aux = time.time()
newtab = TF.fit_transform(Traintable.comment_text)
print(time.time()-aux)

In [None]:
TermFrequency

In [48]:
import re
import string

# Character class built from string.punctuation plus extra typographic
# symbols; the capture group lets the substitution re-insert the match.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` on whitespace after padding every ``re_tok`` match with spaces.

    Each matched symbol therefore becomes a token of its own.
    """
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [52]:
COMMENT = 'comment_text'
n = Traintable.shape[0]
# use_idf / smooth_idf / sublinear_tf are boolean switches; pass real
# booleans so strict parameter validation does not reject the integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True )
# Fit the vocabulary on the training comments only, then reuse it for test.
trn_term_doc = vec.fit_transform(Traintable[COMMENT])
test_term_doc = vec.transform(Testtable[COMMENT])

In [53]:
X = trn_term_doc
X_test = test_term_doc