In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn import utils

import matplotlib.pyplot as plt

from textblob import TextBlob

from collections import Counter

In [2]:
# Load the labelled training tweets; later cells read the `tweet` and `label` columns.
df = pd.read_csv('data/train_E6oV3lV.csv')

### TEXT PROCESSING

#### NORMALIZATION AND CLEANING

##### counting hashtags

In [3]:
def separate_hashtags(tweet):
    """Return `tweet` with a space inserted in front of every '#' character."""
    return re.sub(r'#', ' #', tweet)
# Keep the hashtag-separated text in its own column so the raw `tweet` column stays untouched.
df['tweet_sep_hasht'] = df.tweet.apply(separate_hashtags)

In [4]:
# Collect hashtags per class: tweets labelled 0 feed the "normal" list,
# every other label feeds the "racist" list.
hashtag_list_normal = []
hashtag_list_racist = []
for text, label in zip(df['tweet_sep_hasht'], df.label):
    tags = [token.strip("#") for token in text.split() if token.startswith("#")]
    if label == 0:
        hashtag_list_normal.extend(tags)
    else:
        hashtag_list_racist.extend(tags)

counts_n = Counter(hashtag_list_normal)
counts_r = Counter(hashtag_list_racist)

# Sort ascending by frequency, keep the last (most frequent) 50 entries of each class,
# then discard names of length <= 1. Each list may therefore hold fewer than 50 names.
ranked_n = sorted(counts_n.items(), key=lambda pair: pair[1])
ranked_r = sorted(counts_r.items(), key=lambda pair: pair[1])
list50_best = [name for name, count in ranked_n[-50:] if len(name) > 1]
list50_worst = [name for name, count in ranked_r[-50:] if len(name) > 1]
list100_best_worst = list50_best + list50_worst

In [5]:
# Binary indicator matrix: one row per tweet, one column per entry of `list100_best_worst`.
# The width stays fixed at 100 (as before) even when the list holds fewer names.
relevant_hastags_matrix = np.zeros([df.shape[0], 100])

# Map each hashtag to the column of its FIRST occurrence in the list. This matches what
# list.index() returned, but is a constant-time lookup and needs no exception handling
# (the previous bare `except: pass` hid every kind of error, not just "not in list").
hashtag_column = {}
for column, hashtag in enumerate(list100_best_worst):
    hashtag_column.setdefault(hashtag, column)

for row, t in enumerate(df['tweet_sep_hasht']):
    for w in t.split():
        if not w.startswith("#"):
            continue
        column = hashtag_column.get(w.strip("#"))
        if column is not None:
            relevant_hastags_matrix[row, column] = 1

##### cleaning and normalizing

In [6]:
from textacy.preprocess import preprocess_text

# Normalise the raw tweets with textacy, using exactly the options listed in the call
# (unicode fixing, lowercasing, URL/e-mail/phone/number/currency handling, punctuation,
# accents and contractions).
df['tweet_processed'] = df['tweet'].apply(
    lambda x: preprocess_text(x, fix_unicode=True, lowercase=True, no_urls=True,
                              no_emails=True, no_phone_numbers=True, no_numbers=True,
                              no_currency_symbols=True, no_punct=True, no_accents=True,
                              no_contractions=True))
# NOTE: emoji separation is deliberately NOT applied in this cell. `separate_emojis` is
# only defined in the emoji section further down, which also applies it to this column;
# calling it here would raise NameError when the notebook is run top to bottom.

##### emojis

In [3]:
import emoji
# Build a lookup from every value of `emoji.EMOJI_UNICODE` to the same string padded
# with one space on each side.
keys = list(emoji.EMOJI_UNICODE.values())
# Entry 2620 is excluded from the mapping; the reason is not visible in this notebook.
# TODO: document which emoji this is and why it is dropped.
del keys[2620]
values = [" " + symbol + " " for symbol in keys]
dictionary_emojis = dict(zip(keys, values))

import re

def separate_emojis(text):
    """Replace every emoji listed in the global `dictionary_emojis` by its mapped value.

    Parameters
    ----------
    text : str
        Text that may contain emojis.

    Returns
    -------
    str
        `text` with each matched emoji substituted by `dictionary_emojis[emoji]`.
        Returned unchanged when the dictionary is empty.

    Notes
    -----
    * The compiled pattern is cached on the function and rebuilt only when
      `dictionary_emojis` is rebound to another dict or changes size. It used to be
      rebuilt for every single tweet.
    * Alternatives are ordered longest first so that a multi-codepoint emoji is matched
      as one unit instead of being split at a shorter emoji that prefixes it.
    """
    mapping = dictionary_emojis
    if not mapping:
        # An empty alternation would match the empty string at every position.
        return text

    cache = separate_emojis.__dict__
    if cache.get("_source") is not mapping or cache.get("_size") != len(mapping):
        longest_first = sorted(mapping, key=len, reverse=True)
        cache["_pattern"] = re.compile("|".join(re.escape(item) for item in longest_first))
        cache["_source"] = mapping
        cache["_size"] = len(mapping)

    return cache["_pattern"].sub(lambda m: mapping[m.group(0)], text)

In [5]:
df.tweet_processed.iloc[3]

'model i love u take with u all the time in ur📱 😙😎👄👅💦💦💦'

In [6]:
# Run the emoji substitution over every processed tweet (the function is passed directly,
# no wrapper lambda needed).
df.tweet_processed = df.tweet_processed.apply(separate_emojis)

In [7]:
df.tweet_processed.iloc[3]

'model i love u take with u all the time in ur 📱   😙  😎  👄  👅  💦  💦  💦 '

In [8]:
#remove double+ whitespace and words with only one letter
def processTweet(tweet):
    """Drop one-character words and squeeze whitespace runs in a tweet.

    Steps, in order:
      1. delete every token that consists of a single word character;
      2. replace each run of two or more whitespace characters by one space;
      3. strip spaces from the start of the string (trailing spaces are kept).
    """
    without_short_words = re.sub(r'\b\w{1}\b', '', tweet)
    squeezed = re.sub(r'\s\s+', ' ', without_short_words)
    return squeezed.lstrip(' ')

# Apply the short-word / whitespace clean-up to every processed tweet.
df.tweet_processed = df.tweet_processed.apply(processTweet)

#### REMOVE STOP WORDS

In [9]:
from nltk.corpus import stopwords
# English stop-word list from NLTK. The removal step below is commented out, so this cell
# does not actually drop any stop words from `tweet_processed`.
stop = stopwords.words('english')
#df['tweet_processed'] = df['tweet_processed'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#### STEMMING

In [10]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Stem each whitespace-separated token and re-join the stems with single spaces.
df['tweet_processed'] = df['tweet_processed'].apply(
    lambda text: " ".join(map(porter_stemmer.stem, text.split()))
)

### FEATURE EXTRACTION

#### TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Fit a unigram-only TF-IDF vectoriser, capped at 4000 features, on the processed tweets
# and keep the resulting document-term matrix.
tvec = TfidfVectorizer(max_features=4000, ngram_range=(1, 1))
tf_idf_vector = tvec.fit_transform(df.tweet_processed)

#### Doc2Vec

In [19]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence



In [20]:
# `LabeledSentence` is a deprecated alias in gensim; `TaggedDocument` is the supported
# class with the same (words, tags) layout. Imported here so this cell is self-contained.
from gensim.models.doc2vec import TaggedDocument

# One tagged document per tweet: the tokens are the whitespace-split processed text and
# the single tag is the tweet's index label rendered as a string.
labeled_tweets = [
    TaggedDocument(words=t.split(), tags=[str(i)])
    for i, t in zip(df.index, df.tweet_processed)
]

  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
# Doc2Vec model with size=100. alpha and min_alpha start at the same value because the
# training cell lowers the learning rate by hand after every pass.
model_dbow = Doc2Vec(dm=0,size=100,negative=5,min_count=2,alpha=0.065,min_alpha=0.065)



In [22]:
# Build the vocabulary once, then run 30 single-epoch passes over a freshly shuffled copy
# of the corpus.
model_dbow.build_vocab(list(labeled_tweets))
for epoch in range(30):
    shuffled_tweets = utils.shuffle(list(labeled_tweets))
    model_dbow.train(shuffled_tweets, total_examples=len(labeled_tweets), epochs=1)
    # Manual learning-rate decay: lower alpha by a fixed step and pin min_alpha to it.
    model_dbow.alpha = model_dbow.alpha - 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [23]:
# Copy each tweet's trained document vector into a dense (n_tweets, 100) matrix.
doc2_vecs = np.zeros((df.shape[0], 100))
for row, label in enumerate(df.index):
    # Look the vector up by the string tag given to the tweet when the tagged documents
    # were built, and write it at the row POSITION. The previous code used the index label
    # for both, which is only correct while df keeps its default 0..n-1 index.
    doc2_vecs[row] = model_dbow.docvecs[str(label)]

#### Word embedding (glove)


In [None]:
import gensim.downloader as api
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Load the "glove-twitter-200" vectors through gensim's downloader API.
glove_twitter = api.load("glove-twitter-200")

# Settings shared by the tokenizer below and the embedding/LSTM cells further down.
embed_size = 200 # size of each word vector
max_features = 4000 # amount of unique words to use (i.e num rows in embedding vector)
maxlen = 50 # max number of words in a comment to use

# Fit the Keras tokenizer on the processed tweets, convert each tweet to a sequence of
# word ids, and pass the sequences through pad_sequences with maxlen.
# NOTE(review): the same tokenisation is repeated in the "Word level" section.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df.tweet_processed.tolist())
list_tokenized_train = tokenizer.texts_to_sequences(df.tweet_processed.tolist())
X_w2v = pad_sequences(list_tokenized_train, maxlen=maxlen)

##### Document level

In [None]:
def get_w2v_general(tweet, size, vectors, aggregation='mean'):
    """Aggregate the word vectors of a tweet into one document vector.

    Parameters
    ----------
    tweet : str
        Text that is split on whitespace into words.
    size : int
        Length of each word vector.
    vectors : mapping
        Anything supporting `vectors[word]` that raises KeyError for unknown words and
        returns an array that can be reshaped to (1, size).
    aggregation : {'mean', 'sum'}
        'sum' adds the vectors of all known words; 'mean' divides that sum by the number
        of known words (the zero vector is returned when none is known).

    Returns
    -------
    numpy.ndarray of shape (1, size)

    Raises
    ------
    ValueError
        If `aggregation` is neither 'mean' nor 'sum'. Previously the function silently
        returned None in that case.
    """
    if aggregation not in ('mean', 'sum'):
        raise ValueError(
            "aggregation must be 'mean' or 'sum', got {!r}".format(aggregation))

    vec = np.zeros((1, size))
    count = 0
    for word in tweet.split():
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Word without a vector: skip it and do not count it.
            continue
        count += 1

    if aggregation == 'mean' and count:
        vec /= count
    return vec

In [None]:
# `scale` is otherwise only imported in the "Joining features" section further down, so
# this cell failed with NameError when the notebook was run top to bottom.
from sklearn.preprocessing import scale

# Create a document representation by summing the GloVe embeddings of each tweet's words,
# stack the per-tweet rows into one matrix, then standardise it with `scale`.
X_glove = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter, 'sum') for z in df.tweet_processed]))

##### Word level

In [36]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [35]:
embed_size = 200 # how big is each word vector
# NOTE(review): an earlier note on this line said 20000 was used because that value covers
# all training tokens, but the value actually set here is 4000 - confirm which is intended.
max_features = 4000 # how many unique words to use (i.e. num rows in embedding matrix)
maxlen = 50 # max number of words in a comment to use

In [41]:
# Tokenise the processed tweets: fit the vocabulary, encode each tweet as a list of word
# ids, then pass the lists through pad_sequences with maxlen.
tweet_texts = df.tweet_processed.tolist()
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(tweet_texts)
list_tokenized_train = tokenizer.texts_to_sequences(tweet_texts)
X_w2v = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [42]:
# Compute the mean and standard deviation over all GloVe vector entries; they parameterise
# the random initialisation of embedding rows for words without a pretrained vector.
emb_mean,emb_std = glove_twitter.vectors.mean(), glove_twitter.vectors.std()

In [43]:
word_index = tokenizer.word_index
# Keras' Tokenizer numbers words from 1 (id 0 is reserved), so a vocabulary of V words
# needs V + 1 rows. The previous `len(word_index)` was one row short whenever the
# vocabulary was smaller than max_features, making the largest id index out of bounds.
nb_words = min(max_features, len(word_index) + 1)

# Start from random rows drawn with the GloVe mean/std so that words without a pretrained
# vector still get a plausible initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    try:
        embedding_matrix[i] = glove_twitter[word]
    except KeyError:
        # No pretrained vector for this word: keep its random row.
        pass

#### Joining features

In [15]:
# The combined feature set (TF-IDF + Doc2Vec + hashtag indicators) is commented out;
# only the dense TF-IDF matrix is used as X. The target is the `label` column.
#X = np.concatenate([tf_idf_vector.toarray(), doc2_vecs, relevant_hastags_matrix], axis=1)
X = tf_idf_vector.toarray()
y = df['label']

In [14]:
from sklearn.preprocessing import scale
# Standardise X with sklearn's `scale`, overwriting X in place of the unscaled matrix.
# NOTE(review): axis=1 is passed, i.e. scaling along rows rather than columns - confirm
# this is intended.
X = scale(X,axis=1)

In [32]:
# `XX` is not defined anywhere in this notebook, so a bare `del XX` raises NameError on a
# fresh run. Drop the name only if some earlier session left it behind.
globals().pop('XX', None)

In [48]:
from sklearn.decomposition import PCA
# Project X onto 750 principal components, overwriting X.
# NOTE: the recorded output of this cell is a MemoryError, so X was not replaced here in
# that session.
pca = PCA(n_components=750)
X = pca.fit_transform(X)

MemoryError: 

In [133]:
from sklearn.decomposition import IncrementalPCA
# Reduce X to 150 components with IncrementalPCA (batch_size=150), overwriting X.
# NOTE(review): the recorded output shows warning fragments from mean/division lines -
# check the transformed matrix for NaNs before training on it.
ipca = IncrementalPCA(n_components=150, batch_size=150)
X = ipca.fit_transform(X)

  explained_variance[self.n_components_:].mean()
  ret = ret.dtype.type(ret / rcount)


### MODELS

#### Tree ensembles (Random Forest / XGBoost)

In [46]:
from imblearn.under_sampling import RandomUnderSampler

In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, precision_recall_curve, classification_report
from imblearn.over_sampling import SMOTE

In [20]:
# Inverse-frequency class weights, scaled so the most frequent class has weight 1.
# value_counts() is ordered by frequency, not by label; sort_index() makes position k of
# `base_weights` belong to label k regardless of which class is the majority.
label_counts = df.label.value_counts().sort_index()
base_weights = 1/(label_counts/label_counts.max()).values

# Per-sample weights: label 0 gets base_weights[0], every other label base_weights[1].
weights = np.where(df.label.values == 0, base_weights[0], base_weights[1])

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Stratified 5-fold split; only the first fold is trained and evaluated (see `break`).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)
fold = 1

precisions = []
recalls = []

print("training")
for train_index, test_index in skf.split(X, y):
    # The splitter yields row POSITIONS, so the label Series is indexed with .iloc;
    # plain y[...] is label-based and only works while df keeps its default index.
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    weights_train = weights[train_index]

    print("creating model")
    # NOTE: the variable keeps its historical name `xgb_model`, but the estimator is a
    # class-weighted random forest. random_state pins the forest so reruns are comparable.
    xgb_model = RandomForestClassifier(n_estimators=500, class_weight='balanced',
                                       n_jobs=-1, random_state=20).fit(X_train, y_train)

    print("predicting")
    # predict outputs
    train_predictions = xgb_model.predict(X_train)
    train_actuals = y_train

    predictions = xgb_model.predict(X_test)
    prediction_probs = xgb_model.predict_proba(X_test)
    actuals = y_test

    # get train performance metrics
    print('Train report')
    print(classification_report(train_actuals, train_predictions))

    # get test performance metrics
    print('Test report')
    print(classification_report(actuals, predictions))
    break
    

training
creating model
predicting
Train report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23776
           1       1.00      1.00      1.00      1793

   micro avg       1.00      1.00      1.00     25569
   macro avg       1.00      1.00      1.00     25569
weighted avg       1.00      1.00      1.00     25569

Test report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5944
           1       0.91      0.40      0.56       449

   micro avg       0.96      0.96      0.96      6393
   macro avg       0.94      0.70      0.77      6393
weighted avg       0.95      0.96      0.95      6393



#### Keras

In [120]:
from keras.models import Sequential #Sequential Models
from keras.layers import Dense, BatchNormalization, Activation, Dropout #Dense Fully Connected Layer Type
from keras.optimizers import SGD #Stochastic Gradient Descent Optimizer
from sklearn.preprocessing import OneHotEncoder
import sklearn.metrics as metrics

In [173]:
def create_network(n_columns, lr=0.001):
    """Build and compile a feed-forward classifier with a 2-unit softmax output.

    Parameters
    ----------
    n_columns : int
        Number of input features.
    lr : float
        Learning rate handed to the SGD optimizer.

    Returns
    -------
    A compiled Sequential model: three Dense -> BatchNormalization -> ReLU stages
    (1024, 512 and 256 units) followed by Dense(2, softmax), trained with
    sparse categorical cross-entropy.
    """
    model = Sequential()

    # Only the first Dense layer declares the input shape.
    for position, width in enumerate((1024, 512, 256)):
        if position == 0:
            model.add(Dense(width, input_shape=(n_columns,)))
        else:
            model.add(Dense(width))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(2, activation='softmax'))

    # SGD with Nesterov momentum
    optimizer = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)
    return model

In [174]:
# Inverse-frequency class weights, scaled so the most frequent class has weight 1.
# sort_index() orders the counts by label, so base_weights[k] belongs to label k;
# value_counts() alone is ordered by frequency and matched the labels only by accident.
base_weights = 1/(df.label.value_counts().sort_index()/max(df.label.value_counts())).values
#base_weights[1] = base_weights[1]*1.4

In [175]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)

# Test-set F1 after every epoch, appended across all folds.
f = []
#start cross validation
for train_idx, test_idx in kf.split(X, y):
    # The splitter yields row POSITIONS: index the label Series with .iloc, because plain
    # y[...] is label-based and only works while df keeps its default index.
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #create model
    model = create_network(x_train.shape[1], 0.05)
    for i in range(100):
        # One full-batch epoch per iteration so the F1 score can be tracked per epoch.
        model.fit(x_train, y_train, batch_size=int(x_train.shape[0]),
                  epochs=1, verbose=1,
                  class_weight={0: base_weights[0], 1: base_weights[1]},
                  validation_data=(x_test, y_test))

        # Predicted class = index of the largest softmax output. The training-set
        # prediction was dropped: it was computed every epoch but never used.
        y_test_pred = model.predict(x_test).argmax(axis=1)
        f1s = metrics.f1_score(y_test, y_test_pred)
        f.append(f1s)
        print(f1s)

Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.18470539072294187
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.36003861003861
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4963350785340314
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4662402274342572
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4816446402349487
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4928193499622071
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5181598062953995
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.525781910397295
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5242718446601942
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.530396475770925
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5340136054421769
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5353535353535354
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.531223

KeyboardInterrupt: 

In [176]:
def create_network(n_columns, lr=0.001):
    """Build and compile a feed-forward classifier with a single sigmoid output.

    Parameters
    ----------
    n_columns : int
        Number of input features.
    lr : float
        Learning rate handed to the SGD optimizer.

    Returns
    -------
    A compiled Sequential model: three Dense -> BatchNormalization -> ReLU stages
    (1024, 512 and 256 units) followed by Dense(1, sigmoid), trained with binary
    cross-entropy.
    """
    hidden_widths = [1024, 512, 256]
    model = Sequential()

    # First stage carries the input shape; the remaining stages infer theirs.
    model.add(Dense(hidden_widths[0], input_shape=(n_columns,)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for width in hidden_widths[1:]:
        model.add(Dense(width))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(1, activation='sigmoid'))

    # SGD with Nesterov momentum
    optimizer = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return model

In [184]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)

# Test-set F1 after every epoch, appended across all folds.
f = []
#start cross validation
for train_idx, test_idx in kf.split(X, y):
    # The splitter yields row POSITIONS: index the label Series with .iloc, because plain
    # y[...] is label-based and only works while df keeps its default index.
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #create model
    model = create_network(x_train.shape[1], 0.05)
    for i in range(100):
        # One full-batch epoch per iteration so the F1 score can be tracked per epoch.
        model.fit(x_train, y_train, batch_size=int(x_train.shape[0]),
                  epochs=1, verbose=1,
                  class_weight={0: base_weights[0], 1: base_weights[1]},
                  validation_data=(x_test, y_test))

        # Threshold the model output at 0.5 and flatten to a 1-D integer label vector,
        # the shape f1_score expects. The training-set prediction was dropped: it was
        # computed every epoch but never used.
        y_test_pred = (model.predict(x_test) > 0.5).astype(int).ravel()
        f1s = metrics.f1_score(y_test, y_test_pred)
        f.append(f1s)
        print(f1s)

Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.21091997008227376
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.3337868480725624
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.3738317757009346
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4011299435028248
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4297820823244553
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.4441575209812783
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.47491166077738517
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5022970903522205
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.529074529074529
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5512367491166078
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5583566760037348
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.5700483091787439
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.58

0.6088709677419355
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6132264529058116
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6127744510978045
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6123260437375746
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6106719367588933
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6094674556213018
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6070726915520629
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.603515625
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6064139941690962
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6077669902912622
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6060019361084221
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6060019361084221
Train on 25569 samples, validate on 6393 samples
Epoch 1/1
0.6051873198847262
Train on 25569 samples, validate on 6393 samples
Epo

KeyboardInterrupt: 

In [44]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
#from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, precision_recall_curve, classification_report
import sklearn.metrics as metrics

In [45]:
# Inverse-frequency class weights, scaled so the most frequent class has weight 1.
# sort_index() orders the counts by label, so base_weights[k] belongs to label k;
# value_counts() alone is ordered by frequency and matched the labels only by accident.
base_weights = 1/(df.label.value_counts().sort_index()/max(df.label.value_counts())).values

In [46]:
# Switch the model input to the padded word-id sequences; the target is still `label`.
# This rebinds X, so the earlier TF-IDF based X is no longer reachable under that name.
X = X_w2v
y = df.label

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)
# Train / test F1 after every fit call, appended across all folds.
f_train = []
f = []
#start cross validation
for train_idx, test_idx in kf.split(X, y):
    # The splitter yields row POSITIONS: index the label Series with .iloc, because plain
    # y[...] is label-based and only works while df keeps its default index.
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #create model: embeddings initialised from `embedding_matrix` (kept trainable),
    # a bidirectional LSTM, and a 2-unit softmax output.
    model = Sequential()
    model.add(Embedding(max_features, embed_size, input_length=maxlen,
                        weights=[embedding_matrix], trainable=True))
    # `dropout` / `recurrent_dropout` are the Keras 2 names of the former
    # `dropout_W` / `dropout_U` arguments.
    model.add(Bidirectional(LSTM(100, dropout=0.05, recurrent_dropout=0.05,
                                 return_sequences=False)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    for i in range(30):
        # Two full-batch epochs per iteration (`epochs` replaces the Keras 1 `nb_epoch`,
        # matching the other training cells in this notebook).
        model.fit(x_train, y_train, epochs=2, batch_size=int(x_train.shape[0]),
                  verbose=2, class_weight={0: base_weights[0], 1: base_weights[1]},
                  validation_data=(x_test, y_test))

        y_train_pred = model.predict(x_train).argmax(axis=1)
        y_test_pred = model.predict(x_test).argmax(axis=1)
        f1train = metrics.f1_score(y_train, y_train_pred)
        f1test = metrics.f1_score(y_test, y_test_pred)
        f_train.append(f1train)
        f.append(f1test)
        print(i)
        print(f1train, f1test)