In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import time
import pickle

In [2]:
# Read the labelled training tweets and the unlabelled test tweets.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_2kmZucJ.csv", "test_oJQbWVk.csv")
)

In [3]:
# Share of each label value in the training set (normalize=True returns
# proportions rather than raw counts).
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [4]:
# Strip URLs: anything that starts with "http" up to the next whitespace.
url_pattern = re.compile(r'http\S+')
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(lambda text: url_pattern.sub('', text))

In [5]:
# Characters stripped from every tweet. Apostrophes, commas and periods are
# not in this string, so they are left in the text.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# Build the lookup set once instead of once per character inside the lambda.
punctuation_chars = set(punctuation)

# The same cleaning steps are applied to both frames, in this order.
for frame in (train, test):
    # drop the punctuation characters listed above
    frame['clean_tweet'] = frame['clean_tweet'].apply(
        lambda x: ''.join(ch for ch in x if ch not in punctuation_chars))

    # convert text to lowercase
    frame['clean_tweet'] = frame['clean_tweet'].str.lower()

    # replace every digit with a space; regex=True must be explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default, which
    # would leave all digits in place
    frame['clean_tweet'] = frame['clean_tweet'].str.replace("[0-9]", " ", regex=True)

    # collapse runs of whitespace into single spaces and trim both ends
    frame['clean_tweet'] = frame['clean_tweet'].apply(lambda x: ' '.join(x.split()))

In [8]:

# Load the small English pipeline by its full package name. The installer
# output in this notebook reports the model as loadable via 'en_core_web_sm';
# the 'en' shortcut only works through a symlink and is no longer supported
# by spaCy v3+. The parser and NER components are disabled because only
# lemmas are read from the resulting docs.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts):
    """Lemmatize every text and return the results as a list of strings.

    Args:
        texts: iterable of strings (the notebook passes a ``clean_tweet``
            column).

    Returns:
        list of str: one entry per input text, in input order, built from the
        spaCy lemma of each token joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # faster than calling nlp() once per text and yields the same docs.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]


In [9]:
# Replace the cleaned text of both frames with its lemmatized form.
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])

In [10]:
# Eyeball 10 randomly drawn rows to compare the raw and cleaned tweets.
# No random_state is passed, so a re-run shows different rows.
train.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
1434,1435,0,Mommy - Daughter day! Wicked! Broadway fun! #w...,mommy daughter day wicked broadway fun wicked ...
6269,6270,1,Why the fuck does it take so long for an Ipod ...,why the fuck do -PRON- take so long for an ipo...
6312,6313,0,#2014 #New #Year by #Sony #A7 #2870OSS at #종각 ...,new year by sony a oss at 종각 보신각 제야의종 타종행사 seo...
7255,7256,0,Cool girls for you! http://ow.ly/K7W8307ox4a #...,cool girl for -PRON- ff foodporn dj tokyo appl...
4311,4312,0,Had the urge to game so I purchased a PS3 lol ...,have the urge to game so i purchase a ps lol e...
2610,2611,0,An owner and his #dog #australianshepherd #bor...,an owner and -PRON- dog australianshepherd bor...
6242,6243,0,I finally upgraded my phone! From iPhone 5se t...,i finally upgrade -PRON- phone from iphone se ...
4757,4758,0,#Dell computers literally cause more stress th...,dell computer literally cause more stress then...
1704,1705,0,he saw a cute guy and got happy! #william #gay...,-PRON- see a cute guy and get happy william ga...
2188,2189,1,"Thank you Samsung, for the shottiest phone I'v...","thank -PRON- samsung , for the shotti phone -P..."


In [11]:
! pip install "tensorflow>=1.7.0"
! pip install tensorflow-hub

Collecting tensorflow>=1.7.0
  Downloading https://files.pythonhosted.org/packages/7b/14/e4538c2bc3ae9f4ce6f6ce7ef1180da05abc4a617afba798268232b01d0d/tensorflow-1.13.1-cp37-cp37m-win_amd64.whl (63.1MB)
Collecting tensorboard<1.14.0,>=1.13.0 (from tensorflow>=1.7.0)
  Downloading https://files.pythonhosted.org/packages/0f/39/bdd75b08a6fba41f098b6cb091b9e8c7a80e1b4d679a581a0ccd17b10373/tensorboard-1.13.1-py3-none-any.whl (3.2MB)
Collecting gast>=0.2.0 (from tensorflow>=1.7.0)
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting absl-py>=0.1.6 (from tensorflow>=1.7.0)
  Downloading https://files.pythonhosted.org/packages/da/3f/9b0355080b81b15ba6a9ffcf1f5ea39e307a2778b2f2dc8694724e8abd5b/absl-py-0.7.1.tar.gz (99kB)
Collecting grpcio>=1.8.6 (from tensorflow>=1.7.0)
  Downloading https://files.pythonhosted.org/packages/2a/22/bd327063dd0bdf9d8d640b3185b760707842160e69df909db3fcaab5b758/grpcio-1.20

In [12]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo v2 module from TensorFlow Hub.
elmo_module_url = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(elmo_module_url, trainable=True)

W0512 23:37:17.349824  5816 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
W0512 23:41:45.071908  5816 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [13]:
# Single-sentence smoke test of the ELMo module.
x = ["Roasted ants are a popular snack in Columbia"]

# Run the module's "default" signature and keep the per-token "elmo" output.
elmo_outputs = elmo(x, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]

# Static shape of the tensor: (sentences, tokens, features).
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [14]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per text in ``x``.

    Args:
        x: object exposing ``tolist()`` that yields the texts to embed (the
            notebook passes a slice of the ``clean_tweet`` column).

    Returns:
        The result of evaluating the mean of the module's "elmo" output over
        axis 1 (the token axis), i.e. one row per input text.

    NOTE(review): the mean is taken over the whole of axis 1; if shorter
    texts in a batch are padded, the padded positions are presumably included
    in the average -- confirm against the module documentation.
    """
    sentences = x.tolist()
    token_vectors = elmo(sentences, signature="default", as_dict=True)["elmo"]
    # Average the token vectors of each sentence.
    sentence_vectors = tf.reduce_mean(token_vectors, 1)

    # A fresh session is opened on every call, so variables and lookup
    # tables are initialised each time before the tensor is evaluated.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(sentence_vectors)

In [15]:
# Cut both frames into consecutive slices of at most 100 rows each.
train_batch_starts = range(0, train.shape[0], 100)
test_batch_starts = range(0, test.shape[0], 100)

list_train = [train[start:start + 100] for start in train_batch_starts]
list_test = [test[start:start + 100] for start in test_batch_starts]

In [16]:
# Compute ELMo sentence vectors batch by batch for both data sets.
elmo_train = []
for batch in list_train:
    elmo_train.append(elmo_vectors(batch['clean_tweet']))

elmo_test = []
for batch in list_test:
    elmo_test.append(elmo_vectors(batch['clean_tweet']))

In [17]:
# Stack the per-batch arrays row-wise (np.concatenate joins along axis 0 by
# default) into one array per data set.
elmo_train_new = np.concatenate(elmo_train)
elmo_test_new = np.concatenate(elmo_test)

In [18]:
# Persist the embeddings so the extraction step does not have to be repeated.
# `with` guarantees each file is closed even if pickle.dump raises.

# save elmo_train_new
with open("train_elmo.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("test_elmo.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [19]:
# Reload the cached embeddings. `with` closes each file after reading; the
# original left both handles open.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files that
# were written by this notebook or come from a trusted source.

# load elmo_train_new
with open("train_elmo.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("test_elmo.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split the same on every run.
features, labels = elmo_train_new, train['label']
xtrain, xvalid, ytrain, yvalid = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Logistic-regression classifier with scikit-learn's default settings.
lreg = LogisticRegression()
# Fit on the training split; fit() is the cell's last expression, so the
# notebook displays the fitted estimator's parameters.
lreg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
# Predict labels for the held-out validation features.
preds_valid = lreg.predict(xvalid)

In [23]:
# F1 score of the validation predictions against the true validation labels.
f1_score(yvalid, preds_valid)

0.7752675386444708

In [24]:
# Predict labels for the test set from its ELMo features, using the
# classifier fitted on the training split.
preds_test = lreg.predict(elmo_test_new)

In [25]:
# Pair every test id with its predicted label.
submission_columns = {'id': test['id'], 'label': preds_test}
sub = pd.DataFrame(submission_columns)

# Save the predictions as CSV, leaving out the DataFrame index column.
sub.to_csv("submission.csv", index=False)

In [7]:
# Download spaCy's English model. Per the installer output below, this
# installs the en_core_web_sm package and links it under the shortcut 'en'.
# NOTE(review): this cell carries execution count 7, i.e. it ran before the
# spacy.load cell (In [8]); it must be executed before that cell on a fresh
# environment.
!python -m spacy download en

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
symbolic link created for C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en <<===>> C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm
[+] Linking successful
C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm -->
C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')
