Re-train word2vec models on various unlabeled datasets, then use the averaged word vectors from each new model as input features for classifying the labeled data.

In [1]:
import cython
import pandas as pd
import numpy as np
import re
import logging
from bs4 import BeautifulSoup

import nltk.data
from nltk.corpus import stopwords

from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

In [2]:
# Pick ONE unlabeled corpus for word2vec training by leaving exactly one
# `unlabeled = ...` assignment active; the alternatives stay commented out.
unlabeled = pd.read_csv("toxic_train.csv", encoding="latin-1")                    # Jigsaw (Kaggle)
#unlabeled = pd.read_csv("unlabeled_data.csv")

# NOTE(review): the two files below are .tsv; read_csv defaults to sep="," so
# they would presumably need sep="\t" before being re-enabled -- confirm.
#unlabeled1 = pd.read_csv("aggression_annotated_comments.tsv")                     # Wikipedia
#unlabeled2 = pd.read_csv("toxicity_annotated_comments.tsv")                       # Wikipedia
#unlabeled = unlabeled1.append(unlabeled2)

#trainGB = pd.read_csv("GBcomments.csv", encoding="latin-1")                       # YouTube GB (UK) comments
#trainUS = pd.read_csv("UScomments.csv", encoding="latin-1")                       # YouTube US comments
#unlabeled = trainGB.append(trainUS)

# English stop words as a set, so the membership test in tweet_to_wordlist is O(1).
stops = set(stopwords.words("english"))
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')                     # Load the punkt sentence tokenizer


def tweet_to_wordlist(tweet, remove_stopwords=False):
    """Turn one raw text document into a list of lowercase alphabetic tokens.

    Args:
        tweet: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, tokens found in the module-level
            ``stops`` set are dropped.

    Returns:
        list of str: the tokens, in order of appearance.
    """
    # Name the parser explicitly. Without it Beautiful Soup picks whichever
    # parser happens to be installed, so the extracted text could differ between
    # machines. Inputs that merely look like a URL or filename still trigger
    # bs4's locator warning; that is harmless here because the text is parsed
    # as markup, never fetched.
    tweet_text = BeautifulSoup(tweet, "html.parser").get_text()                   # Remove HTML
    tweet_text = re.sub("[^a-zA-Z]", " ", tweet_text)                             # Remove non-letters
    words = tweet_text.lower().split()                                            # Lower-case and split on whitespace
    if remove_stopwords:                                                          # Optionally remove stop words
        words = [w for w in words if w not in stops]
    return words

def tweet_to_sentences(tweet, tokenizer, remove_stopwords=False):
    """Split a document into sentences and tokenize each sentence.

    Args:
        tweet: Raw text of one document.
        tokenizer: Object exposing ``tokenize(str)`` that returns the
            document's sentences as strings.
        remove_stopwords: Forwarded to ``tweet_to_wordlist``.

    Returns:
        list of list of str: one word list per non-empty sentence.
    """
    pieces = tokenizer.tokenize(tweet.strip())
    # Empty sentence strings are skipped; everything else goes through
    # tweet_to_wordlist unchanged.
    return [
        tweet_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]

In [3]:
# Training corpus for word2vec: a flat list of sentences, each a list of words.
sentences = []

print("Parsing sentences from training set")
for comment in unlabeled["comment_text"]:
    # Each comment contributes zero or more tokenized sentences.
    sentences.extend(tweet_to_sentences(comment, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
http://en.wikipedia.org/wiki/Category:Cannes_Film_Festival
http://en.wikipedia.org/wiki/Category:Venice_Film_Festival
etc."" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

....yeah." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to 


____________________________________________________________________" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

NOTICE!" looks like a URL. Beautiful Soup is not an HTTP client. You should proba

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
http://www.radiokhushi.com/music/telugu_songs.cgi?lang=t&movie;=apuroopam&rtype;=actress&name;=Priyanka%20Chopra" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
h?" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that docu

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Eight?" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

Nine?" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that docume

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup

http://www.britneyspears.org/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Thankyou." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
 

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Sou

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
75.102.128.133" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Thanks!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_ma

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

R.Searle" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Woah!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beaut

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
http://de.youtube.com/watch?v=fgVEtaUx070
http://de.youtube.com/watch?v=Ijh2Fqd1ZPY"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that doc

  ' that document to Beautiful Soup.' % decoded_markup

3." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Wow!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that 

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Hi!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.'


http://www.ibtimes.com/high-profile-indian-journalist-charged-rape-tarun-tejpal-victim-conspiracy-1555993" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Thanks,

mike" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

Well,.

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
https://en.wikipedia.org/wiki/Self-disclosure
https://en.wikipedia.org/wiki/Subject-expectancy_effect" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
http://www.lifesitenews.com/ldn/2009/jan/09012207.html" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_mar

  ' that document to Beautiful Soup.' % decoded_markup

68.183.100.3" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
http://elandslide.org/display.cfm?id=181" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

etc." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the do

  ' that document to Beautiful Soup.' % decoded_markup
http://www.truthout.org/docs_01/01.05B.Klausutis.1.htm
http://www.americanpolitics.com/20030721Baker.html" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
77.229.126.143" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

173.26.162.40" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed tha

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

2." looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


In [4]:
# Route gensim's INFO-level progress messages to the notebook output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# word2vec hyper-parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Ignore words seen fewer times than this
num_workers = 4       # Number of training threads
context = 10          # Context window size
downsampling = 1e-3   # Downsampling threshold for frequent words

# Build the vocabulary and train in one call (this will take some time).
# NOTE(review): `size=` and `init_sims` are the pre-4.0 gensim spellings --
# confirm the installed gensim version before upgrading.
print("Training model...")
model = Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# No further training is planned, so normalise the vectors in place;
# replace=True drops the raw vectors to save memory.
model.init_sims(replace=True)

# Persist under a name that encodes the settings; reload with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)
#model.word2vec_model.wv.save_word2vec_format(model_name)

2019-04-05 13:09:43,288 : INFO : 'pattern' package not found; tag filters are not available for English
2019-04-05 13:09:43,304 : INFO : collecting all words and their counts
2019-04-05 13:09:43,304 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-05 13:09:43,382 : INFO : PROGRESS: at sentence #10000, processed 156306 words, keeping 14649 word types
2019-04-05 13:09:43,460 : INFO : PROGRESS: at sentence #20000, processed 315608 words, keeping 21801 word types


Training model...


2019-04-05 13:09:43,538 : INFO : PROGRESS: at sentence #30000, processed 463024 words, keeping 27239 word types
2019-04-05 13:09:43,616 : INFO : PROGRESS: at sentence #40000, processed 616031 words, keeping 32105 word types
2019-04-05 13:09:43,694 : INFO : PROGRESS: at sentence #50000, processed 769480 words, keeping 36426 word types
2019-04-05 13:09:43,772 : INFO : PROGRESS: at sentence #60000, processed 925504 words, keeping 40436 word types
2019-04-05 13:09:43,866 : INFO : PROGRESS: at sentence #70000, processed 1086615 words, keeping 44484 word types
2019-04-05 13:09:43,944 : INFO : PROGRESS: at sentence #80000, processed 1235894 words, keeping 47871 word types
2019-04-05 13:09:44,037 : INFO : PROGRESS: at sentence #90000, processed 1385716 words, keeping 51166 word types
2019-04-05 13:09:44,146 : INFO : PROGRESS: at sentence #100000, processed 1531970 words, keeping 54109 word types
2019-04-05 13:09:44,224 : INFO : PROGRESS: at sentence #110000, processed 1686312 words, keeping 57

2019-04-05 13:09:50,215 : INFO : deleting the raw counts dictionary of 168807 items
2019-04-05 13:09:50,215 : INFO : sample=0.001 downsamples 54 most-common words
2019-04-05 13:09:50,230 : INFO : downsampling leaves estimated 7613068 word corpus (74.0% of prior 10282083)
2019-04-05 13:09:50,308 : INFO : estimated required memory for 10838 words and 300 dimensions: 31430200 bytes
2019-04-05 13:09:50,308 : INFO : resetting layer weights
2019-04-05 13:09:50,636 : INFO : training model with 4 workers on 10838 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2019-04-05 13:09:51,697 : INFO : EPOCH 1 - PROGRESS: at 3.62% examples, 264482 words/s, in_qsize 6, out_qsize 2
2019-04-05 13:09:52,726 : INFO : EPOCH 1 - PROGRESS: at 7.83% examples, 290240 words/s, in_qsize 7, out_qsize 0
2019-04-05 13:09:53,725 : INFO : EPOCH 1 - PROGRESS: at 12.59% examples, 311163 words/s, in_qsize 7, out_qsize 0
2019-04-05 13:09:54,754 : INFO : EPOCH 1 - PROGRESS: at 17.26% examples, 

2019-04-05 13:10:54,206 : INFO : EPOCH 3 - PROGRESS: at 71.53% examples, 312216 words/s, in_qsize 7, out_qsize 0
2019-04-05 13:10:55,220 : INFO : EPOCH 3 - PROGRESS: at 74.85% examples, 308704 words/s, in_qsize 8, out_qsize 0
2019-04-05 13:10:56,265 : INFO : EPOCH 3 - PROGRESS: at 78.97% examples, 308197 words/s, in_qsize 6, out_qsize 2
2019-04-05 13:10:57,295 : INFO : EPOCH 3 - PROGRESS: at 84.08% examples, 311863 words/s, in_qsize 6, out_qsize 0
2019-04-05 13:10:58,293 : INFO : EPOCH 3 - PROGRESS: at 89.02% examples, 315188 words/s, in_qsize 6, out_qsize 0
2019-04-05 13:10:59,307 : INFO : EPOCH 3 - PROGRESS: at 93.32% examples, 315662 words/s, in_qsize 8, out_qsize 2
2019-04-05 13:11:00,337 : INFO : EPOCH 3 - PROGRESS: at 97.30% examples, 314822 words/s, in_qsize 5, out_qsize 5
2019-04-05 13:11:00,867 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-04-05 13:11:00,883 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-04-05 13:11:00,883 : I

In [7]:
# Reload the model trained above. The file was written by Word2Vec.save(), and
# the feature-extraction code reads `model.wv`, so load it as a full Word2Vec
# model rather than through KeyedVectors.load (which only worked because the
# loader unpickles whatever object is stored in the file).
model = Word2Vec.load("300features_40minwords_10context")

2019-04-05 13:11:47,043 : INFO : loading Word2VecKeyedVectors object from 300features_40minwords_10context
2019-04-05 13:11:47,481 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2019-04-05 13:11:47,481 : INFO : setting ignored attribute vectors_norm to None
2019-04-05 13:11:47,481 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2019-04-05 13:11:47,481 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2019-04-05 13:11:47,497 : INFO : setting ignored attribute cum_table to None
2019-04-05 13:11:47,497 : INFO : loaded 300features_40minwords_10context


In [8]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens for a single document.
        model: Either a trained Word2Vec model (vectors are read from its
            ``wv`` attribute) or any object that maps a word to its vector
            via ``obj[word]``.
        vocabulary: Container of words that have a vector; tokens outside it
            are ignored.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: float64 vector of shape ``(num_features,)``. It is all
        zeros when none of the words is in ``vocabulary``.
    """
    # Indexing a full Word2Vec model directly is deprecated in gensim; the
    # vectors live on `model.wv`. Fall back to the object itself so plain
    # keyed-vector objects (or dicts) keep working.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector += vectors[word]
    if nwords:
        # Guarded so an all-unknown document yields zeros instead of NaNs.
        feature_vector /= nwords
    return feature_vector
   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Args:
        corpus: Iterable of documents, each an iterable of tokens.
        model: Trained model exposing ``wv.index2word`` (the known words).
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray: the per-document vectors stacked with ``np.array``,
        one row per document.
    """
    # Build the vocabulary set once; membership is tested for every token.
    known_words = set(model.wv.index2word)
    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)

In [10]:
# Labeled data set to classify; the text is in the "tweet" column.
data = pd.read_csv("labeled_data.csv", encoding="latin-1")

# One token list per tweet, with English stop words removed.
clean_tweets = [
    tweet_to_wordlist(tweet, remove_stopwords=True)
    for tweet in data["tweet"]
]

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [11]:
# Represent every cleaned tweet by the average of its word vectors.
encodings_data = averaged_word_vectorizer(clean_tweets, model, num_features)

# Feature matrix and integer class labels, then a fixed-seed 67/33 hold-out split.
X = pd.DataFrame(encodings_data)
y = data['class'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=69
)

  if __name__ == '__main__':


Logistic Regression

In [19]:
# L1-penalised logistic regression selects features; a class-balanced
# L2 logistic regression does the classification.
# solver='liblinear' is spelled out because the L1 penalty needs a solver that
# supports it; relying on the library default breaks on scikit-learn releases
# whose default solver is lbfgs.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01,
                                                  solver='liblinear'))),
    ('model', LogisticRegression(class_weight='balanced', penalty='l2')),
])
# A single empty candidate: the "search" is just a 5-fold CV of the pipeline as configured.
param_grid = [{}]
# Pass the splitter itself (not a one-shot generator from .split()) so the
# search can be re-fitted; shuffle=True is what makes random_state take effect.
grid_search = GridSearchCV(pipe, param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
                           verbose=2)
# NOTE: this rebinds `model`, which earlier cells used for the word2vec model.
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=   9.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s remaining:    0.0s


[CV] ................................................. , total=   9.6s
[CV]  ................................................................
[CV] ................................................. , total=   9.2s
[CV]  ................................................................
[CV] ................................................. , total=   8.9s
[CV]  ................................................................
[CV] ................................................. , total=   9.3s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   47.7s finished


In [20]:
# Score the held-out split and print per-class precision / recall / F1.
y_preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

             precision    recall  f1-score   support

          0       0.64      0.60      0.62      8186
          1       0.75      0.70      0.72     14805
          2       0.74      0.81      0.77     16957

avg / total       0.72      0.72      0.72     39948



Random Forest

In [22]:
# L1-penalised logistic regression selects features; a 300-tree random
# forest does the classification.
# solver='liblinear' is spelled out because the L1 penalty needs a solver that
# supports it; relying on the library default breaks on scikit-learn releases
# whose default solver is lbfgs.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01,
                                                  solver='liblinear'))),
    ('model', RandomForestClassifier(n_estimators=300, random_state=0)),
])
# A single empty candidate: the "search" is just a 5-fold CV of the pipeline as configured.
param_grid = [{}]
# Pass the splitter itself (not a one-shot generator from .split()) so the
# search can be re-fitted; shuffle=True is what makes random_state take effect.
grid_search = GridSearchCV(pipe, param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
                           verbose=2)
# NOTE: this rebinds `model`, which earlier cells used for the word2vec model.
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total= 3.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.7min remaining:    0.0s


[CV] ................................................. , total= 4.0min
[CV]  ................................................................
[CV] ................................................. , total= 4.0min
[CV]  ................................................................
[CV] ................................................. , total= 3.9min
[CV]  ................................................................
[CV] ................................................. , total= 4.1min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 20.4min finished


In [23]:
# Score the held-out split and print per-class precision / recall / F1.
y_preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

             precision    recall  f1-score   support

          0       0.85      0.61      0.71      8186
          1       0.84      0.81      0.83     14805
          2       0.78      0.91      0.84     16957

avg / total       0.82      0.81      0.81     39948



Linear SVC

In [25]:
# L1-penalised logistic regression selects features; a linear SVM
# (C=0.05) does the classification.
# solver='liblinear' is spelled out because the L1 penalty needs a solver that
# supports it; relying on the library default breaks on scikit-learn releases
# whose default solver is lbfgs.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01,
                                                  solver='liblinear'))),
    ('model', LinearSVC(C=0.05, random_state=0)),
])
# A single empty candidate: the "search" is just a 5-fold CV of the pipeline as configured.
param_grid = [{}]
# Pass the splitter itself (not a one-shot generator from .split()) so the
# search can be re-fitted; shuffle=True is what makes random_state take effect.
grid_search = GridSearchCV(pipe, param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
                           verbose=2)
# NOTE: this rebinds `model`, which earlier cells used for the word2vec model.
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=   7.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s


[CV] ................................................. , total=   6.7s
[CV]  ................................................................
[CV] ................................................. , total=   6.6s
[CV]  ................................................................
[CV] ................................................. , total=   6.5s
[CV]  ................................................................
[CV] ................................................. , total=   6.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   34.0s finished


In [26]:
# Score the held-out split and print per-class precision / recall / F1.
y_preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

             precision    recall  f1-score   support

          0       0.75      0.45      0.56      8186
          1       0.72      0.71      0.72     14805
          2       0.71      0.86      0.77     16957

avg / total       0.72      0.72      0.71     39948



Extra Trees

In [28]:
# L1-penalised logistic regression selects features; an extremely
# randomized tree does the classification.
# solver='liblinear' is spelled out because the L1 penalty needs a solver that
# supports it; relying on the library default breaks on scikit-learn releases
# whose default solver is lbfgs.
# NOTE(review): ExtraTreeClassifier is a single randomized tree. If the
# ensemble was intended under the "Extra Trees" heading, that would be
# sklearn.ensemble.ExtraTreesClassifier -- confirm.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01,
                                                  solver='liblinear'))),
    # random_state pinned so the randomized splits are reproducible, matching
    # the seeded estimators used in the sibling cells.
    ('model', ExtraTreeClassifier(random_state=0)),
])
# A single empty candidate: the "search" is just a 5-fold CV of the pipeline as configured.
param_grid = [{}]
# Pass the splitter itself (not a one-shot generator from .split()) so the
# search can be re-fitted; shuffle=True is what makes random_state take effect.
grid_search = GridSearchCV(pipe, param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
                           verbose=2)
# NOTE: this rebinds `model`, which earlier cells used for the word2vec model.
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=   5.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s remaining:    0.0s


[CV] ................................................. , total=   6.1s
[CV]  ................................................................
[CV] ................................................. , total=   5.5s
[CV]  ................................................................
[CV] ................................................. , total=   6.2s
[CV]  ................................................................
[CV] ................................................. , total=   5.5s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   29.9s finished


In [29]:
# Score the held-out split and print per-class precision / recall / F1.
y_preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

             precision    recall  f1-score   support

          0       0.60      0.61      0.61      8186
          1       0.73      0.77      0.75     14805
          2       0.76      0.72      0.74     16957

avg / total       0.72      0.72      0.72     39948



Naive Bayes

In [31]:
# L1-penalised logistic regression selects features; a Bernoulli naive
# Bayes model does the classification.
# solver='liblinear' is spelled out because the L1 penalty needs a solver that
# supports it; relying on the library default breaks on scikit-learn releases
# whose default solver is lbfgs.
# NOTE(review): BernoulliNB binarizes its inputs (threshold 0.0 by default),
# so only the sign of each averaged-vector component is used -- confirm this
# is intended for real-valued features.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty='l1', C=0.01,
                                                  solver='liblinear'))),
    ('model', BernoulliNB()),
])

# A single empty candidate: the "search" is just a 5-fold CV of the pipeline as configured.
param_grid = [{}]
# Pass the splitter itself (not a one-shot generator from .split()) so the
# search can be re-fitted; shuffle=True is what makes random_state take effect.
grid_search = GridSearchCV(pipe, param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=69),
                           verbose=2)
# NOTE: this rebinds `model`, which earlier cells used for the word2vec model.
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=   5.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV] ................................................. , total=   5.7s
[CV]  ................................................................
[CV] ................................................. , total=   5.7s
[CV]  ................................................................
[CV] ................................................. , total=   5.5s
[CV]  ................................................................
[CV] ................................................. , total=   5.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   29.9s finished


In [32]:
# Score the held-out split and print per-class precision / recall / F1.
y_preds = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

             precision    recall  f1-score   support

          0       0.56      0.45      0.50      8186
          1       0.66      0.68      0.67     14805
          2       0.72      0.77      0.75     16957

avg / total       0.67      0.67      0.67     39948

