# HaHackathon

In [11]:
from __future__ import print_function, division
from builtins import range

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import pickle
import os
import gc
import sys
import scipy

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score, roc_curve, auc, f1_score

In [12]:
# Read the training set (relative path: expected next to the notebook) and
# preview the first rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1


In [13]:
# Input is the raw text column; target is the is_humor column.
X, y = data['text'], data['is_humor']

# Seeded split with scikit-learn's default test fraction.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## GloVe Embedding

In [14]:
# Course URL:
# https://deeplearningcourses.com/c/natural-language-processing-with-deep-learning-in-python
# https://udemy.com/natural-language-processing-with-deep-learning-in-python

# Path of the pre-trained GloVe text file opened by GloveVectorizer.__init__.
# It is a relative path, so the file must be in the notebook's working directory.
glove_vectors = "glove.6B.300d.txt"

class GloveVectorizer:
  """Sentence embedder that averages pre-trained GloVe word vectors.

  Each sentence is lower-cased, split on whitespace, and represented by the
  mean of the vectors of the tokens found in the vocabulary. Sentences with
  no known token are left as an all-zero row.

  Attributes set by __init__:
    word2vec:  dict mapping word -> float32 vector
    embedding: 2-D array with one row per line read from the file
    word2idx:  dict mapping word -> row index in ``embedding``
    V, D:      number of rows and vector dimensionality of ``embedding``
  """

  def __init__(self, path=None):
    """Load word vectors from a GloVe-format text file.

    Args:
      path: file to read. When None (the default) the module-level
        ``glove_vectors`` path is used, which is the previous behaviour.

    Raises:
      ValueError: if the file contains no word vectors.
    """
    if path is None:
      # fall back to the notebook-level default location
      path = glove_vectors

    # load in pre-trained word vectors
    print('Loading word vectors...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open(path, encoding="utf8") as f:
      # is just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        if len(values) < 2:
          # blank (or vector-less) line, e.g. a trailing newline: nothing to
          # load, and indexing values[0] would otherwise raise IndexError
          continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    if not embedding:
      # without this the shape unpacking below fails with an opaque message
      raise ValueError('No word vectors found in %s' % path)

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    """No-op: the vectors are pre-trained. Returns self so calls can be chained."""
    return self

  def transform(self, data):
    """Embed every sentence in ``data`` as the mean of its known word vectors.

    Args:
      data: sized iterable of strings (e.g. a list or a pandas Series).

    Returns:
      Array of shape (len(data), self.D); rows for sentences without any
      known token stay zero. The number of such rows is printed.
    """
    X = np.zeros((len(data), self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      tokens = sentence.lower().split()
      vecs = [self.word2vec[word] for word in tokens if word in self.word2vec]
      if len(vecs) > 0:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    """Call fit, then return transform(data)."""
    self.fit(data)
    return self.transform(data)




class Word2VecVectorizer:
  """Sentence embedder that averages pre-trained word2vec vectors.

  Sentences are split on whitespace (case is kept as-is) and represented by
  the mean of the vectors of the tokens the model knows. Sentences with no
  known token are left as an all-zero row.
  """

  def __init__(self, path='../large_files/GoogleNews-vectors-negative300.bin'):
    """Load a binary word2vec model through gensim.

    Args:
      path: location of the binary word2vec file; defaults to the path that
        was previously hard-coded.
    """
    # KeyedVectors is not imported anywhere else in this notebook, so import it
    # here; gensim is then only needed when this vectorizer is actually used.
    from gensim.models import KeyedVectors

    print("Loading in word vectors...")
    self.word_vectors = KeyedVectors.load_word2vec_format(
      path,
      binary=True
    )
    print("Finished loading in word vectors")

  def fit(self, data):
    """No-op: the vectors are pre-trained. Returns self so calls can be chained."""
    return self

  def transform(self, data):
    """Embed every sentence in ``data`` as the mean of its known word vectors.

    Args:
      data: sized iterable of strings (e.g. a list or a pandas Series).

    Returns:
      Array of shape (len(data), self.D); rows for sentences without any
      known token stay zero. The number of such rows is printed.
    """
    # determine the dimensionality of vectors from the model itself rather
    # than by looking up a probe word that may be missing from the vocabulary
    self.D = self.word_vectors.vector_size

    X = np.zeros((len(data), self.D))
    emptycount = 0
    for n, sentence in enumerate(data):
      vecs = []
      for word in sentence.split():
        try:
          # throws KeyError if word not found
          vecs.append(self.word_vectors.get_vector(word))
        except KeyError:
          # out-of-vocabulary token: deliberately ignored
          pass
      if len(vecs) > 0:
        X[n] = np.array(vecs).mean(axis=0)
      else:
        emptycount += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    """Call fit, then return transform(data)."""
    self.fit(data)
    return self.transform(data)



# Build the sentence embedder once and reuse it for every split.
vectorizer = GloveVectorizer()
# vectorizer = Word2VecVectorizer()

# training split: embedded texts plus their labels
Xtrain, Ytrain = vectorizer.fit_transform(X_train), y_train

# held-out split: transform only
Xtest, Ytest = vectorizer.transform(X_test), y_test

Loading word vectors...
Found 400000 word vectors.
Numer of samples with no words found: 2 / 6000
Numer of samples with no words found: 0 / 2000


## Trees

In [15]:
# Extra-Trees on the averaged word-vector features: train, then report accuracy.
# random_state is pinned (same seed as the train/test split) so the scores are
# reproducible after Restart & Run All; without it every run builds a different
# forest and the reported numbers drift.
etc_model = ExtraTreesClassifier(n_estimators=200, random_state=42)
etc_model.fit(Xtrain, Ytrain)
print("train score:", etc_model.score(Xtrain, Ytrain))
print("test score:", etc_model.score(Xtest, Ytest))

train score: 1.0
test score: 0.8155


In [16]:
# Random forest on the averaged word-vector features: train, then report accuracy.
# random_state is pinned (same seed as the train/test split) so the scores are
# reproducible after Restart & Run All; without it every run builds a different
# forest and the reported numbers drift.
rfc_model = RandomForestClassifier(n_estimators=200, random_state=42)
rfc_model.fit(Xtrain, Ytrain)
print("train score:", rfc_model.score(Xtrain, Ytrain))
print("test score:", rfc_model.score(Xtest, Ytest))

train score: 1.0
test score: 0.804


## Boosting

In [19]:
# XGBoost
# Binary-logistic booster with AUC as the evaluation metric; fit on the
# training features and report the score on both splits.
xgb_model = XGBClassifier(objective="binary:logistic", eval_metric="auc")
xgb_model.fit(Xtrain, Ytrain)

xgb_train_score = xgb_model.score(Xtrain, Ytrain)
print("train score:", xgb_train_score)
xgb_test_score = xgb_model.score(Xtest, Ytest)
print("test score:", xgb_test_score)

train score: 1.0
test score: 0.817


In [22]:
# CatBoost
# Options left over from earlier experiments are kept as comments:
#   cat_features = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23]
params = dict(
    loss_function='Logloss',
    eval_metric='AUC:hints=skip_train~false',
    # cat_features=cat_features,
    # task_type="GPU",
    verbose=0,
    # border_count=254,
    random_seed=42,
)
cb_model = CatBoostClassifier(**params)
cb_model.fit(Xtrain, Ytrain)

# report the score on both splits
cb_train_score = cb_model.score(Xtrain, Ytrain)
print("train score:", cb_train_score)
cb_test_score = cb_model.score(Xtest, Ytest)
print("test score:", cb_test_score)

train score: 0.9868333333333333
test score: 0.8335


## Test Model

In [24]:
# Hold-out metrics for the model selected on the next line.
# NOTE(review): this evaluates xgb_model, whereas the submission cell further
# down predicts with cb_model — confirm which model is meant to be reported.
model = xgb_model

# column 1 of predict_proba is used as the score for the AUC
holdout_auc = roc_auc_score(Ytest, model.predict_proba(Xtest)[:, 1])
holdout_accuracy = model.score(Xtest, Ytest)
holdout_f1 = f1_score(Ytest, model.predict(Xtest))

print(f"AUC: {holdout_auc}")
print(f"Accuracy: {holdout_accuracy}")
print(f"F1 score: {holdout_f1}")

AUC: 0.8947373986344666
Accuracy: 0.817
F1 score: 0.8568075117370891


## Submission

In [29]:
# Load the public test set and embed its texts with the vectorizer that was
# used for the training data. (A stray mid-cell `public_X.head()` was dropped:
# its value was discarded, so it neither displayed nor changed anything.)
public_data = pd.read_csv("public_test.csv")
public_X = public_data['text']
public_Xtest = vectorizer.transform(public_X)

Numer of samples with no words found: 0 / 1000


In [37]:
# Take column 1 of CatBoost's predict_proba for every public-test row and
# round each value to an integer label.
is_humor_pred = cb_model.predict_proba(public_Xtest)[:, 1]
test_predictions = list(map(round, is_humor_pred))

# Pair each prediction with the id from the public test file.
submission = pd.DataFrame(
    {
        "id": public_data["id"],
        "is_humor": test_predictions,
    }
)
print(submission.head())

     id  is_humor
0  9001         1
1  9002         0
2  9003         1
3  9004         1
4  9005         1


In [38]:
# Write the submission to the working directory, leaving out the DataFrame index.
filename = "jin_is_humor_GloVe_CatBoost.csv"
submission.to_csv(path_or_buf=filename, index=False)