# H2O model

In [1]:
#import packages
import numpy as np
import pandas as pd 
#import matplotlib as mpl
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OWord2vecEstimator, H2OGradientBoostingEstimator
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
# Start (or connect to) the H2O backend; every H2OFrame/estimator call below needs it.
h2o.init()

In [1]:
# Load the competition files.
# NOTE(review): despite its name, `job_titles` holds the CommonLit training data
# (train.csv); the name looks like a leftover from another example.
job_titles = h2o.import_file('../input/commonlitreadabilityprize/train.csv')
# H2OFrame version of the test file.
test = h2o.import_file('../input/commonlitreadabilityprize/test.csv')
# The submission template is kept in pandas because it is written back out with to_csv.
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

In [1]:
# Show (rows, columns) of the training and test frames, one per line.
print(job_titles.shape, test.shape, sep="\n")

In [1]:
# Preview the first rows of the training frame.
job_titles.head()

In [1]:
# Words dropped from the token stream before Word2Vec training.
# Each word is listed exactly once (the earlier list repeated "in", "a", "by",
# "not" and "from"); membership is unchanged, so filtering behaves the same.
STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can",
              "lines","re","what","there","all","we","one","the",
              "a","an","of","or","in","for","by","on","but","is",
              "not","with","as","was","if","they","are",
              "this","and","it","have","from","at","my","be",
              "that","to","com","org","like","likes",
              "so"]

In [1]:
def tokenize(sentences, stop_word = STOP_WORDS):
    """Turn a text column into a cleaned, lower-cased stream of tokens.

    Args:
        sentences: H2OFrame string column to split (callers in this notebook
            pass a single column such as ``frame["excerpt"]``).
        stop_word: words to remove from the token stream. Defaults to the
            notebook-level ``STOP_WORDS`` list.

    Returns:
        H2OFrame of tokens with short tokens, tokens containing digits and
        stop words removed. NA rows are deliberately kept by every filter
        (presumably they mark the boundary between input rows - confirm
        against the H2O ``tokenize`` documentation).
    """
    # Split on runs of non-word characters.
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Drop tokens shorter than two characters, but keep the NA rows.
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # Drop any token that contains a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # Honour the caller-supplied stop-word list (previously the global
    # STOP_WORDS was always used and this argument was silently ignored).
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word)),:]
    return tokenized_words

In [1]:
def predict(job_title,w2v, gbm):
    """Score a piece of text with the Word2Vec + GBM pipeline.

    Args:
        job_title: text to score, in a form accepted by ``h2o.H2OFrame``
            (this notebook passes a one-element list holding one excerpt).
        w2v: trained Word2Vec model used to embed the tokens.
        gbm: trained model whose ``predict`` is applied to the averaged vector.

    Returns:
        The frame returned by ``gbm.predict``; it is also printed.
    """
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    # Average the word vectors into one embedding per input text.
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    # Score once and reuse the result; the model used to be invoked twice
    # (once for printing, once for the return value).
    prediction = gbm.predict(test_data=job_title_vec)
    print(prediction)
    return prediction

In [1]:
# Tokenize the "excerpt" column of the training frame; this token stream feeds Word2Vec below.
words = tokenize(job_titles["excerpt"])

In [1]:
# Train a Word2Vec model on the training tokens (2 epochs).
# NOTE(review): sent_sample_rate=0.0 presumably switches off down-sampling of
# frequent words - confirm against the H2O Word2Vec documentation.
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 2)
w2v_model.train(training_frame=words)

In [1]:
# Sanity check of the embedding: ask for the 5 closest words to "teacher".
w2v_model.find_synonyms("teacher", count = 5)

In [1]:
# Calculate one vector per training excerpt by averaging its word vectors.
# (`words` was built from the "excerpt" column; the "job title" naming is a leftover.)
job_title_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")

In [1]:
# Prepare training & validation data.
# Rows whose averaged vector is missing (checked on column "C4") are dropped;
# per the original note these are texts made only of unknown words.
valid_job_titles = ~ job_title_vecs["C4"].isna()
# Put the original columns (including "target") next to the embedding columns.
data = job_titles[valid_job_titles,:].cbind(job_title_vecs[valid_job_titles,:])
# 80/20 train/validation split; the fixed seed makes the split (and therefore
# the reported metrics) reproducible across Restart & Run All.
data_split = data.split_frame(ratios=[0.8], seed=42)

In [1]:
# Build a basic GBM model with default hyper-parameters.
gbm_model = H2OGradientBoostingEstimator()
# Predictors are the embedding columns produced by the Word2Vec transform;
# the response is the "target" column carried over from the training file.
gbm_model.train(x = job_title_vecs.names,
                y="target",
                training_frame = data_split[0],
                validation_frame = data_split[1])

In [1]:
# Report metrics on the held-out validation frame. Calling model_performance()
# with no arguments returns the training metrics, which overstate how well the
# model generalizes.
perf = gbm_model.model_performance(valid=True)
perf

In [1]:
# Re-read the test file with pandas and add a placeholder "target" column (1.0)
# that the prediction loop overwrites.
# NOTE(review): this rebinds `test`, which earlier referred to an H2OFrame.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv').assign(target=1.0)
# NOTE(review): `test1` does not appear to be used in the visible cells.
test1 = np.zeros(7)

In [1]:
# Predict a score for every test excerpt.
# Iterate over the frame's own index instead of a hard-coded range(0, 7) so the
# cell still covers every row when the test file has a different length.
for i in test.index:
    a = predict([test.loc[i, "excerpt"]], w2v_model, gbm_model)
    # `a` is the frame returned by gbm.predict for a single text; flatten()
    # turns its 1x1 "predict" column into a plain scalar. Writing through .loc
    # avoids chained-indexing assignment (SettingWithCopyWarning).
    test.loc[i, "target"] = float(a["predict"].flatten())
# Show the last prediction frame as the cell output.
a

In [1]:
# Inspect the predicted targets written by the prediction loop.
test["target"]

In [1]:
# Copy the predictions into the submission template.
# NOTE(review): assumes sample_submission and test list the same ids in the
# same row order (the assignment matches rows by index) - TODO confirm.
sample_submission["target"]=test["target"]
sample_submission.head()

In [1]:
# Write the submission file to the working directory, without the pandas index column.
sample_submission.to_csv('submission.csv', index=False)