In [2]:
from collections import defaultdict
import pyspark
from pyspark.sql import SparkSession
import pyspark.pandas as ps
from pyspark.sql.functions import split, col
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import matplotlib.pyplot as plt
from pyspark.ml.feature import Word2Vec

# Create (or reuse) the Spark session for this notebook, running locally on
# two worker threads under the application name "hw05".
# NOTE: `sc` holds a SparkSession, not a SparkContext; the cells below call
# `sc.read` on it.
sc = SparkSession.builder.master("local[2]").appName("hw05").getOrCreate()


In [13]:
# Label columns of the data set; one binary classifier is trained per entry.
targets = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]


def load_csv(path, quoted_text=False):
    """Read a CSV file that has a header row into a Spark DataFrame.

    No schema is passed to the reader, which is why the label columns are
    cast separately with ``cast_label_columns``.

    Args:
        path: Location of the CSV file, handed to ``DataFrameReader.load``.
        quoted_text: When True, additionally set the ``quote``, ``escape``
            and ``multiline`` reader options, which the files holding the
            free-text ``comment_text`` column are read with.

    Returns:
        The DataFrame produced by the reader.
    """
    reader = sc.read.format("csv").option("header", True)
    if quoted_text:
        reader = (
            reader
            .option("quote", "\"")
            .option("escape", "\"")
            .option("multiline", True)
        )
    return reader.load(path)


def cast_label_columns(df):
    """Return ``df`` with every column named in ``targets`` cast to integer."""
    for label in targets:
        df = df.withColumn(label, col(label).cast("integer"))
    return df


dtrain = cast_label_columns(
    load_csv("file:///home/jovyan/work/train.csv", quoted_text=True)
)

# test.csv is read without any label casts; the labels come from
# test_labels.csv and are attached to the comments by `id`.
dtest = load_csv("file:///home/jovyan/work/test.csv", quoted_text=True)
dlabels = cast_label_columns(
    load_csv("file:///home/jovyan/work/test_labels.csv")
)
dtest = dtest.join(dlabels, "id")

# Split every comment into a list of words; both frames gain `comment_words`.
tokenizer = Tokenizer(inputCol="comment_text", outputCol="comment_words")
dtrain = tokenizer.transform(dtrain)
dtest = tokenizer.transform(dtest)

## Подготовить фичи комментариев с помощью w2v


In [None]:

# Word2Vec estimator over the tokenized comments: 5-dimensional vectors and a
# fixed seed so repeated runs are comparable.  The column it produces is
# named "model"; the classifier cell reads its features from that column.
word2Vec = Word2Vec(
    inputCol="comment_words",
    outputCol="model",
    vectorSize=5,
    seed=42,
)

# Fit on the training comments only.
model = word2Vec.fit(dtrain)


In [16]:
# Add the Word2Vec output column ("model") to the tokenized training frame.
dtrain2 = model.transform(dtrain)

In [14]:
# Apply the Word2Vec model fitted on the training data to the test frame.
dtest2 = model.transform(dtest)

In [15]:
# Display the featurized test frame's repr to check its columns and types.
dtest2

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, comment_words: array<string>, model: vector]

## Обучить линейный классификатор 

In [23]:
# Evaluator score on the test set for each target, keyed by label column name.
metrics = {}

# Each fit only needs the feature vector and the label columns.  Spark
# DataFrames are evaluated lazily, so without caching every one of the six
# fits would re-run the whole upstream lineage of `dtrain2` (CSV parsing,
# tokenization, Word2Vec transform).  Cache that narrow projection once and
# release it when the loop is done.
train_features = dtrain2.select("model", *targets).cache()

for target in targets:
    # One independent binary logistic regression per label column.
    lr = LogisticRegression(featuresCol="model", labelCol=target)
    lrn = lr.fit(train_features)

    # Rows whose label equals -1 are dropped before scoring.
    # NOTE(review): presumably -1 marks comments without a ground-truth
    # label -- confirm against the data set description.
    dtest3 = lrn.transform(dtest2.filter(col(target) != -1))

    # areaUnderROC is the evaluator's default metric; it is named explicitly
    # so the value stored in `metrics` is unambiguous.
    bce = BinaryClassificationEvaluator(labelCol=target, metricName="areaUnderROC")
    auroc = bce.evaluate(dtest3)
    metrics[target] = auroc

# The cached projection is only used by this cell.
train_features.unpersist()

In [24]:
# Display the evaluator score recorded for each target.
metrics

{'toxic': 0.8658867165268348,
 'severe_toxic': 0.9444494395152192,
 'obscene': 0.8882706371959597,
 'threat': 0.8663699530510836,
 'insult': 0.883996424875139,
 'identity_hate': 0.8987469128917804}

# Выводы
Признаки w2v дают более высокую метрику качества ROC AUC по сравнению с TF-IDF из блока 1 по всем целевым переменным, кроме threat, где результаты совпадают (0.866). При этом w2v требует меньше времени для обучения.

|target       |tf-idf|w2v  |
|-------------|------|-----|
|toxic        |0.790 |0.866|
|severe_toxic |0.841 |0.944|
|obscene      |0.820 |0.888|
|threat       |0.866 |0.866|
|insult       |0.816 |0.884|
|identity_hate|0.776 |0.899|