In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!wget http://setup.johnsnowlabs.com/kaggle.sh -O - | bash 

In [None]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp

In [None]:
# Create the Spark session through Spark NLP's start() helper; `spark` is the
# entry point used below to read the news CSV.
spark = sparknlp.start()

In [None]:
# Load the Lenta.ru news corpus. The file has a header row, the reader is told
# to allow records spanning several lines (multiLine), and '"' is used as the
# escape character.
lenta_csv_path = '/kaggle/input/corpus-of-russian-news-articles-from-lenta/lenta-ru-news.csv'
main_df = spark.read.csv(lenta_csv_path, header=True, multiLine=True, escape='"')
main_df.show()

In [None]:
# Number of rows in the raw dataset (displayed as the cell output).
main_df.count()

In [None]:
# Keep only rows that have a topic, reduce the frame to (text, topic) and
# normalise the article text:
#   1. replace every line-feed / carriage-return with a space,
#   2. remove the "Rambler Title " marker,
#   3. trim leading and trailing whitespace.
# NOTE(review): the pattern ".Rambler Title " is a regular expression, so the
# leading "." matches ANY single character in front of the marker and that
# character is removed as well - confirm this is intended.
filtered_df = (
    main_df
    .na.drop(subset=["topic"])
    .select("text", "topic")
    .withColumn("text", F.regexp_replace(F.col("text"), "[\n\r]", " "))
    .withColumn("text", F.regexp_replace(F.col("text"), ".Rambler Title ", ""))
    .withColumn("text", F.trim(F.col("text")))
)
filtered_df.show()

In [None]:
# Number of rows left after dropping rows without a topic.
filtered_df.count()

In [None]:
# Articles per topic, most frequent topic first.
count_df = (
    filtered_df
    .groupBy("topic")
    .count()
    .orderBy(F.desc("count"))
)
count_df.show()

In [None]:
# Number of distinct topics (one row per topic in count_df).
count_df.count()

In [None]:
# Take the first five topics of count_df (which is ordered by article count,
# descending) and pull their names to the driver as a plain Python list.
selected_rows = count_df.select("topic").limit(5).collect()
selected_topics = [r["topic"] for r in selected_rows]
selected_topics

In [None]:
# Restrict the dataset to articles whose topic is one of the selected topics.
in_selected_topics = F.col("topic").isin(selected_topics)
df = filtered_df.where(in_selected_topics)
df.show()

In [None]:
# Number of articles that belong to the selected topics.
df.count()

In [None]:
# Randomly split the data with weights 0.8 (train) / 0.2 (test); the seed is
# fixed so the split is requested with the same random seed on every run.
train_df, test_df = df.randomSplit([0.8, 0.2], seed = 1)

In [None]:
# Topic distribution of the training split, largest topic first.
(
    train_df
    .groupBy("topic")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

In [None]:
# Topic distribution of the test split, largest topic first.
(
    test_df
    .groupBy("topic")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

In [None]:
# Pipeline stage: reads the raw "text" column and writes it to "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

In [None]:
# Pipeline stage: reads "document" and writes detected sentences to "sentence".
sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

In [None]:
# Pipeline stage: reads "sentence" and writes tokens to "token".
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

In [None]:
# Pipeline stage: pretrained Russian stop-word list ('stopwords_ru') applied to
# "token", case-insensitively; the result is written to "cleanTokens".
stop_words_cleaner = (
    StopWordsCleaner
    .pretrained('stopwords_ru', 'ru')
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

In [None]:
# Pipeline stage: pretrained Russian lemmatizer ("lemma") applied to
# "cleanTokens"; the result is written to "lemma".
lemmatizer = (
    LemmatizerModel
    .pretrained("lemma", "ru")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

In [None]:
# Pipeline stage: turns the "lemma" annotations into an array column named
# "token_features" for the Spark ML stages that follow. Annotation columns are
# kept (cleanAnnotations is switched off).
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

In [None]:
# Pipeline stage: hashed term frequencies, "token_features" -> "raw_features".
hashing_TF = HashingTF(inputCol="token_features", outputCol="raw_features")

In [None]:
# Pipeline stage: IDF weighting, "raw_features" -> "features", with a minimum
# document frequency of 5.
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)

In [None]:
# Pipeline stage: encodes the string "topic" column as a numeric "label".
topic_indexer = StringIndexer(inputCol="topic", outputCol="label")

In [None]:
# Pipeline stage: random forest with 10 trees, trained on the TF-IDF "features"
# column to predict the indexed "label" column (both are the estimator defaults,
# spelled out here so the wiring to the IDF / StringIndexer stages is visible).
# The seed is pinned, using the same value as the train/test split, so repeated
# runs build the same forest; without an explicit seed the trained model - and
# therefore the reported metrics - can differ from one session to the next.
ran_forest = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=10,
    seed=1)

In [None]:
# Pipeline stage: decode the classifier's numeric "prediction" back into a topic
# name in "article_class".
# Decoding "label" instead would only reproduce the ground-truth topic, so the
# predicted class must be the input here. The prediction column is produced by
# the classifier rather than by the StringIndexer, so the index -> topic mapping
# is passed explicitly: it is taken from topic_indexer fitted on the training
# split, i.e. the same data the pipeline later fits that indexer on.
topic_labels = topic_indexer.fit(train_df).labels
topic_to_string_indexer = IndexToString(
    inputCol="prediction",
    outputCol="article_class",
    labels=topic_labels)

In [None]:
# Assemble the full pipeline; the stage order is significant because each stage
# reads a column written by an earlier one.
text_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    stop_words_cleaner,
    lemmatizer,
    finisher,
]
feature_stages = [hashing_TF, idf]
classification_stages = [topic_indexer, ran_forest, topic_to_string_indexer]

pipeline = Pipeline(stages=text_stages + feature_stages + classification_stages)

In [None]:
%%time
# Fit the whole pipeline on the training split; the %%time cell magic (which
# must stay on the first line of the cell) reports how long the fit takes.
classification_model = pipeline.fit(train_df)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Apply the fitted pipeline to the test split, keep the columns needed for
# evaluation and copy them to the driver as a pandas DataFrame.
df_rf = (
    classification_model
    .transform(test_df)
    .select("topic", "label", "prediction", "text")
)
df_rf_pandas = df_rf.toPandas()

In [None]:
# Distinct (label, topic) pairs seen in the scored test data, ordered by the
# numeric label.
labels_df = (
    df_rf
    .select("label", "topic")
    .distinct()
    .orderBy("label")
)
labels_df.show()

In [None]:
# Topic names as a Python list, in the order of labels_df (ascending label).
labels_raw = labels_df.collect()
labels = [r["topic"] for r in labels_raw]
labels

In [None]:
# Per-class precision / recall / F1 on the test split, with topic names used as
# the display names of the classes.
y_true = df_rf_pandas["label"]
y_pred = df_rf_pandas["prediction"]
print(classification_report(y_true, y_pred, target_names=labels))

In [None]:
# Overall accuracy on the test split.
test_accuracy = accuracy_score(df_rf_pandas["label"], df_rf_pandas["prediction"])
print(test_accuracy)