In [1]:
import pandas as pd
import numpy as np
import scipy
import pyspark
import os
import findspark
from tqdm import tqdm
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, row_number, col
from pyspark.sql.functions import collect_list
from pyspark import SparkContext, SparkConf, HiveContext
from pyspark.mllib.feature import Word2Vec
import warnings

warnings.filterwarnings('ignore')

In [2]:
spark_location = '/Users/arkadyvasilenko/spark-2.4.5-bin-hadoop2.7' # Set your own
java8_location = '/library/java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home' # Set your own
os.environ['JAVA_HOME'] = java8_location
findspark.init(spark_home = spark_location) 

In [3]:
exec(open(os.path.join("/usr/local/Cellar/apache-spark/2.4.5/libexec/python/pyspark/shell.py")).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/

Using Python version 3.7.4 (default, Aug 13 2019 15:17:50)
SparkSession available as 'spark'.


In [4]:
sc = SparkContext.getOrCreate()
hive = HiveContext(sc)

In [5]:
pays = spark.read.csv('../okved_test/data/pays.csv',inferSchema = True, header = True)

## Prepare data

In [6]:
pays = pays.withColumn("hash_inn_kt", pays["hash_inn_kt"].cast('string'))
inns = pays.select('hash_inn_kt').distinct().rdd.map(lambda r: r[0]).collect()

In [7]:
grouped_df = pays.groupby('hash_inn_kt').agg(collect_list('hash_inn_dt').alias("hash_inn_dt"))
train_inn = grouped_df.select('hash_inn_dt').collect()

In [8]:
str_list = []
for row in train_inn:
    str_list.append(" ".join(str(x) for x in row.hash_inn_dt))

## Word2Vec

In [9]:
count_inn = 100
vec_size = 5

localDoc = [",".join(str(x) for x in str_list[:count_inn])]
doc = sc.parallelize(localDoc).map(lambda line: line.split(" "))
word2vec = Word2Vec()

#params
word2vec.setVectorSize(vec_size)
#word2vec.setNumIterations(10)
#word2vec.setNumPartitions(10)

model = word2vec.fit(doc)

## Vectors to df

In [10]:
vectors_ = model.getVectors()
vectors = {k: list([x for x in vectors_.get(k)])
    for k in vectors_.keys()}

embed_df = pd.DataFrame(data=vectors).T
embed_df.columns = ['col' + str(i) for i in range(1, vec_size + 1)]

embed_df['hash_inn'] = embed_df.index.astype('int')
embed_df.reset_index(drop = True, inplace = True)

## Get okved

In [11]:
public_df = pd.read_csv('../data/inn_info_public.csv')
private_df = pd.read_csv('../data/inn_info_private.csv')
full = pd.concat([public_df.loc[public_df.okved2 != -1], private_df], axis=0)

In [12]:
df = full[['hash_inn','okved2']].merge(embed_df, left_on='hash_inn', right_on='hash_inn')
df.head()

Unnamed: 0,hash_inn,okved2,col1,col2,col3,col4,col5
0,3736,34,0.392305,-0.157705,-0.350331,-0.431061,-0.012637
1,204639,34,0.176783,-1.020083,-0.350738,0.17988,0.839318
2,123551,14,-0.376219,-0.931577,-1.53201,0.414244,0.365705
3,257788,12,0.439426,-0.296783,-0.072623,0.383588,0.00054
4,113791,34,1.027697,0.781429,-0.198442,-0.027294,0.053618
