In [1]:
import pandas as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
all_data = pd.read_csv('all_data.csv')
df_categories = pd.read_csv('./input/items.csv')

In [3]:
tmp = all_data.copy()
tmp = pd.merge(tmp,df_categories, on=['item_id','item_category_id'])

In [4]:
#Stemming
stemmer = SnowballStemmer("russian")
tmp['stemmed'] = tmp.item_name.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

In [5]:
ru_sw = np.loadtxt('stopwords_ru.txt',dtype='str',comments="#", delimiter=",", unpack=False,encoding='ISO-8859-1')
eng_sw = np.loadtxt('stopwords_eng.txt',dtype='str',comments="#", delimiter=",", unpack=False)
total_sw = list(ru_sw) + list(eng_sw)

In [6]:
tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words=total_sw, ngram_range=(1,2))
tvec_weights = tvec.fit_transform(tmp.stemmed.dropna())
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
333,регион,0.059992
370,цифров,0.027549
371,цифров версия,0.027038
140,pc цифров,0.025213
336,русск версия,0.021941
29,3d,0.019653
109,jewel,0.0192
52,cd,0.017672
119,lp,0.016959
125,mp3,0.016834


In [7]:
combined = tmp.copy()
combined = pd.concat([combined,weights_df],axis=1)

In [8]:
combined.to_csv('combined.csv',index=False)