In [None]:
import os
import cx_Oracle
import pandas as pd
import numpy as np
import datetime as dt
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import time
import gc

#Set up DB connection
# getpass drives the credential prompts below; it was used without ever being
# imported, which raises NameError on a fresh kernel.
import getpass

# Tell the Oracle client where its installation lives (value looks like a placeholder).
os.environ['ORACLE_HOME'] = "oraclepath"
# Build the DSN from host, port and service name (values look like placeholders).
dns_tns = cx_Oracle.makedsn('ip','7777',service_name = 'servicename')
# Both credentials are prompted for interactively, so nothing secret is stored in the notebook.
usr = getpass.getpass(prompt='Insert username:\n')
pwd = getpass.getpass(prompt='Insert password:\n')
# `conn` is the connection the data-loading cell passes to pandas.
conn = cx_Oracle.connect(user=usr, password=pwd, dsn=dns_tns, encoding='utf-8')

In [None]:
# Load the raw records into `logs`; later cells read its C_SCRIPT and OPERATOR columns.
# NOTE(review): the SQL string is empty here - presumably the query was stripped
# before sharing; it has to be filled in for this cell to run.
logs = pd.read_sql('''''',con=conn)

In [None]:
import gensim
from nltk.stem.snowball import *
from nltk.stem import WordNetLemmatizer
import pymorphy2
from bs4 import BeautifulSoup
import re

# Russian Snowball stemmer.
# NOTE(review): no active code in the visible part of the notebook uses `stemmer`.
stemmer = SnowballStemmer('russian')
# Inflected forms of "year"/"month" and of the twelve month names; tfidf_preprocess
# replaces any token found here with the placeholder '<дата>'.
date_list=['года','месяца','января','февраля','марта','апреля','мая','июня','июля','августа','сентября','октября','ноября','декабря',
          'месяце','январе','феврале','марте','апреле','мае','июне','июле','августе','сентябре','октябре','ноябре','декабре']
# Currency abbreviation; tfidf_preprocess rewrites it to 'рублей'.
rubles_list=['руб']
# Address-related words and abbreviations; tfidf_preprocess replaces them with '<локация>'.
# NOTE(review): the hyphenated entries ('пр-кт', 'пр-т') presumably never match, because
# tokens come from gensim's simple_preprocess, which appears to split on non-letters - confirm.
location_list=['ул','улица','кор','пр','пр-кт','проезд',"проспект",'гор',"пр-т"]

def tfidf_clean_text(text):
    """Normalise one raw document string before tokenisation.

    Steps, in order: lowercase, strip markup via BeautifulSoup (lxml parser),
    turn every '|||' into a space, replace anything starting with 'http' up to
    the next whitespace by '<URL>', then apply three literal substitutions.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    cleaned = BeautifulSoup(lowered, 'lxml').text
    cleaned = re.sub(r'\|\|\|', r' ', cleaned)
    cleaned = re.sub(r'http\S+', r'<URL>', cleaned)

    # Literal substitutions, applied in this exact order:
    #   'x'   -> removed entirely (every Latin lowercase x in the text)
    #   '\\n' -> space (the two-character sequence backslash + n, not a newline)
    #   '%'   -> ' <проценты>'
    literal_substitutions = (
        ('x', ''),
        ('\\n', ' '),
        ('%', ' <проценты>'),
    )
    for needle, replacement in literal_substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

# One shared morphological analyzer, created once and reused for every token.
morph = pymorphy2.MorphAnalyzer()

# Filled on first use by _russian_stopwords(); None means "not loaded yet".
_russian_stopwords_cache = None


def _russian_stopwords():
    """Return the NLTK Russian stop words as a frozenset, loading them only once."""
    global _russian_stopwords_cache
    if _russian_stopwords_cache is None:
        _russian_stopwords_cache = frozenset(stopwords.words('russian'))
    return _russian_stopwords_cache


def tfidf_preprocess(text):
    """Tokenise one document for the TF-IDF vectorizer.

    The text is cleaned with ``tfidf_clean_text`` and split with gensim's
    ``simple_preprocess`` (token length 2..30). Each token is then handled as
    follows:

    * a Russian stop word is dropped;
    * a token in ``date_list`` becomes ``'<дата>'``;
    * a token in ``rubles_list`` becomes ``'рублей'``;
    * a token in ``location_list`` becomes ``'<локация>'``;
    * any other token is kept unchanged, unless its pymorphy2 normal form is
      a stop word, in which case it is dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The resulting tokens, in document order.
    """
    # The stop-word list used to be fetched from NLTK (and scanned as a list)
    # up to twice per token; it is now loaded once and checked as a set.
    stop_words = _russian_stopwords()

    result = []
    cleaned = tfidf_clean_text(text)
    for token in gensim.utils.simple_preprocess(cleaned, min_len=2, max_len=30):
        if token in stop_words:
            continue
        if token in date_list:
            result.append('<дата>')
        elif token in rubles_list:
            result.append('рублей')
        elif token in location_list:
            result.append('<локация>')
        else:
            norm = morph.parse(token)[0].normal_form
            # NOTE(review): the normal form is only used for the stop-word
            # check; the surface token is what gets emitted. Confirm whether
            # emitting `norm` (lemmatisation) was intended.
            if norm not in stop_words:
                result.append(token)
    return result

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level TF-IDF over uni- and bi-grams, tokenised by tfidf_preprocess.
# Terms seen in fewer than 5 documents or in more than 80% of them are dropped.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=tfidf_preprocess,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    use_idf=True,
)
# Learn vocabulary and IDF weights from the C_SCRIPT column.
tfidf.fit(logs['C_SCRIPT'])

In [None]:
%%time
# Vectorise the whole C_SCRIPT column in a single process with the fitted vectorizer.
x_tf_train = tfidf.transform(logs['C_SCRIPT'])

In [None]:
%%time
import multiprocessing as mp
import scipy.sparse as sp

# Number of row chunks parallelize_dataframe splits the frame into.
num_partitions=36
# Number of worker processes in the multiprocessing pool.
num_workers=18

def parallelize_dataframe(df,func):
    """Apply ``func`` to consecutive row chunks of ``df`` in a process pool.

    The frame is cut into ``num_partitions`` consecutive row chunks, each chunk
    is passed to ``func`` in a pool of ``num_workers`` processes (both are
    module-level settings), and the per-chunk results are stacked vertically
    in the original row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is only sliced by row position here.
    func : callable
        Called with one chunk; must return a matrix that
        ``scipy.sparse.vstack`` can stack with the others.

    Returns
    -------
    scipy.sparse matrix in CSR format
        The stacked per-chunk results.
    """
    # Split row positions and slice with .iloc instead of handing the frame to
    # np.array_split: that path goes through DataFrame.swapaxes, which pandas
    # has deprecated. Chunk sizes are the same as np.array_split would produce.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]

    print('Start mapping')
    # The context manager releases the worker processes even when a task
    # raises; previously the pool was only closed on the success path.
    with mp.Pool(num_workers) as pool:
        parts = pool.map(func, df_split)

    print('Concat together')
    return sp.vstack(parts, format='csr')

def func(df):
    """Vectorise the C_SCRIPT column of one partition with the fitted ``tfidf``.

    Prints a progress line, then returns whatever ``tfidf.transform`` produces
    for this partition.
    """
    print('Apply to partition')
    return tfidf.transform(df['C_SCRIPT'])

# Same transform as the single-process cell, spread over a process pool.
# NOTE(review): the later cells visible here train on x_tf_train, not on this result.
x_tf_train_parallel = parallelize_dataframe(logs,func)

In [None]:
# Target labels: the OPERATOR column of the same frame the features were built from.
y_tf_train = logs['OPERATOR']

In [None]:
# Inspect the fitted vocabulary of the vectorizer.
# NOTE(review): this displays the whole mapping, which can be very large.
tfidf.vocabulary_

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline classifier using all available cores.
logreg = LogisticRegression(n_jobs=-1)
# 5-fold cross-validated macro-F1. Despite its name, `acc` does not hold accuracy.
# NOTE(review): the vectorizer was fitted on the full corpus before these folds
# are made, so IDF statistics are shared across train and validation folds.
acc = cross_val_score(
    estimator=logreg,
    X=x_tf_train,
    y=y_tf_train,
    scoring='f1_macro',
    cv=5,
)

In [None]:
# Show the per-fold scores; these are macro-F1 values (scoring='f1_macro'), not accuracy.
acc

In [None]:
# Refit on the full training matrix so the coefficients can be inspected below.
logreg.fit(x_tf_train,y_tf_train)

In [None]:
# Number of features to list at each end of the coefficient ranking.
n = 30

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# (coefficient, feature) pairs sorted ascending by coefficient.
# NOTE(review): coef_[0] is only the first row of the coefficient matrix; if
# OPERATOR has more than two classes this shows a single class - confirm.
coefs_with_fns = sorted(zip(logreg.coef_[0], feature_names))

# Pair the n smallest coefficients with the n largest (largest first).
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n+1):-1])
for (coef_1,fn_1), (coef_2,fn_2) in top:
    print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" %(coef_1,fn_1,coef_2,fn_2))

In [None]:
# Map feature name -> coefficient for the 100 largest coefficients
# (coefs_with_fns is sorted ascending). In the comprehension `k` is the
# coefficient and `v` the feature name, so the pair is flipped.
toplot={v:k for k,v in coefs_with_fns[-100:]}

In [None]:
from PIL import Image
# Load the image passed to WordCloud as `mask`, as a NumPy array.
# The path is relative to the current working directory.
mask = np.array(Image.open("borders.png"))

In [None]:
# Display the feature -> coefficient mapping that feeds the word cloud.
toplot

In [None]:
from wordcloud import WordCloud
# Build the word cloud from the feature -> coefficient mapping in `toplot`,
# using `mask` and showing at most 60 words.
wordcloud = WordCloud(
    width=1000,
    height=1000,
    max_words=60,
    background_color='white',
    colormap='plasma',
    mask=mask,
).generate_from_frequencies(toplot)

# Render the cloud on a large axis-free figure.
plt.rcParams['font.size'] = 20
plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
