In [31]:
import pandas as pd
import re
import plotly.graph_objects as go
#import yfinance as yf
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
import skopt
import joblib
#import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from tqdm import tqdm
import pymorphy2

In [42]:
data = pd.read_csv('USDRUB_Days.csv', parse_dates=['<DATE>'])
data.columns = ['TICKER','PER','date','time','Close']
data['Close1']=data['Close'].shift(-1)
data['target']=np.sign(data['Close1']-data['Close'])
data.reset_index(drop=True, inplace=True)
target=data[['date', 'target']]
target['key_date'] = target['date'].dt.strftime('%Y-%m-%d')
target=target.loc[target['date']<'2021']
target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target['key_date'] = target['date'].dt.strftime('%Y-%m-%d')


Unnamed: 0,date,target,key_date
0,2016-01-01,0.0,2016-01-01
1,2016-01-02,1.0,2016-01-02
2,2016-01-03,-1.0,2016-01-03
3,2016-01-04,1.0,2016-01-04
4,2016-01-05,1.0,2016-01-05
...,...,...,...
1745,2020-12-27,-1.0,2020-12-27
1746,2020-12-28,1.0,2020-12-28
1747,2020-12-29,1.0,2020-12-29
1748,2020-12-30,-1.0,2020-12-30


In [28]:
df=pd.read_csv('VBR_new.csv', encoding='utf-16')
df = df[df['tweet'].notna()]
df.reset_index(drop=True, inplace=True)
df['date']=pd.to_datetime(df['date'])
def clean(sen):
    sentence = re.sub("(http|https|ftp)\://([a-zA-Z0-9\-\.]+\.+[a-zA-Z]{2,3})(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+&amp;%\$#\=~]*)[^\.\,\)\(\s]?", ' ', sen)
    sentence = re.sub('[^a-zA-Zа-яА-Яё]', ' ', sentence)
    sentence = re.sub(r'\s+[a-zA-Z]\s+', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.lower()
    return sentence
df['tweet'] = df['tweet'].apply(clean)
df['key_date'] = df['date'].dt.strftime('%Y-%m-%d')
df = pd.merge(df,target[['key_date','target']], on='key_date', how='left')
df = df[df['target'].notna()]
df = df.loc[df['target']!=0]
data = data.loc[data['target']!=0]

In [29]:
df.to_csv('clean_tweets.csv', index=False)

In [32]:
df_lem = df.copy()
morph = pymorphy2.MorphAnalyzer()
def lemmatize(text):
    words = nltk.word_tokenize(text)
    res = []
    for word in words:
        p = morph.parse(word)[0]
        res.append(p.normal_form)
    return ' '.join(res)

tqdm.pandas()
df_lem['tweet'] = df_lem['tweet'].progress_apply(lemmatize)

100%|██████████| 81316/81316 [03:20<00:00, 406.50it/s]


In [33]:
df_lem.to_csv('clean_tweets_lem.csv', index=False)

In [50]:
y_train = list(df.loc[df['date']<'2020-10-01','target'])
y_test = list(df.loc[df['date']>='2020-10-01','target'])
X_train = list(df.loc[df['date']<'2020-10-01','tweet'])
X_test = list(df.loc[df['date']>='2020-10-01','tweet'])

y_train_lem = list(df_lem.loc[df_lem['date']<'2020-10-01','target'])
y_test_lem = list(df_lem.loc[df_lem['date']>='2020-10-01','target'])
X_train_lem = list(df_lem.loc[df_lem['date']<'2020-10-01','tweet'])
X_test_lem = list(df_lem.loc[df_lem['date']>='2020-10-01','tweet'])

target=target.loc[target['date']<'2021']
target_test=target.loc[target['date']>='2020-10-01']
target_test.reset_index(drop=True, inplace=True)
day_y_test=list(target.loc[target['date']>='2020-10-01','target'])
day_y_test=day_y_test[:-1]
day_x_test=[]
day_x_test_lem=[]
for i in range(1,len(day_y_test)):
    day_x_test.append(df.loc[(df['date']<target_test.loc[i,'key_date']) & (df['date']>=target_test.loc[i-1,'key_date'])])
    day_x_test_lem.append(df_lem.loc[(df_lem['date']<target_test.loc[i,'key_date']) & (df_lem['date']>=target_test.loc[i-1,'key_date'])])

In [35]:
stopwords=nltk.corpus.stopwords.words('russian')

In [54]:
def metric(proba):
    respred = 0
    resproba = 0
    for i in proba:
        respred+=np.sign(i[1]-i[0])
        resproba+=2*i[1]
    resproba=resproba/len(proba)-1
    respred=respred/len(proba)
    return respred, resproba


def mod_construct(lem=False,tfidf=False,ngram_range='13',stop_words=False,max_features=None,
                  penalty='l2',fit_intercept=True,class_weight='balanced',C=1,res='score'):
    ngram_range = [int(ngram_range[0]), int(ngram_range[1])]
    if stop_words:
        stop_words=stopwords
    else:
        stop_words=None
    if tfidf:
        vec = TfidfVectorizer(ngram_range=ngram_range, stop_words=stop_words, max_features=max_features)
    else:
        vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words, max_features=max_features)
    if lem:
        X = X_train_lem
        y = y_train_lem.copy()
        day_x = day_x_test_lem
    else:
        X = X_train
        y = y_train.copy()
        day_x = day_x_test
    X = vec.fit_transform(X)
    model = LogisticRegression(penalty=penalty, fit_intercept=fit_intercept, class_weight=class_weight, C=C,
                               random_state=42, solver='liblinear', max_iter=500)
    model.fit(X,y)
    day_test_predict = []
    respred_ac=0
    resproba_ac=0
    n=0
    zero=0
    for i in day_x:
        if len(i)>=20:
            buf_x = vec.transform(i['tweet'])
            buf_proba = model.predict_proba(buf_x)
            buf_day_pred = metric(buf_proba)
            if np.sign(buf_day_pred[0])==day_y_test[n]:
                respred_ac+=1
            if np.sign(buf_day_pred[1])==day_y_test[n]:
                resproba_ac+=1
            day_test_predict.append((n,buf_day_pred))
        else:
            zero+=1
        n+=1

    respred_ac = respred_ac/(len(day_x)-zero)
    resproba_ac = resproba_ac/(len(day_x)-zero)
    f1 = f1_score(y_test, model.predict(vec.transform(X_test)))
    print(f1, respred_ac, resproba_ac)
    if res=='score':
        return -max(respred_ac, resproba_ac)
    elif res=='model':
        return vec, model


In [56]:
SPACE = [
    skopt.space.Categorical([False, True], name='lem'),
    skopt.space.Categorical([False, True], name='tfidf'),
    skopt.space.Categorical(['11','12','13','22','23','33'], name='ngram_range'),
    skopt.space.Categorical([False, True], name='stop_words'),
    skopt.space.Categorical([50000,100000,None], name='max_features'),
    skopt.space.Categorical([False, True], name='fit_intercept'),
    skopt.space.Categorical([None, 'balanced'], name='class_weight'),
    skopt.space.Real(0.0001, 50, name='C')
]

@skopt.utils.use_named_args(SPACE)
def objective(**params):
    return mod_construct(**params)

result = skopt.gp_minimize(objective, SPACE, n_calls=400, n_random_starts=200, n_jobs=-1, verbose=True)
best_metric = result.fun
best_param = result.x

Iteration No: 1 started. Evaluating function at random point.
0.4921077065923862 0.42028985507246375 0.4492753623188406
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 4.3916
Function value obtained: -0.4493
Current minimum: -0.4493
Iteration No: 2 started. Evaluating function at random point.
0.49696969696969695 0.4782608695652174 0.5217391304347826
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 5.2948
Function value obtained: -0.5217
Current minimum: -0.5217
Iteration No: 3 started. Evaluating function at random point.
0.42105263157894735 0.5362318840579711 0.4782608695652174
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 3.0562
Function value obtained: -0.5362
Current minimum: -0.5362
Iteration No: 4 started. Evaluating function at random point.
0.4987025241802312 0.5362318840579711 0.5507246376811594
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.3382
Function value obtained: -0.5507
Current minimu



0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 210 ended. Search finished for the next optimal point.
Time taken: 7.7186
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 211 started. Searching for the next optimal point.
0.49499761791329205 0.5362318840579711 0.6376811594202898
Iteration No: 211 ended. Search finished for the next optimal point.
Time taken: 7.9322
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 212 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 212 ended. Search finished for the next optimal point.
Time taken: 7.9722
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 213 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 213 ended. Search finished for the next optimal point.
Time taken: 8.2706
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 214 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 214 ended. Search finished for the next optimal point.
Time taken: 8.5899
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 215 started. Searching for the next optimal point.




0.49499761791329205 0.5362318840579711 0.6376811594202898
Iteration No: 215 ended. Search finished for the next optimal point.
Time taken: 7.9318
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 216 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 216 ended. Search finished for the next optimal point.
Time taken: 8.3520
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 217 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 217 ended. Search finished for the next optimal point.
Time taken: 8.0432
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 218 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 218 ended. Search finished for the next optimal point.
Time taken: 7.9726
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 219 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 219 ended. Search finished for the next optimal point.
Time taken: 7.8997
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 220 started. Searching for the next optimal point.
0.49679715302491106 0.5217391304347826 0.5797101449275363
Iteration No: 220 ended. Search finished for the next optimal point.
Time taken: 15.6603
Function value obtained: -0.5797
Current minimum: -0.6377
Iteration No: 221 started. Searching for the next optimal point.
0.4965484408474173 0.5652173913043478 0.6376811594202898
Iteration No: 221 ended. Search finished for the next optimal point.
Time taken: 7.4831
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 222 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 222 ended. Search finished for the next optimal point.
Time taken: 7.9066
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 223 started. Searching for the next optimal point.




0.49499761791329205 0.5362318840579711 0.6376811594202898
Iteration No: 223 ended. Search finished for the next optimal point.
Time taken: 8.4251
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 224 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 224 ended. Search finished for the next optimal point.
Time taken: 7.7850
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 225 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 225 ended. Search finished for the next optimal point.
Time taken: 7.7661
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 226 started. Searching for the next optimal point.
0.498455690187693 0.5652173913043478 0.6376811594202898
Iteration No: 226 ended. Search finished for the next optimal point.
Time taken: 7.5129
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 227 started. Searching for the next optimal point.
0.49666666666666665 0.5652173913043478 0.6376811594202898
Iteration No: 227 ended. Search finished for the next optimal point.
Time taken: 7.8356
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 228 started. Searching for the next optimal point.
0.49762131303520457 0.5507246376811594 0.6376811594202898
Iteration No: 228 ended. Search finished for the next optimal point.
Time taken: 8.3862
Function value obtained: -0.6377
Current minimum:



0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 229 ended. Search finished for the next optimal point.
Time taken: 8.1471
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 230 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 230 ended. Search finished for the next optimal point.
Time taken: 8.1118
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 231 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 231 ended. Search finished for the next optimal point.
Time taken: 8.1124
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 232 started. Searching for the next optimal point.




0.49499761791329205 0.5362318840579711 0.6376811594202898
Iteration No: 232 ended. Search finished for the next optimal point.
Time taken: 8.8078
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 233 started. Searching for the next optimal point.
0.49654515129854654 0.5652173913043478 0.6376811594202898
Iteration No: 233 ended. Search finished for the next optimal point.
Time taken: 8.3314
Function value obtained: -0.6377
Current minimum: -0.6377
Iteration No: 234 started. Searching for the next optimal point.
0.4931899641577061 0.5507246376811594 0.6086956521739131
Iteration No: 234 ended. Search finished for the next optimal point.
Time taken: 8.1734
Function value obtained: -0.6087
Current minimum: -0.6377
Iteration No: 235 started. Searching for the next optimal point.
0.4970273483947682 0.5507246376811594 0.6376811594202898
Iteration No: 235 ended. Search finished for the next optimal point.
Time taken: 8.9611
Function value obtained: -0.6377
Current minimum:



0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 321 ended. Search finished for the next optimal point.
Time taken: 15.2144
Function value obtained: -0.6377
Current minimum: -0.6522
Iteration No: 322 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 322 ended. Search finished for the next optimal point.
Time taken: 14.2713
Function value obtained: -0.6377
Current minimum: -0.6522
Iteration No: 323 started. Searching for the next optimal point.
0.4914888515943419 0.5507246376811594 0.6521739130434783
Iteration No: 323 ended. Search finished for the next optimal point.
Time taken: 12.6870
Function value obtained: -0.6522
Current minimum: -0.6522
Iteration No: 324 started. Searching for the next optimal point.
0.4895508047081431 0.5652173913043478 0.6521739130434783
Iteration No: 324 ended. Search finished for the next optimal point.
Time taken: 11.9796
Function value obtained: -0.6522
Current minimum: -0.6522
Iteration No: 325 started. Searching for the next optimal point.
0.49005034763845606 0.5652173913043478 0.6521739130434783
Iteration No: 325 ended. Search finished for the next optimal point.
Time taken: 13.1134
Function value obtained: -0.6522
Current mini



0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 375 ended. Search finished for the next optimal point.
Time taken: 17.8290
Function value obtained: -0.6377
Current minimum: -0.6667
Iteration No: 376 started. Searching for the next optimal point.
0.4894613583138174 0.4492753623188406 0.5507246376811594
Iteration No: 376 ended. Search finished for the next optimal point.
Time taken: 30.4304
Function value obtained: -0.5507
Current minimum: -0.6667
Iteration No: 377 started. Searching for the next optimal point.
0.4930721452460583 0.5652173913043478 0.6231884057971014
Iteration No: 377 ended. Search finished for the next optimal point.
Time taken: 17.6659
Function value obtained: -0.6232
Current minimum: -0.6667
Iteration No: 378 started. Searching for the next optimal point.




0.49583234103357937 0.5507246376811594 0.6376811594202898
Iteration No: 378 ended. Search finished for the next optimal point.
Time taken: 18.1224
Function value obtained: -0.6377
Current minimum: -0.6667
Iteration No: 379 started. Searching for the next optimal point.
0.4899328859060403 0.5362318840579711 0.5652173913043478
Iteration No: 379 ended. Search finished for the next optimal point.
Time taken: 20.5116
Function value obtained: -0.5652
Current minimum: -0.6667
Iteration No: 380 started. Searching for the next optimal point.
0.5044796691936596 0.4927536231884058 0.5797101449275363
Iteration No: 380 ended. Search finished for the next optimal point.
Time taken: 25.4282
Function value obtained: -0.5797
Current minimum: -0.6667
Iteration No: 381 started. Searching for the next optimal point.
0.614680398888344 0.43478260869565216 0.43478260869565216
Iteration No: 381 ended. Search finished for the next optimal point.
Time taken: 17.2804
Function value obtained: -0.4348
Current mini

In [57]:
joblib.dump(result, 'result.sav')

['result.sav']

In [58]:
best_param

[False, True, '11', True, None, False, 'balanced', 2.7506158292731357]

In [51]:
a=[len(i) for i in day_x_test]

In [53]:
[i for i in a if i<20]

[14, 13, 14, 9, 14, 8, 18, 18]