# In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.linear_model import LogisticRegression


# Anything that is not a Russian letter, a digit or a space is treated as a
# separator.  "ё"/"Ё" are listed explicitly because they lie outside the
# contiguous а-я / А-Я code point ranges and would otherwise be stripped,
# cutting words such as "ещё" down to "ещ".
_NON_TEXT_RE = re.compile(r'[^а-яА-ЯёЁ0-9 ]')


def _clean_description(text):
    """Replace non-Cyrillic, non-digit characters with spaces and collapse whitespace."""
    if not isinstance(text, str):
        # A missing value (None/NaN) becomes an empty document instead of
        # making ``re`` raise TypeError; other non-strings are stringified.
        text = '' if pd.isna(text) else str(text)
    return ' '.join(_NON_TEXT_RE.sub(' ', text).split())


def prediction(val, test):
    """Score the rows of *test* with a TF-IDF + logistic regression model.

    The model is trained on the labelled frame *val*: hyper-parameters are
    chosen by a 5-fold grid search on a stratified 80% part of it, and the
    final model is then refitted on all of *val*.

    Args:
        val: DataFrame with a ``description`` text column and an ``is_bad``
            label column.  It is not modified.
        test: DataFrame with a ``description`` text column.  It is not
            modified.

    Returns:
        1-D array with one value per row of *test*: column 1 of
        ``predict_proba``, i.e. the probability of the second class in the
        fitted model's ``classes_`` (presumably ``is_bad == 1`` -- confirm
        against the label encoding).
    """
    labels = val['is_bad']
    labeled_texts = [_clean_description(text) for text in val['description']]
    test_texts = [_clean_description(text) for text in test['description']]

    # Only the 80% part is used for the vocabulary and the grid search; the
    # 20% hold-out part is not evaluated anywhere in this function.
    train_texts, _, train_labels, _ = train_test_split(
        labeled_texts,
        labels,
        stratify=labels,
        test_size=0.20,
        random_state=12345,
    )

    count_tf_idf = TfidfVectorizer()
    tf_idf_train = count_tf_idf.fit_transform(train_texts)
    tf_idf_test = count_tf_idf.transform(test_texts)

    parameters = {
        'C': np.linspace(0.05, 20, 60),
        'class_weight': ['balanced', None],
    }
    # NOTE(review): no ``scoring`` is given, so candidates are ranked by the
    # estimator's default score (accuracy).  If the target metric is ROC AUC,
    # ``scoring='roc_auc'`` may be what is wanted -- confirm before changing.
    grid_search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=parameters,
        n_jobs=-1,
        cv=5,
    )
    grid_search.fit(tf_idf_train, train_labels)

    # Refit with the selected hyper-parameters on every labelled row.  The
    # features still use the vocabulary and IDF weights learned from the 80%
    # part above.
    model_linear = LogisticRegression(**grid_search.best_params_, random_state=123)
    model_linear.fit(count_tf_idf.transform(labeled_texts), labels)

    return model_linear.predict_proba(tf_idf_test)[:, 1]