In [None]:
'''!pip install pymorphy2
!pip install pyspellchecker'''
import os, pandas as pd, re, numpy as np, ast, nltk, math #,pymorphy2
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import PorterStemmer
#from spellchecker import SpellChecker
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_absolute_error
# Print the full path of every file found under the Kaggle input directory,
# one path per line (prints nothing if the directory does not exist).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
class Preprocessing:
    """Normalise raw comment text into a cleaned token string plus raw term counts.

    Attributes:
        rgc: compiled pattern matching any character that is not a lowercase
            Latin or Cyrillic letter, a digit, ``-`` or ``_``.
        tokenizer: NLTK ``ToktokTokenizer`` used to split the cleaned text.
        lemmatizer: optional object whose ``parse(token)[0].normal_form``
            yields a lemma; ``None`` means lemmatisation is unavailable.
        stemmer: NLTK ``PorterStemmer``.
        stopwords: set of tokens that are dropped from the output.
    """

    def __init__(self, stopwords, lemmatizer=None):
        """Create the preprocessor.

        Args:
            stopwords: path to a text file holding one stop word per line.
            lemmatizer: optional lemmatiser exposing
                ``parse(token)[0].normal_form`` (the interface the original,
                commented-out pymorphy2 ``MorphAnalyzer`` was used through).
                Required only when ``preproc`` is called with ``use_lemm=True``.
        """
        self.rgc = re.compile('[^a-zа-яё0-9-_]')
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = lemmatizer
        self.stemmer = PorterStemmer()

        # Explicit encoding so the (possibly non-ASCII) stop-word list is read
        # the same way regardless of the platform's default locale.
        with open(stopwords, 'r', encoding='utf-8') as f:
            self.stopwords = set(f.read().split('\n'))

    def preproc(self, text, check_stopwords=True, check_length=True, use_lemm=False, use_stem=True):
        """Clean ``text`` and count the surviving tokens.

        Args:
            text: raw comment string.
            check_stopwords: drop tokens found in ``self.stopwords``. The check
                runs on the token *after* lemmatisation/stemming.
            check_length: drop tokens shorter than two characters.
            use_lemm: lemmatise each token with ``self.lemmatizer``.
            use_stem: stem each token with ``self.stemmer``.

        Returns:
            A ``(cleaned_text, tf)`` tuple: the kept tokens joined by single
            spaces, and a dict mapping each kept token to its occurrence count.

        Raises:
            AttributeError: if ``use_lemm`` is true but no lemmatizer was
                configured. The exception type matches what the missing
                attribute used to raise, but the message now says what to do.
        """
        if use_lemm and getattr(self, 'lemmatizer', None) is None:
            raise AttributeError(
                "use_lemm=True requires a lemmatizer; pass one to "
                "Preprocessing(stopwords, lemmatizer=...)"
            )

        s = re.sub("\n", r" ", text)

        # Drop every word that contains at least one digit (not just the digits).
        s = re.sub(r'\w*\d\w*', '', s).strip()
        # Collapse runs of spaces left behind by the removal above.
        s = re.sub(' +', ' ', s)
        # Squash any character repeated 3+ times down to exactly two.
        s = re.sub(r'(.)\1+', r'\1\1', s)

        s = re.sub("'", r" ", s)
        s = s.lower()
        # Replace everything outside the allowed alphabet with a space.
        s = self.rgc.sub(" ", s)

        final_agg = []
        tf = {}

        for token in self.tokenizer.tokenize(s):
            if check_length and len(token) < 2:
                continue
            # Skip fragments with a dangling hyphen on either side.
            if token.startswith('-') or token.endswith('-'):
                continue
            if use_lemm:
                token = self.lemmatizer.parse(token)[0].normal_form
            if use_stem:
                token = self.stemmer.stem(token)
            if check_stopwords and token in self.stopwords:
                continue
            tf[token] = tf.get(token, 0) + 1
            final_agg.append(token)

        return ' '.join(final_agg), tf
    
def prepare_df(df):
    """Preprocess the ``text`` column and attach normalised term frequencies.

    Uses the module-level ``Preprocessing`` instance ``p``.

    Args:
        df: DataFrame with a ``text`` column of strings. It is not modified;
            the function works on a copy.

    Returns:
        A new DataFrame (index reset into an ``index`` column) where ``text``
        is replaced by:
          * ``len_comment`` - character length of the raw text,
          * ``comment_text_proc`` - the ``(cleaned_text, counts)`` tuple
            returned by ``p.preproc``; kept on purpose because ``final_proc``
            reads it,
          * ``com_tf`` - dict mapping each token to count / total count.
    """
    def right_tf(dic):
        # Scale raw counts so they sum to 1. An empty or all-zero dict is
        # scaled by 0 instead of dividing by zero.
        total = sum(dic.values())
        scale = (1 / total) if total != 0 else 0
        scaled = list(np.array(list(dic.values())) * scale)
        return dict(zip(dic.keys(), scaled))

    # Work on a copy so the caller's frame is not silently given extra columns.
    df = df.copy()
    df['len_comment'] = df['text'].apply(len)
    df['comment_text_proc'] = df['text'].apply(p.preproc)
    df = df.drop(columns=['text'])

    # Second element of each (cleaned_text, counts) tuple -> normalised counts.
    df['com_tf'] = df['comment_text_proc'].apply(lambda pair: right_tf(pair[1]))

    return df.reset_index()
    
def read_proc(path_read='../input/jigsaw-toxic-severity-rating/comments_to_score.csv'):
    """Read the comments CSV and return it after ``prepare_df``.

    Args:
        path_read: path of a CSV file containing at least the columns
            ``comment_id`` and ``text``.

    Returns:
        Whatever ``prepare_df`` returns for the two selected columns.
    """
    wanted_columns = ['comment_id', 'text']
    comments = pd.read_csv(path_read)[wanted_columns]
    return prepare_df(comments)

def get_freq_dict(freq_string):
    """Parse the dict part out of a stringified ``(text, dict)`` tuple.

    The slice starts two characters after the first comma and stops before the
    final character (the closing parenthesis); that slice is evaluated as a
    Python literal.
    """
    dict_start = freq_string.find(',') + 2
    dict_literal = freq_string[dict_start:-1]
    return ast.literal_eval(dict_literal)

def get_comment(freq_string):
    """Extract the text part of a stringified ``(text, dict)`` tuple.

    Returns the characters between the opening ``('`` and the ``', `` that
    precedes the first ``{``.
    """
    text_end = freq_string.find('{') - 3
    return freq_string[2:text_end]

def final_proc(train):
    """Split the ``comment_text_proc`` tuples into two dedicated columns.

    Adds ``freq_dict`` (second tuple element) and ``comment`` (first tuple
    element) to ``train`` in place, then returns a new frame without the
    ``comment_text_proc`` column.
    """
    pairs = train['comment_text_proc']
    train['freq_dict'] = pairs.apply(lambda pair: pair[1])
    train['comment'] = pairs.apply(lambda pair: pair[0])
    return train.drop(columns=['comment_text_proc'])

def reg_df_prep(df, most_common_words, verbose=True):
    """Add one TF-IDF feature column per vocabulary word and drop helper columns.

    For each ``word`` the new column holds
    ``log10(n_rows / document_frequency(word)) * tf(word)``, where ``tf`` comes
    from the row's ``com_tf`` dict (0 when the word is absent) and the document
    frequency is the number of rows whose ``freq_dict`` contains the word.
    Words that occur in no row get an all-zero column.

    This function used to be defined twice with identical bodies; it is now
    defined once.

    Args:
        df: DataFrame with ``com_tf``, ``freq_dict`` and ``index`` columns.
            It is modified in place (feature columns are added to it).
        most_common_words: iterable of vocabulary words. Its iteration order
            determines the order of the feature columns.
        verbose: when true (the default, matching the previous behaviour),
            print every word with zero document frequency and a progress
            counter every 100 words.

    Returns:
        A new DataFrame without the ``com_tf``, ``freq_dict`` and ``index``
        columns.
    """
    n_docs = df.shape[0]

    # Count in one pass how many rows contain each token. Previously every
    # vocabulary word triggered two full scans over all rows.
    doc_freq = {}
    for counts in df['freq_dict']:
        for token in counts:
            doc_freq[token] = doc_freq.get(token, 0) + 1

    # Snapshot the per-row TF dicts positionally, so the lookup does not depend
    # on the index labels being 0..n-1.
    tf_dicts = df['com_tf'].tolist()

    for processed, word in enumerate(most_common_words, start=1):
        freq = doc_freq.get(word, 0)
        if freq == 0:
            df[word] = [0] * n_docs
            if verbose:
                print(word)
        else:
            idf = math.log10(n_docs / freq)
            df[word] = idf * np.array([tf.get(word, 0) for tf in tf_dicts], dtype=float)

        if verbose and processed % 100 == 0:
            print(processed)

    return df.drop(columns=['com_tf', 'freq_dict', 'index'])
  
# Module-level preprocessor instance: prepare_df() reads this global `p`
# when it applies p.preproc to the `text` column.
p = Preprocessing(stopwords='../input/stoppp/stopwords.txt')

In [None]:
def _read_vocabulary(path):
    """Return the whitespace-separated words in ``path``, de-duplicated, in file order.

    A ``set`` was used here before. Iterating a set of strings yields a
    different order from one interpreter run to the next (string hashing is
    randomised), and that order becomes the feature-column order handed to the
    models as a bare NumPy array. A list keeps the order reproducible.
    NOTE(review): this assumes the order of words in the file is the feature
    order the models were trained with - confirm against the training notebook.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(f.read().split()))


mc_1 = _read_vocabulary('../input/most-common-words/mc_1.txt')
mc_2 = _read_vocabulary('../input/most-common-words/mc_2.txt')

In [None]:
# Load the two pre-trained CatBoost regressors from their input datasets.
# NOTE(review): the assignment relies on load_model() returning the estimator
# it was called on - confirm against the CatBoost documentation.
model_1 = CatBoostRegressor().load_model('../input/model-1/model_1')
model_2 = CatBoostRegressor().load_model('../input/model2/model_2')

In [None]:
# Shared base frame: read and preprocess the comments, then split the
# (cleaned_text, counts) tuples into separate columns.
cts = final_proc(read_proc())

# Feature matrix for model 1: TF-IDF columns for the mc_1 vocabulary, built on
# a copy so `cts` stays reusable for model 2. The identifier and the cleaned
# text are removed before converting to a plain NumPy array.
cts_1 = reg_df_prep(cts.copy(), mc_1)
cts_df_1 = cts_1.drop(columns=['comment_id', 'comment']).to_numpy()

In [None]:
# Score every comment with the first model.
cts_preds_1 = model_1.predict(cts_df_1)
# Clipping to [0, 1] is disabled; the predictions are min-max normalised in a later cell.
#cts_preds_1 = np.clip(cts_preds_1, 0, 1)

In [None]:
# Feature matrix for model 2: same steps as for model 1, but with the mc_2
# vocabulary. Built from a fresh copy of `cts` because reg_df_prep adds columns
# to the frame it receives.
cts_2 = reg_df_prep(cts.copy(), mc_2)
cts_df_2 = cts_2.drop(columns=['comment_id', 'comment']).to_numpy()

In [None]:
# Score every comment with the second model.
cts_preds_2 = model_2.predict(cts_df_2)
# Clipping to [0, 1] is disabled; the predictions are min-max normalised in a later cell.
#cts_preds_2 = np.clip(cts_preds_2, 0, 1)

In [None]:
cts_preds_1

In [None]:
cts_preds_2

In [None]:
def normalize_arr(arr):
    """Min-max scale ``arr`` so its smallest value maps to 0 and its largest to 1.

    Args:
        arr: array-like of numbers.

    Returns:
        ``(arr - min) / (max - min)``. Two degenerate inputs are handled
        explicitly instead of failing or producing NaN/inf:
          * empty input -> an empty float array,
          * all values equal (max == min) -> an array of zeros.
    """
    if np.size(arr) == 0:
        return np.asarray(arr, dtype=float)

    low = np.min(arr)
    span = np.max(arr) - low
    if span == 0:
        # Constant input: there is no spread to scale, and dividing would give NaN.
        return np.zeros_like(arr, dtype=float)
    return (arr - low) / span

In [None]:
# Rescale model 1's predictions with min-max normalisation before they are averaged with model 2's.
cts_preds_1 = normalize_arr(cts_preds_1)

In [None]:
# Rescale model 2's predictions with min-max normalisation before they are averaged with model 1's.
cts_preds_2 = normalize_arr(cts_preds_2)

In [None]:
cts_preds_1.min()

In [None]:
# Ensemble: unweighted mean of the two models' normalised predictions.
preds = (cts_preds_2 + cts_preds_1)/2

In [None]:
preds.max()

In [None]:
# Re-read the raw comments file; the ensemble scores are attached to this frame below.
df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
#df = df[['comment_id']]
df

In [None]:
# Attach the ensemble scores to the raw comments.
# NOTE(review): `preds` presumably is a plain NumPy array, so this assignment is
# positional - it relies on `df` having the same row order as the frame the
# features were built from (read_proc() reads the same CSV path by default).
df['score'] = preds
df

In [None]:
df.sort_values(by=['score'], ascending = False).text.iloc[10]

In [None]:
len(df.score.unique())

In [None]:
# Replace the scores by their ranks. method='first' breaks ties by order of
# appearance, so every row ends up with a distinct score value.
df['score'] = df['score'].rank(method='first')

In [None]:
len(df.score.unique())

In [None]:
# Write only the identifier and the score. `df` still carries the raw `text`
# column (kept for the inspection cells), which does not belong in the output.
# NOTE(review): the expected submission layout is assumed to be
# `comment_id,score` - confirm against the competition's sample submission.
df[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
df