In [10]:
import os
import pickle

import pandas as pd

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset_path = "datasets/dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3600 non-null   int64  
 1   pr_txt             3600 non-null   object 
 2   Категория          3600 non-null   object 
 3   Уровень рейтинга   3600 non-null   object 
 4   preprocessed_text  3600 non-null   object 
 5   target             3600 non-null   int64  
 6   scaled_target      3600 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 197.0+ KB


In [4]:
# Drop the 'Unnamed: 0' column (presumably a row index that was written into the
# CSV -- confirm against how the dataset was exported).
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

## TF-IDF

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Hold out 30% of the rows for evaluation with a fixed seed. The targets keep all
# three label columns so the same split serves the regression target and both
# classification granularities.
target_columns = ['Категория', 'Уровень рейтинга', 'scaled_target']
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_text'],
    df[target_columns],
    test_size=0.3,
    random_state=42,
)

In [7]:
# TF-IDF over n-grams of length exactly 3 (lower and upper bound are both 3).
trigram_range = (3, 3)
vec = TfidfVectorizer(ngram_range=trigram_range)

In [8]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts (the test split never influences fitting).
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

In [9]:
# (number of training documents, number of n-gram features)
X_train_vec.shape

(2520, 556776)

In [12]:
 # Persist the fitted TF-IDF vectorizer so it can be reloaded without refitting.
 vectorizer_path = 'models/TF-IDF 3 n-grams.pkl'
 # A fresh checkout may not have the output directory yet; create it on demand.
 os.makedirs(os.path.dirname(vectorizer_path), exist_ok=True)
 with open(vectorizer_path, 'wb') as f:
     pickle.dump(vec, f)
 # Report success only after the file has been flushed and closed.
 print('saved')

saved


## Regression2Classification

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Fit a linear regression on the TF-IDF features against the scaled numeric target.
train_target = y_train['scaled_target']
model = LinearRegression()
model.fit(X_train_vec, train_target)

LinearRegression()

In [15]:
# Predicted scaled targets for the held-out texts; these are mapped back to
# rating labels further down.
pred = model.predict(X_test_vec)

#### Regression metrics

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [18]:
# Regression errors on the scaled target; RMSE is the square root of the MSE.
true_scaled = y_test['scaled_target']
mae = mean_absolute_error(true_scaled, pred)
print(f"MAE {mae}")
rmse = mean_squared_error(true_scaled, pred) ** 0.5
print(f"RMSE {rmse}")

MAE 0.026798834349338326
RMSE 0.03960379997045509


#### Lookup dictionaries for mapping scaled targets to rating classes

In [19]:
# Rating label -> integer score, listed from the highest score (AAA) to the
# lowest (C). Scores 8 and -8 are not assigned to any label. Later cells in
# this notebook use only the key order of this dict, not the scores themselves.
new_target_values = {'AAA': 9,
                     'AA+': 7, 
                     'AA': 6,
                     'AA-': 5,
                     'A+': 4,
                     'A': 3,
                     'A-': 2,
                     'BBB+': 1,
                     'BBB': 0,
                     'BBB-': -1,
                     'BB+': -2,
                     'BB': -3,
                     'BB-': -4,
                     'B+': -5,
                     'B': -6,
                     'B-': -7,
                     'C': -9}

In [20]:
# Integer rating score (-10..10) -> target value in [0, 1], in steps of 0.05.
# The values of this dict are later used as the keys of the label lookup.
# NOTE(review): the float noise (e.g. 0.04999999999999999) presumably comes from
# the scaler that produced `scaled_target`; keep the literals exactly as they
# are -- confirm against the preprocessing step.
scaled_target_values = {-10: 0.0,
                         -9: 0.04999999999999999,
                         -8: 0.09999999999999998,
                         -7: 0.14999999999999997,
                         -6: 0.19999999999999996,
                         -5: 0.25,
                         -4: 0.3,
                         -3: 0.35,
                         -2: 0.4,
                         -1: 0.45,
                         0: 0.5,
                         1: 0.55,
                         2: 0.6,
                         3: 0.65,
                         4: 0.7,
                         5: 0.75,
                         6: 0.8,
                         7: 0.8500000000000001,
                         8: 0.9,
                         9: 0.95,
                         10: 1.0}

In [21]:
# Labels ordered from lowest to highest score, padded with two extra 'C' at the
# bottom and two extra 'AAA' at the top so there is one label per scaled value.
labels_ascending = list(new_target_values)[::-1]
new_values = ['C'] * 2 + labels_ascending + ['AAA'] * 2

In [22]:
# Re-key the lookup table: scaled target value -> rating label, pairing the
# scaled values with `new_values` by position.
# The name `scaled_target_values` is rebound from {score: scaled value} to
# {scaled value: label}. The guard makes the cell safe to re-run: without it a
# second run would zip labels with labels and silently corrupt the table.
if not any(isinstance(v, str) for v in scaled_target_values.values()):
    if len(scaled_target_values) != len(new_values):
        # zip() would otherwise silently drop the unmatched tail.
        raise ValueError('scaled_target_values and new_values differ in length')
    scaled_target_values = dict(zip(scaled_target_values.values(), new_values))
scaled_target_values

{0.0: 'C',
 0.04999999999999999: 'C',
 0.09999999999999998: 'C',
 0.14999999999999997: 'B-',
 0.19999999999999996: 'B',
 0.25: 'B+',
 0.3: 'BB-',
 0.35: 'BB',
 0.4: 'BB+',
 0.45: 'BBB-',
 0.5: 'BBB',
 0.55: 'BBB+',
 0.6: 'A-',
 0.65: 'A',
 0.7: 'A+',
 0.75: 'AA-',
 0.8: 'AA',
 0.8500000000000001: 'AA+',
 0.9: 'AAA',
 0.95: 'AAA',
 1.0: 'AAA'}

#### Return classes

In [24]:
def return_rating_levels(value, error, mapping=None):
    """Convert a predicted scaled target into a rating label.

    Parameters
    ----------
    value : float
        Predicted scaled target.
    error : float
        Tolerance; a key ``k`` matches when ``abs(value - k) < error``.
    mapping : dict, optional
        ``{scaled value: label}`` lookup table. Defaults to the notebook-level
        ``scaled_target_values``.

    Returns
    -------
    The label of the first key (in dict order) that lies within ``error`` of
    ``value``. When no key is that close -- the prediction falls outside the
    covered range or exactly on a bin edge -- the label of the nearest key is
    returned instead of ``None``, so downstream string handling and metrics
    never receive ``None``. ``None`` is returned only for an empty mapping.
    """
    if mapping is None:
        mapping = scaled_target_values

    # Original behaviour: first key within the tolerance wins.
    for k, v in mapping.items():
        if abs(value - k) < error:
            return v

    if not mapping:
        return None

    # Fallback: clamp to the closest known scaled value.
    nearest = min(mapping, key=lambda k: abs(value - k))
    return mapping[nearest]

In [25]:
def return_category_classes(value):
    """Reduce a rating level to its category by stripping '+'/'-' from both ends.

    For example ``'AA+'`` becomes ``'AA'`` and ``'BBB-'`` becomes ``'BBB'``;
    labels without a modifier are returned unchanged.
    """
    modifiers = '+-'
    category = value.strip(modifiers)
    return category

In [26]:
# Map every regression output to a rating level, then collapse each level to
# its category. The tolerance is half of the 0.05 spacing between lookup keys.
half_step = 0.025
predicted_rating_levels = [return_rating_levels(score, half_step) for score in pred]
predicted_category_classes = list(map(return_category_classes, predicted_rating_levels))

#### Classification metrics

In [27]:
from sklearn.metrics import f1_score

In [28]:
# Micro-averaged F1 at both granularities: full rating level and coarse category.
level_f1 = f1_score(y_test['Уровень рейтинга'], predicted_rating_levels, average='micro')
print(f"f1-micro level: {level_f1}")
category_f1 = f1_score(y_test['Категория'], predicted_category_classes, average='micro')
print(f"f1-micro category: {category_f1}")

f1-micro level: 0.6712962962962963
f1-micro category: 0.8814814814814815


In [29]:
 # Persist the fitted regression model so it can be reloaded without retraining.
 model_path = 'models/LinReg.pkl'
 # A fresh checkout may not have the output directory yet; create it on demand.
 os.makedirs(os.path.dirname(model_path), exist_ok=True)
 with open(model_path, 'wb') as f:
     pickle.dump(model, f)
 # Report success only after the file has been flushed and closed.
 print('saved')

saved
