### Предобработка данных

**Импорт зависимостей**

In [1]:
import optuna
import pandas as pd
import numpy as np

from optuna.samplers import TPESampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
import pickle

**Как мы получаем `elon_musk_tweets_after_eda.csv`**

TODO.

**Что мы делаем с `elon_musk_tweets_after_eda.csv`**

In [2]:
def transform_date(df):
    """Parse the ``date`` column to datetimes and order rows chronologically.

    The input frame is left untouched: the conversion is applied to a new
    frame, so calling this function twice on the same raw frame gives the
    same result and the caller's data is never half-modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``date`` converted and rows sorted ascending by it.
    """
    # assign() builds a new frame instead of writing the parsed dates back
    # into the caller's object (the previous version did the latter).
    return df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

In [3]:
def df_fillna(df):
    """Replace missing values in the free-text user columns with ``'None'``.

    Only ``user_location``, ``user_description`` and ``emojis`` are filled;
    every other column keeps its missing values. A new frame is returned and
    the input is not modified. Columns from that list which are absent from
    ``df`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns filled.
    """
    cols_to_fill = ['user_location', 'user_description', 'emojis']
    # A per-column mapping lets fillna() return a fresh frame; the earlier
    # `.loc[:, cols] = ...` assignment wrote strings into the existing columns
    # in place, which mutates the caller's frame and is a dtype-incompatible
    # write when a column is entirely NaN (float).
    return df.fillna({col: 'None' for col in cols_to_fill})

In [4]:
def drop_useless_cols(df):
    """Return ``df`` without the columns that are not used downstream.

    Raises ``KeyError`` (from pandas) if any of the listed columns is missing.
    """
    unused = (
        'lemmatized_pronouns', 'tweet', 'cleaned_tweet',
        'tokens', 'cleaned_tokens',
        'word_count', 'char_count',
        'emojis',
        'word_count_cleaned',
        'spacy_lemmatized', 'pos_tags', 'lemmatized',
        'email_count', 'hashtag_count',
        'month', 'year', 'hour',
    )
    return df.drop(list(unused), axis=1)

Читаем данные и делаем базовые преобразования:

In [6]:
# Load the post-EDA export and apply the base transformations in one pass.
# Chaining from read_csv keeps this cell idempotent: with separate
# `df = f(df)` cells, re-running the column-dropping step raised KeyError
# because the columns were already gone from `df`.
df = (
    pd.read_csv('lisa_files/elon_musk_tweets_after_eda.csv', encoding='utf-8')
    .pipe(transform_date)      # parse `date`, sort chronologically
    .pipe(df_fillna)           # fill missing user text fields with 'None'
    .pipe(drop_useless_cols)   # remove columns not used downstream
)

In [10]:
# Reference only (not executed): two alternative ways to one-hot encode the
# user columns. NOTE(review): the OneHotEncoder variant is presumably how
# `encoder.pkl` was fitted — confirm.
# df_encoded = pd.get_dummies(df, columns=['user_location', 'user_description'], drop_first=False)

# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# encoder.fit(df.loc[:, ['user_location', 'user_description']])

Кодируем категориальные признаки:

In [28]:
def encode_categoric(df, encoder_path='encoder.pkl'):
    """Encode the categorical user columns with a pre-fitted, pickled encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_location`` and ``user_description``.
    encoder_path : str, default 'encoder.pkl'
        Path of the pickle file holding the fitted encoder. The object must
        expose ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Only the encoded columns, indexed like ``df`` and named by
        ``encoder.get_feature_names_out()``.
    """
    # The encoder was fitted on exactly these two columns.
    categorical_cols = ['user_location', 'user_description']

    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(encoder_path, 'rb') as f:
        encoder = pickle.load(f)

    encoded = encoder.transform(df.loc[:, categorical_cols])
    # If the encoder was built with sparse output, transform() returns a
    # sparse matrix; densify it so the DataFrame constructor gets a 2-D array.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    return pd.DataFrame(
        data=encoded,
        index=df.index,
        columns=encoder.get_feature_names_out(),
    )

In [29]:
# Swap the two raw categorical columns for their encoded counterparts:
# drop them from `df`, then append the encoded frame column-wise.
df_encoded = pd.concat(
    [
        df.drop(['user_location', 'user_description'], axis=1),
        encode_categoric(df),
    ],
    axis=1,
)

Разбиваем на `train/val/test`:

In [31]:
# Chronological split by date quantiles: rows up to the 0.6 quantile go to
# train, rows between the 0.6 and 0.8 quantiles to validation, the rest to test.
train_cutoff = df_encoded['date'].quantile(0.6)  # date below which 60% of observations fall
val_cutoff = df_encoded['date'].quantile(0.8)

# Build the three frames. Every mask is derived from df_encoded itself (the
# frame being filtered); the validation mask previously mixed in `df['date']`,
# which only worked as long as `df` still shared df_encoded's index.
train_df = df_encoded[df_encoded['date'] <= train_cutoff]
val_df = df_encoded[(df_encoded['date'] > train_cutoff) & (df_encoded['date'] <= val_cutoff)]
test_df = df_encoded[df_encoded['date'] > val_cutoff]

In [32]:
# Separate the target (`feeling`) from the features for each partition;
# `date` was only needed for the chronological split and is dropped as well.
X_train, y_train = train_df.drop(['feeling', 'date'], axis=1), train_df['feeling']
X_val, y_val = val_df.drop(['feeling', 'date'], axis=1), val_df['feeling']
X_test, y_test = test_df.drop(['feeling', 'date'], axis=1), test_df['feeling']

Векторизуем твиты:

In [36]:
# Reference only (not executed). NOTE(review): presumably how
# `vectorizer.pkl` was fitted, on the training texts only — confirm.
# vectorizer = CountVectorizer()
# vectorizer.fit(X_train['lemmatized_str'])

In [37]:
# Load the pre-fitted text vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('vectorizer.pkl', mode='rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [38]:
# Apply the already-fitted vectorizer to the `lemmatized_str` column of each
# partition (transform only — nothing is refitted here).
X_train_lemmatized, X_val_lemmatized, X_test_lemmatized = (
    vectorizer.transform(part['lemmatized_str'])
    for part in (X_train, X_val, X_test)
)

Приводим все числовые фичи к единой шкале:

In [46]:
# Names of all numeric-dtype columns in the training features.
numerical_features = (
    X_train
    .select_dtypes(include='number')
    .columns
    .tolist()
)

In [47]:
# Reference only (not executed). NOTE(review): presumably how `scaler.pkl`
# was fitted, on the training numeric features only — confirm.
# scaler = StandardScaler()
# scaler.fit(X_train[numerical_features])

In [48]:
# Load the pre-fitted scaler from disk.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('scaler.pkl', mode='rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [49]:
# Scale the numeric features of every partition with the already-fitted scaler.
X_train_numerical, X_val_numerical, X_test_numerical = (
    scaler.transform(part[numerical_features])
    for part in (X_train, X_val, X_test)
)

In [50]:
# Column-wise concatenation of the vectorized text block and the scaled
# numeric block for each partition (text columns first).
X_train_combined, X_val_combined, X_test_combined = (
    hstack([text_block, numeric_block])
    for text_block, numeric_block in (
        (X_train_lemmatized, X_train_numerical),
        (X_val_lemmatized, X_val_numerical),
        (X_test_lemmatized, X_test_numerical),
    )
)

**Сериализация для тестирования**

In [17]:
import pickle

In [18]:
# Persist the combined training feature matrix for use in tests.
with open('X.pkl', mode='wb') as features_file:
    pickle.dump(X_train_combined, features_file)

In [19]:
# Persist the training target alongside the feature matrix.
with open('y.pkl', mode='wb') as target_file:
    pickle.dump(y_train, target_file)

In [20]:
# Load the pickled `dt` model from disk.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('dt.pkl', mode='rb') as model_file:
    dt = pickle.load(model_file)

In [21]:
# Smoke test: run the unpickled `dt` object on the combined training matrix
# and display the predictions.
# NOTE(review): `dt` is presumably a classifier fitted elsewhere on the same
# feature layout — confirm.
dt.predict(X_train_combined)

array([0, 0, 1, ..., 1, 0, 1])