### Предобработка данных

**Импорт зависимостей**

In [1]:
import optuna
import pandas as pd
import numpy as np

from optuna.samplers import TPESampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

from sklearn.preprocessing import OneHotEncoder
from html import unescape
import pickle
import re

  from .autonotebook import tqdm as notebook_tqdm


**Как мы получаем `elon_musk_tweets_after_eda.csv`**

Открываем наш размеченный датасет:

In [3]:
df = pd.read_parquet('elon_musk_tweets_labeled.parquet').rename(columns={'feeling_auto': 'feeling'})

Все необходимые преобразования:

In [6]:
df = df.drop(columns=['id', 'user_created'])

In [7]:
df['date'] = pd.to_datetime(df['date'])

In [8]:
df = df.sort_values(by='date')

In [9]:
df = df.drop(['hashtags', 'user_name', 'is_retweet'], axis=1)

In [10]:
df = df.drop(columns=['source'])

In [11]:
df['hour'] = df['date'].dt.hour

In [12]:
# Number of tweets posted in the same hour-of-day as each row.
# value_counts() is indexed by hour value, not by row label, so assigning it
# directly would align on the frame's index and produce NaN / unrelated counts.
# Mapping every row's own hour to its count gives the intended per-row feature.
df['tweets_per_hour'] = df['hour'].map(df['hour'].value_counts())

In [13]:
df['date'] = df['date'].dt.tz_localize(None)

In [14]:
df['month_year'] = df['date'].dt.to_period('M')

In [15]:
df = df.drop(columns=['hour', 'tweets_per_hour', 'month_year'])

In [16]:
df = df.drop(columns=['user_verified'])

In [17]:
df.rename(columns={'text': 'tweet'}, inplace=True)

In [18]:
df['tweet'] = df['tweet'].apply(unescape)

In [19]:
df['tweet'] = df['tweet'].str.replace(r'#\b(1|3)\b', r'\1', regex=True)

In [20]:
# Extract emoji runs from a text
def extract_emojis(text):
    """Return all maximal runs of consecutive characters from the ranges below.

    Args:
        text: string to scan.

    Returns:
        list[str]: one entry per run of adjacent matching characters, in
        order of appearance; an empty list when nothing matches.
    """
    # NOTE(review): the last range spans U+24C2..U+1F251, which is far wider
    # than the others, so many non-emoji characters in between also match.
    # Confirm that this is intended.
    unicode_ranges = (
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F700-\U0001F77F",
        "\U0001F780-\U0001F7FF",
        "\U0001F800-\U0001F8FF",
        "\U0001F900-\U0001F9FF",
        "\U0001FA00-\U0001FAFF",
        "\U00002700-\U000027BF",
        "\U000024C2-\U0001F251",
    )
    run_matcher = re.compile("[" + "".join(unicode_ranges) + "]+")
    return run_matcher.findall(text)


In [21]:
df['emojis'] = df['tweet'].apply(lambda x: ''.join(extract_emojis(x)))

In [22]:
# Helpers that count occurrences of various elements in a tweet
def count_emails(tweet):
    """Return the number of non-overlapping e-mail-like matches in ``tweet``."""
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return sum(1 for _ in re.finditer(email_regex, tweet))

def count_nicknames(tweet):
    """Return how many ``@name`` tokens (letters, digits, underscore) occur in ``tweet``.

    The pattern also matches the ``@domain`` part of an e-mail address, so
    such addresses contribute to the count as well.
    """
    nickname_regex = r'@([A-Za-z0-9_]{1,})'
    return sum(1 for _ in re.finditer(nickname_regex, tweet))

def count_urls(tweet):
    """Return the number of non-overlapping URL matches (http(s):// or www. forms) in ``tweet``."""
    url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    return sum(1 for _ in re.finditer(url_regex, tweet))

def count_hashtags(tweet):
    """Return how many ``#`` signs immediately followed by word characters occur in ``tweet``."""
    hashtag_regex = r'#\w+'
    return sum(1 for _ in re.finditer(hashtag_regex, tweet))

def count_emojis(tweet):
    """Return the number of individual characters of ``tweet`` that fall in the listed ranges.

    Each character is counted separately. The character class here lists
    fewer ranges than the one used by ``extract_emojis``.
    """
    emoji_char_regex = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U00002700-\U000027BF]'
    return sum(1 for _ in re.finditer(emoji_char_regex, tweet))

In [23]:
# One numeric feature column per counter, created in the same order as before.
tweet_counters = {
    'email_count': count_emails,
    'nickname_count': count_nicknames,
    'url_count': count_urls,
    'hashtag_count': count_hashtags,
    'emoji_count': count_emojis,
}
for feature_name, counter in tweet_counters.items():
    df[feature_name] = df['tweet'].apply(counter)

In [24]:
# Strip e-mails, then @nicknames, then URLs from the tweet text.
# E-mails go first because the nickname pattern would otherwise also match
# the '@...' part of an address and leave the rest behind.
removal_patterns = (
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',  # e-mails
    r'@([A-Za-z0-9_]{1,})',  # nicknames
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',  # urls
)
for removal_pattern in removal_patterns:
    df['tweet'] = df['tweet'].str.replace(removal_pattern, '', regex=True)

In [25]:
df['tweet'] = df['tweet'].str.lower()

In [26]:
df['cleaned_tweet'] = df['tweet'].str.replace(r'[^a-zA-Z\s]', '', regex=True) #Удаляем знаки препинания и цифры и оставляем только буквы и пробелы

In [27]:
df['cleaned_tweet'] = df['cleaned_tweet'].str.replace(r'\s+', ' ', regex=True).str.strip()  #Удаляем лишние пробелы

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [29]:
#nltk.download('punkt_tab')
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [30]:
df['tokens'] = df['cleaned_tweet'].apply(word_tokenize)

In [31]:
def delete_stopwords(tokens):
    """Return the items of ``tokens`` that are not in the module-level ``stop_words`` set.

    The original order is kept and a new list is returned.
    """
    return [token for token in tokens if token not in stop_words]

In [32]:
df['cleaned_tokens'] = df['tokens'].apply(delete_stopwords)

In [33]:
# nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [34]:
def lemmatize_tokens(tokens):
    """Return a new list with ``lemmatizer.lemmatize`` applied to every item of ``tokens``.

    Uses the module-level ``lemmatizer`` instance; order is preserved.
    """
    return [lemmatizer.lemmatize(token) for token in tokens]

In [35]:
df['lemmatized'] = df['cleaned_tokens'].apply(lemmatize_tokens)

In [41]:
# !python -m spacy download en_core_web_sm
import spacy

In [42]:
%%time

# Load the small English spaCy pipeline, keeping only tok2vec, tagger and
# attribute_ruler. The lemmatizer is among the disabled components, so despite
# the 'spacy_lemmatized' column name no spaCy lemmatization happens here.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'senter', 'ner', 'lemmatizer'],
    enable=['tok2vec', 'tagger', 'attribute_ruler']
)
# Join each row's token list into one space-separated string and run it
# through the pipeline; the column stores whatever nlp(...) returns per row.
df['spacy_lemmatized'] = df.lemmatized.str.join(' ').apply(lambda x : nlp(x))

CPU times: user 4.36 s, sys: 84.5 ms, total: 4.44 s
Wall time: 4.52 s


In [43]:
%%time

def spacy_lemmatize_tokens(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and tag its tokens.

    Returns:
        list[tuple]: one ``(token.text, token.pos_)`` pair per token of the
        processed document, in document order.
    """
    return [(token.text, token.pos_) for token in nlp(text)]

df['lemmatized_pronouns'] = df['spacy_lemmatized'].apply(spacy_lemmatize_tokens)

CPU times: user 4.04 s, sys: 141 ms, total: 4.18 s
Wall time: 4.26 s


In [45]:
df['pos_tags'] = df['lemmatized_pronouns']

In [52]:
# Per-tweet occurrence counts of selected punctuation characters, stored in
# columns named '<char>_count' (same names and creation order as before).
# re.escape builds a literal pattern for every character; this replaces the
# hand-written '\.', '\!', '\,', '\/' patterns, which were invalid escape
# sequences in non-raw string literals.
punctuation_marks = ('.', '!', '@', "'", ',', '/', '?', ';', '-', ')', '#', '(')
for punctuation_mark in punctuation_marks:
    df[f'{punctuation_mark}_count'] = df['tweet'].str.count(re.escape(punctuation_mark))

In [54]:
df['char_count'] = df['tweet'].str.len()

In [56]:
df['word_count'] = df['tweet'].str.split().str.len() #Количество слов в твитах

In [57]:
df['word_count_cleaned'] = df['cleaned_tweet'].str.split().str.len() #Количество слов в твитах

In [58]:
df['char_count_cleaned'] = df['cleaned_tweet'].str.len() #Количество символов в твитах

In [60]:
# Remove, in place, every row whose 'lemmatized' value has length 0.
has_no_lemmas = df['lemmatized'].map(len) == 0
df.drop(index=df.index[has_no_lemmas.to_numpy()], inplace=True)

In [61]:
df['lemmatized_str'] = df['lemmatized'].apply(lambda x: ' '.join(x))

In [62]:
# Split informal phrases mapped to their joined spelling.
contraction_patterns = {
    "gon na": "gonna",
    "wan na": "wanna",
    "do n't": "don't",
    "ai n't": "ain't",
    "got ta": "gotta"
}

# One whole-word, case-insensitive alternation built from the keys above.
pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contraction_patterns) + r')\b', re.IGNORECASE)

def replace_contractions(text):
    """Replace every split informal phrase in ``text`` with its joined form.

    Matching is case-insensitive and the replacement is always the lower-case
    value from ``contraction_patterns``. The former case-sensitive
    ``key in text`` pre-check was removed: it skipped inputs such as
    "GON NA" that the case-insensitive regex is meant to handle, and
    ``pattern.sub`` already returns the text unchanged when nothing matches.

    Args:
        text: string to normalise.

    Returns:
        str: ``text`` with all matched phrases replaced.
    """
    return pattern.sub(lambda match: contraction_patterns[match.group(0).lower()], text)

In [63]:
df['lemmatized_str'] = df['lemmatized_str'].apply(replace_contractions)

In [64]:
df['hour'] = df['date'].dt.hour

In [65]:
df['month'] = df['date'].dt.to_period('M')

In [66]:
df['month'] = df['month'].astype(str)

In [69]:
df[['year', 'month']] = df['month'].str.split('-', expand=True)

In [70]:
df['year'] = df['year'].astype(int)
df['month'] = df['month'].astype(int)

In [77]:
df['favourites'] = df['favorites']

Выгрузка:

In [78]:
needed_cols = ['user_location',
 'user_description',
 'user_friends',
 'date',
 'tweet',
 'favourites',
 'feeling',
 'month',
 'word_count',
 'char_count',
 '._count',
 '!_count',
 '@_count',
 "'_count",
 ',_count',
 '/_count',
 '?_count',
 ';_count',
 '-_count',
 ')_count',
 '#_count',
 '(_count',
 'emojis',
 'email_count',
 'nickname_count',
 'url_count',
 'hashtag_count',
 'emoji_count',
 'cleaned_tweet',
 'word_count_cleaned',
 'char_count_cleaned',
 'tokens',
 'cleaned_tokens',
 'lemmatized',
 'spacy_lemmatized',
 'lemmatized_pronouns',
 'pos_tags',
 'lemmatized_str',
 'hour',
 'year']

In [80]:
df.loc[:, needed_cols].to_csv('elon_musk_tweets_after_eda.csv')

**Что мы делаем с `elon_musk_tweets_after_eda.csv`**

In [83]:
def transform_date(df):
    """Return a copy of ``df`` with 'date' parsed to datetime, sorted by 'date'.

    The input frame is not modified. Previously the 'date' column of the
    caller's frame was overwritten in place as a side effect, even though a
    new, sorted frame was returned.

    Args:
        df: DataFrame containing a 'date' column parseable by ``pd.to_datetime``.

    Returns:
        pandas.DataFrame: new frame ordered by ascending 'date'.
    """
    return df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date')

In [84]:
def df_fillna(df):
    """Return a copy of ``df`` with missing text values replaced by the string 'None'.

    Only 'user_location', 'user_description' and 'emojis' are filled. The
    input frame is left untouched; previously it was modified in place.

    Args:
        df: DataFrame containing the three columns listed above.

    Returns:
        pandas.DataFrame: new frame with the three columns filled.
    """
    cols_to_fill = ['user_location', 'user_description', 'emojis']
    filled = df.copy()
    filled[cols_to_fill] = filled[cols_to_fill].fillna('None')
    return filled

In [85]:
def drop_useless_cols(df):
    """Return ``df`` without the intermediate columns listed below.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    obsolete = (
        'lemmatized_pronouns', 'tweet', 'cleaned_tweet', 'tokens', 'cleaned_tokens',
        'word_count', 'char_count', 'emojis', 'word_count_cleaned',
        'spacy_lemmatized', 'pos_tags', 'lemmatized',
        'email_count', 'hashtag_count',
        'month', 'year', 'hour',
    )
    return df.drop(list(obsolete), axis=1)

Читаем данные и делаем базовые преобразования:

In [111]:
df = pd.read_csv('elon_musk_tweets_after_eda.csv', encoding='utf-8').drop(columns='Unnamed: 0')

In [112]:
df = transform_date(df)

In [113]:
df = df_fillna(df)

In [114]:
df = drop_useless_cols(df)

In [115]:
# df_encoded = pd.get_dummies(df, columns=['user_location', 'user_description'], drop_first=False)

# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# encoder.fit(df.loc[:, ['user_location', 'user_description']])

Кодируем категориальные признаки:

In [116]:
def encode_categoric(df):
    """Encode 'user_location' and 'user_description' with the encoder stored in 'encoder.pkl'.

    Args:
        df: DataFrame containing both categorical columns.

    Returns:
        pandas.DataFrame: the encoder's output, indexed like ``df`` and with
        columns named by ``encoder.get_feature_names_out()``.
    """
    # The stored encoder was fitted on exactly these two columns.
    categorical_cols = ['user_location', 'user_description']

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load an encoder.pkl that comes from a trusted source.
    with open('encoder.pkl', 'rb') as encoder_file:
        fitted_encoder = pickle.load(encoder_file)

    encoded_values = fitted_encoder.transform(df.loc[:, categorical_cols])
    return pd.DataFrame(
        data=encoded_values,
        index=df.index,
        columns=fitted_encoder.get_feature_names_out()
    )

In [117]:
df_encoded = pd.concat([df.drop(columns=['user_location', 'user_description']), encode_categoric(df)], axis=1)

Разбиваем на `train/val/test`:

In [118]:
# Chronological split boundaries: the 0.6 and 0.8 quantiles of the 'date' column.
train_cutoff = df_encoded['date'].quantile(0.6)
val_cutoff = df_encoded['date'].quantile(0.8)

# Build the three partitions. Every mask is computed from df_encoded itself;
# the validation mask previously mixed in df['date'], which only worked as
# long as df and df_encoded happened to share the same index.
train_df = df_encoded[df_encoded['date'] <= train_cutoff]
val_df = df_encoded[(df_encoded['date'] > train_cutoff) & (df_encoded['date'] <= val_cutoff)]
test_df = df_encoded[df_encoded['date'] > val_cutoff]

In [119]:
# Separate the 'feeling' target from the features for each partition.
# 'date' is dropped from the feature frames as well.
X_train, y_train = train_df.drop(columns=['feeling', 'date']), train_df['feeling']
X_val, y_val = val_df.drop(columns=['feeling', 'date']), val_df['feeling']
X_test, y_test = test_df.drop(columns=['feeling', 'date']), test_df['feeling']

Векторизуем твиты:

In [120]:
# vectorizer = CountVectorizer()
# vectorizer.fit(X_train['lemmatized_str'])

In [121]:
# Load a previously saved vectorizer from disk instead of fitting a new one.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a vectorizer.pkl that comes from a trusted source.
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [122]:
X_train_lemmatized = vectorizer.transform(X_train['lemmatized_str'])
X_val_lemmatized = vectorizer.transform(X_val['lemmatized_str'])
X_test_lemmatized = vectorizer.transform(X_test['lemmatized_str'])

Приводим все числовые фичи к единой шкале:

In [123]:
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

In [124]:
# scaler = StandardScaler()
# scaler.fit(X_train[numerical_features])

In [125]:
# Load a previously saved scaler from disk instead of fitting a new one.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a scaler.pkl that comes from a trusted source.
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [126]:
# Масштабируем числовые признаки
X_train_numerical = scaler.transform(X_train[numerical_features])
X_val_numerical = scaler.transform(X_val[numerical_features])
X_test_numerical = scaler.transform(X_test[numerical_features])

In [127]:
X_train_combined = hstack([X_train_lemmatized, X_train_numerical])
X_val_combined = hstack([X_val_lemmatized, X_val_numerical])
X_test_combined = hstack([X_test_lemmatized, X_test_numerical])