In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import KFold
from wordcloud import WordCloud
import joblib
import nltk
from nltk.corpus import stopwords
import spacy
import re
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from catboost import CatBoostRegressor

In [4]:
# Read the three dataset splits from disk in one pass.
train, validation, test = map(
    pd.read_csv,
    ('data/train.csv', 'data/validation.csv', 'data/test.csv'),
)

In [4]:
# Show the (rows, columns) shape of each split.
train.shape, validation.shape, test.shape

In [3]:
# Discard training rows whose `comment_text` is missing, then preview the frame.
train = train.dropna(subset=['comment_text'])
train.head()

In [75]:
# Bar chart: number of validation comments per language.
lang_counts = validation.groupby('lang').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(lang_counts['lang'], lang_counts['count'], width=0.4, color='magenta')
ax.set_xlabel("Languages")
ax.set_ylabel("Number of comments")
ax.set_title("Comments on different language counts")
plt.show()

In [76]:
# Bar chart: number of test comments per language.
lang_counts = test.groupby('lang').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(lang_counts['lang'], lang_counts['count'], width=0.4, color='magenta')
ax.set_xlabel("Languages")
ax.set_ylabel("Number of comments")
ax.set_title("Comments on different language counts")
plt.show()

# Plain-text preview of the training frame.
print(train.head())

### Exploring train dataset

In [4]:
# Load spaCy's small English pipeline; preprocess_text uses it to tokenise and lemmatise.
nlp = spacy.load("en_core_web_sm")
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')
# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Normalise one comment into a space-joined string of lemmas.

    The text is lower-cased, every character outside the allowed set
    (ASCII letters, whitespace, U+00C0-U+017F, U+4E00-U+9FFF) is removed,
    and the remainder is run through the module-level spaCy pipeline `nlp`.
    Punctuation tokens, whitespace tokens and tokens whose lemma is in the
    module-level `stop_words` set are dropped.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^A-Za-z\s\u00C0-\u017F\u4e00-\u9fff]', '', lowered)

    kept_lemmas = []
    for token in nlp(letters_only):
        if token.is_punct or token.is_space:
            continue
        lemma = token.lemma_
        if lemma in stop_words:
            continue
        kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)
# Clean every comment (coerced to str first) and store it next to the raw text.
train['cleaned_comment_text'] = [
    preprocess_text(str(raw_comment)) for raw_comment in train['comment_text']
]
train[['comment_text', 'cleaned_comment_text']].head()


In [5]:
# Build a token -> index vocabulary from the cleaned comments.
# * str.split() with no argument never yields '' (split(' ') does for empty
#   or double-spaced strings), so no empty token enters the vocabulary.
# * Sorting the unique tokens makes the indices reproducible across kernel
#   restarts; plain set iteration order depends on string hash randomisation.
all_tokens = []
for comment in train['cleaned_comment_text']:
    all_tokens.extend(comment.split())
unique_tokens = set(all_tokens)
token_dict = {token: index for index, token in enumerate(sorted(unique_tokens))}
# Peek at the first ten entries.
print(dict(list(token_dict.items())[:10]))

In [84]:
# NOTE(review): this overwrites the input file data/train.csv with the frame that
# now also carries `cleaned_comment_text`; a later cell re-reads this path and
# selects that column, so the overwrite is load-bearing. Keep a copy of the raw file.
train.to_csv('data/train.csv', index=False)

In [90]:
# Pairwise correlations between the toxicity label columns, printed and drawn as a heatmap.
columns_of_interest = ['toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
correlation_matrix = train[columns_of_interest].corr()
print(correlation_matrix)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix of Toxicity Parameters')
plt.show()

### TF-IDF + XGBRegressor

In [6]:

# TF-IDF over the cleaned comments using the fixed token -> index mapping.
# scikit-learn ignores `max_features` and `min_df` whenever an explicit
# `vocabulary` is supplied, so they are not passed: the feature set is exactly
# the keys of `token_dict`.
vectorizer = TfidfVectorizer(vocabulary=token_dict)
# Set the number of components; the seed keeps the randomized SVD reproducible.
svd = TruncatedSVD(n_components=500, random_state=42)

tfidf_matrix = vectorizer.fit_transform(train['cleaned_comment_text'])
# Sparse DataFrame view of the TF-IDF matrix with one column per vocabulary token.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out())
# Dense 500-column representation used as model input.
tfidf_reduced = svd.fit_transform(tfidf_matrix)

In [7]:

# Features: SVD-reduced TF-IDF vectors. Target: the `toxic` column.
X_array, y_array = tfidf_reduced, train['toxic'].values

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_array,
    y_array,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Fit an XGBoost regressor (hist tree method on the CUDA device) on the training split.
model = xgb.XGBRegressor(
    n_estimators=70,
    max_depth=25,
    tree_method='hist',
    device='cuda',
    random_state=42,
)
model.fit(X_train, y_train)

# Hold-out predictions and their metrics.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# 5-fold cross-validation over the full feature matrix; the scorer returns
# negated MSE, so the sign is flipped back.
scores = cross_val_score(model, X_array, y_array, scoring='neg_mean_squared_error', cv=5)
mean_mse_cv = -scores.mean()

print(f'Mean Squared Error (Test Set): {mse}')
print(f'R² Score (Test Set): {r2}')
print(f'Mean Squared Error from Cross-Validation: {mean_mse_cv}')


In [12]:


# Persist the current `model` object to xgb_model.pkl.
joblib.dump(model, 'xgb_model.pkl')

In [120]:


# Cross-validate a larger configuration (more trees, deeper) under its own name.
# cross_val_score fits clones only, so rebinding `model` here would replace the
# fitted regressor with an unfitted one and break later uses such as
# `model.feature_importances_`.
deep_model = xgb.XGBRegressor(n_estimators=100, max_depth=50,
                              tree_method='hist', device='cuda',
                              random_state=42)
scores = cross_val_score(deep_model, X_array, y_array, cv=5, scoring='neg_mean_squared_error')

# The scorer returns negated MSE; flip the sign back.
mean_mse = -scores.mean()
print(f'Mean Squared Error from Cross-Validation: {mean_mse}')

Основные метрики для регрессии:

Mean Squared Error (MSE): среднеквадратичная ошибка — среднее значение квадратов разностей между предсказанными и фактическими значениями. Чем она меньше, тем ближе предсказания к фактическим значениям.

R² (коэффициент детерминации): Показывает, какая доля дисперсии в данных объясняется моделью. Чем ближе к 1, тем лучше модель объясняет зависимость данных.

In [13]:
# Hold-out metrics for the predictions in `y_pred`, plus 5-fold CV of `model`.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
scores = cross_val_score(model, X_array, y_array, cv=5, scoring='neg_mean_squared_error')
mean_mse_cv = -scores.mean()  # scorer returns negated MSE

# Print the metrics
print(f'Mean Squared Error (Test Set): {mse}')
print(f'R² Score (Test Set): {r2}')
print(f'Mean Squared Error from Cross-Validation: {mean_mse_cv}')

# Plotting true vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Identity line (perfect predictions). The colour is specified once: combining a
# 'k--' format string with color='red' makes matplotlib warn about a redundant colour.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', lw=2, color='red')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values (XGBoost)')
plt.grid(True)
plt.show()

# Residuals plot
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='blue', bins=30)
plt.axvline(x=0, color='red', linestyle='--')  # zero-error reference
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals (XGBoost)')
plt.grid(True)
plt.show()

# Feature importance plot (XGBoost); requires `model` to be fitted.
plt.figure(figsize=(10, 6))
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # most important first
plt.bar(range(len(importances)), importances[indices], align='center')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance (XGBoost)')
plt.show()

### Doc2Vec и CatBoost

In [None]:

# Tag each cleaned comment with its row position (as a string) for Doc2Vec training.
train_tagged = [TaggedDocument(words=row.split(), tags=[str(i)]) for i, row in enumerate(train['cleaned_comment_text'])]

# Train the Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4, epochs=40)
doc2vec_model.build_vocab(train_tagged)
doc2vec_model.train(train_tagged, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Convert the texts to vectors
X_doc2vec = [doc2vec_model.infer_vector(row.split()) for row in train['cleaned_comment_text']]
X_doc2vec = pd.DataFrame(X_doc2vec)

# Train/test split for CatBoost
X_train, X_test, y_train, y_test = train_test_split(X_doc2vec, train['toxic'], test_size=0.2, random_state=42)

# Define the CatBoost model on GPU. `devices` only selects which GPU id to use;
# training runs on the GPU only when task_type='GPU' is set (the default is CPU).
catboost_model = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.1,
                                    loss_function='RMSE', task_type='GPU', devices='0', verbose=100)

# Fit the model
catboost_model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = catboost_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error with CatBoost: {mse}')

In [17]:
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Load the persisted pipeline. joblib.load unpickles arbitrary objects, so only
# load files from a trusted source.
pipeline = joblib.load('catboost_pipeline_with_doc2vec.pkl')
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# NOTE(review): cross_val_score clones and refits `pipeline` on folds of the
# *test* split, so this figure does not describe the loaded model — confirm intent.
scores = cross_val_score(pipeline, X_test, y_test, cv=5, scoring='neg_mean_squared_error')
mean_mse_cv = -scores.mean()  # scorer returns negated MSE

print(f'Mean Squared Error (Test Set): {mse}')
print(f'R² Score (Test Set): {r2}')
print(f'Mean Squared Error from Cross-Validation: {mean_mse_cv}')

# Plot true vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Identity line (perfect predictions). The colour is specified once: combining a
# 'k--' format string with color='red' makes matplotlib warn about a redundant colour.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle='--', lw=2, color='red')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values (CatBoost)')
plt.grid(True)
plt.show()

# Plot the residuals (errors)
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='blue', bins=30)
plt.axvline(x=0, color='red', linestyle='--')  # zero-error reference
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals (CatBoost)')
plt.grid(True)
plt.show()

# Plot feature importances (CatBoost), read from the pipeline step named 'catboost'
plt.figure(figsize=(10, 6))
importances = pipeline.named_steps['catboost'].feature_importances_
indices = np.argsort(importances)[::-1]  # most important first
plt.bar(range(len(importances)), importances[indices], align='center')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance (CatBoost)')
plt.show()

In [2]:
# Build the language-detection corpus: a 10,000-row sample of cleaned training
# comments labelled 'en', plus the test comments with their own `lang` labels,
# in a shared (content, lang) schema.
train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')
train_sample = train.sample(n=10000, random_state=42)
train_df = (
    train_sample[['cleaned_comment_text']]
    .rename(columns={'cleaned_comment_text': 'content'})
    .assign(lang='en')
)
test_df = test[['content', 'lang']].copy()
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df = combined_df.dropna()
# index=False: otherwise the row index is written as an extra unnamed column
# that comes back as `Unnamed: 0` when the file is re-read.
combined_df.to_csv('data/combined.csv', index=False)

## Lang detection

In [9]:
from classes.Doc2VecTransformer import Doc2VecTransformer
# Reload the combined corpus, embed every comment with the project's Doc2Vec
# wrapper, and encode each language as an integer id (in order of first appearance).
combined_df = pd.read_csv('data/combined.csv')
# NOTE(review): keyword names (e.g. `trainings`) are passed through as written;
# confirm them against the Doc2VecTransformer signature.
doc2vec_model = Doc2VecTransformer(
    device='cpu',
    vector_size=5000,
    trainings=False,
    epochs=20,
    train_df=combined_df,
    model_path='models/doc2vec/doc2vec.model',
)

vectors = doc2vec_model.transform_to_vectors(combined_df['content'])
combined_df["vectors"] = list(vectors)
unique_langs = combined_df['lang'].unique()
lang_to_id = dict(zip(unique_langs, range(len(unique_langs))))
combined_df['lang_id'] = combined_df['lang'].map(lang_to_id)

In [3]:
# Bar chart: number of comments per language in the combined corpus.
lang_counts = combined_df.groupby('lang').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(lang_counts['lang'], lang_counts['count'], width=0.4, color='magenta')
ax.set_xlabel("Languages")
ax.set_ylabel("Number of comments")
ax.set_title("Comments on different language counts")
plt.show()

In [7]:
# Average number of comments per language. `.mean()` must be called: the bare
# attribute would print the bound method instead of the value.
print(lang_counts['count'].mean())

### Logistic regression

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import numpy as np
from classes.Doc2VecTransformer import Doc2VecTransformer
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader, TensorDataset

class LogisticRegressionTorch(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    ``forward`` returns raw class scores (logits); no softmax is applied here.
    """

    def __init__(self, input_size, num_classes):
        """Create the layer mapping ``input_size`` features to ``num_classes`` scores."""
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        """Return the linear layer's output for the batch ``x``."""
        logits = self.linear(x)
        return logits
def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))`` of a torch tensor."""
    denominator = torch.exp(-z) + 1
    return 1 / denominator

# Stack the per-row embeddings into a float32 feature tensor and the language
# ids into an int64 label tensor.
X = torch.tensor(np.array(combined_df['vectors'].tolist()), dtype=torch.float32)
y = torch.tensor(np.array(combined_df['lang_id']), dtype=torch.long)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)  # expected (n_samples, n_features)
print(y_train.shape)

# Mini-batch loaders; they wrap the tensors as they are at this point, i.e.
# before the device transfer below.
batch_size = 32
train_dataset, test_dataset = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Use CUDA when available and move the split tensors there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)
input_size, num_classes = X_train.shape[1], len(unique_langs)

In [11]:
# Train the linear classifier with cross-entropy loss and Adam, then evaluate it.
model = LogisticRegressionTorch(input_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 100
losses = []  # mean training loss per epoch
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch_X, batch_y in train_loader:
        # The loaders yield tensors on whatever device the datasets were built
        # on; move each batch to the model's device to avoid a CPU/CUDA mismatch.
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    average_loss = total_loss / num_batches
    losses.append(average_loss)
    if (epoch) % 10 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Loss: {average_loss:.4f}')

# Training-loss curve.
plt.figure(figsize=(10, 5))
plt.plot(range(num_epochs), losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.grid()
plt.show()

# Evaluation on the held-out loader (no gradients needed).
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    all_predicted = []
    all_true = []

    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        test_outputs = model(batch_X)
        # Predicted class = index of the largest logit.
        _, predicted = torch.max(test_outputs, 1)

        all_predicted.extend(predicted.cpu().numpy())
        all_true.extend(batch_y.cpu().numpy())

        correct += (predicted == batch_y).sum().item()
        total += batch_y.size(0)
    accuracy = correct / total

    print(f'Test Accuracy: {accuracy:.4f}')
predicted_np = np.array(all_predicted)
y_test_np = np.array(all_true)

# Use one explicit label set for both the matrix and the tick labels, so the
# axes stay aligned even if a class is predicted that never occurs in y_test.
class_labels = np.union1d(y_test_np, predicted_np)
conf_matrix = confusion_matrix(y_test_np, predicted_np, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [12]:
import os


def save_model(model, model_path):
    """Save ``model``'s state dict to ``model_path``, creating parent directories.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        model_path: destination file path; may be a bare file name with no
            directory component.

    Raises:
        RuntimeError: if writing the file fails; the underlying exception is
            chained as ``__cause__``.
    """
    # Ensure the directory exists. os.makedirs('') raises FileNotFoundError, so
    # skip it when the path has no directory part.
    dir_name = os.path.dirname(model_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    try:
        # Save the model's state dictionary
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")
    except PermissionError as e:
        raise RuntimeError(f"Permission denied: Unable to save the model to {model_path}.") from e
    except Exception as e:
        raise RuntimeError(f"Failed to save the model: {str(e)}") from e

# Write the current `model`'s state dict under models/lang_detect/.
save_model(model, 'models/lang_detect/logistic_regression_torch.pth')

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from toxic_coms_task.classes.Doc2VecTransformer import Doc2VecTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess data
doc2vec_model = Doc2VecTransformer(model_path='models/doc2vec/doc2vec_transformer.pkl')
combined_df = combined_df.dropna()
# NOTE(review): assumes `transform` yields one fixed-length vector per row — confirm.
vectors = doc2vec_model.transform(combined_df['content'])
combined_df["vectors"] = [row for row in vectors]
unique_langs = combined_df['lang'].unique()
# Language ids start at 1 in this cell.
lang_to_id = {lang: idx + 1 for idx, lang in enumerate(unique_langs)}
combined_df['lang_id'] = combined_df['lang'].map(lang_to_id)
X_train, X_test, y_train, y_test = train_test_split(combined_df['vectors'], combined_df['lang_id'], test_size=0.2, random_state=42)
# The `vectors` column holds one array per row; stack each split into a 2-D
# matrix, which is what SMOTE and scikit-learn estimators expect.
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())

# Oversample minority languages on the training split only, so the test split
# keeps its original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
# Built-in one-vs-rest ROC AUC scorer; replaces make_scorer(..., needs_proba=True),
# whose `needs_proba` argument is deprecated in recent scikit-learn releases.
scoring = 'roc_auc_ovr'

grid = GridSearchCV(estimator=rf, param_grid=param_grid, scoring=scoring, cv=5, verbose=1, n_jobs=-1)
grid.fit(X_resampled, y_resampled)
predictions = grid.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, grid.best_estimator_.predict_proba(X_test), multi_class='ovr')

print('Best Hyperparameters:', grid.best_params_)
print('Accuracy:', accuracy)
print('ROC AUC (one-vs-rest):', roc_auc)

Try to detect the language of the validation data (only es, it, tr)

In [4]:
from sklearn.metrics import classification_report

# Embed the validation comments with the Doc2Vec wrapper (despite its name,
# `new_tfidf_matrix` holds Doc2Vec vectors, not TF-IDF features).
new_tfidf_matrix = doc2vec_model.transform(validation['comment_text'])
# Stack into a 2-D matrix, one row per comment.
validation_features = np.array(list(new_tfidf_matrix))
validation['lang_id'] = validation['lang'].map(lang_to_id)
# `model` is rebound several times in this notebook (XGBoost regressor, PyTorch
# module); neither offers a `.predict` consistent with `lang_to_id`. Use the
# tuned random forest, which was trained on ids from the same mapping.
# NOTE(review): confirm this is the intended language classifier.
language_clf = grid.best_estimator_
new_predictions = language_clf.predict(validation_features)

accuracy = accuracy_score(validation['lang_id'], new_predictions)
print('Accuracy:', accuracy)
print(classification_report(validation['lang_id'], new_predictions))

In [28]:
# NOTE(review): `model` is rebound several times above (XGBoost regressor, PyTorch
# module); confirm it refers to the intended language classifier before persisting.
joblib.dump(model, 'language_detection.pkl')

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Translation checkpoints for converting each source language to English
lang_to_model = {
    "ru": "Helsinki-NLP/opus-mt-ru-en",  # Russian -> English
    "es": "Helsinki-NLP/opus-mt-es-en",  # Spanish -> English
    "fr": "Helsinki-NLP/opus-mt-fr-en",  # French -> English
    # Add other languages as needed
}

# Loaded (model, tokenizer) pairs keyed by checkpoint name, so repeated calls
# for the same language do not reload the weights.
_translator_cache = {}


def _load_translator(model_name):
    """Return the cached (model, tokenizer) pair for ``model_name``, loading it on first use."""
    if model_name not in _translator_cache:
        _translator_cache[model_name] = (
            MarianMTModel.from_pretrained(model_name),
            MarianTokenizer.from_pretrained(model_name),
        )
    return _translator_cache[model_name]


def translate_text(text, src_lang):
    """Translate ``text`` from ``src_lang`` into English.

    Args:
        text: the text to translate.
        src_lang: language code used as a key of ``lang_to_model``.

    Returns:
        The decoded translation of the first generated sequence.

    Raises:
        ValueError: if ``src_lang`` has no entry in ``lang_to_model``.
    """
    # Pick the translation checkpoint for the source language
    model_name = lang_to_model.get(src_lang)

    if model_name is None:
        raise ValueError(f"No translation model found for language: {src_lang}")

    # Load (or reuse) the model and tokenizer for the selected language
    model, tokenizer = _load_translator(model_name)

    # Tokenize the text and generate the translation; truncation keeps overly
    # long inputs within the model's maximum input length.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    return translated_text

# Example usage:
text = "Привет, как дела?"
src_lang = "ru"  # Source language
translated_text = translate_text(text, src_lang)
print(f"Перевод: {translated_text}")