In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# `test` holds the comments to be scored for the submission; `train` is the
# training file of the toxic-comment classification challenge (read straight
# from the .zip archive).
test = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")
train = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")
# Preview the first three rows of the scoring set.
test.head(3)

In [None]:
# Display the training frame.
train

In [None]:
# Give the training text column the same name ('text') that the model pipeline
# selects. Reassign instead of using inplace=True so the cell reads as an
# explicit transformation and the result can be chained.
train = train.rename(columns={'comment_text': 'text'})

In [None]:
# Shapes of the scoring set and of the training set.
test.shape, train.shape

In [None]:
# Count missing values in each training column.
train.isna().sum()

In [None]:
# Number of rows per value of the 'toxic' column.
train['toxic'].value_counts()

In [None]:
def identity_hate_to_cat(X):
    """Add an ``identity_hate_cat`` weight column to ``X`` in place.

    Rows whose ``identity_hate`` value equals 1 get the weight 0.2; every
    other row gets 0.0.

    Args:
        X: DataFrame containing an ``identity_hate`` column.

    Returns:
        The same ``X`` object (mutated), so calls can be chained.
    """
    # Create the column as float from the start: writing 0.2 into an integer
    # column relies on silent upcasting, which pandas has deprecated (2.1+).
    X['identity_hate_cat'] = 0.0
    X.loc[X['identity_hate'] == 1, 'identity_hate_cat'] = 0.2
    return X

def insult_to_cat(X):
    """Add an ``insult_cat`` weight column to ``X`` in place.

    Rows whose ``insult`` value equals 1 get the weight 0.4; every other row
    gets 0.0.

    Args:
        X: DataFrame containing an ``insult`` column.

    Returns:
        The same ``X`` object (mutated), so calls can be chained.
    """
    # Create the column as float from the start: writing 0.4 into an integer
    # column relies on silent upcasting, which pandas has deprecated (2.1+).
    X['insult_cat'] = 0.0
    X.loc[X['insult'] == 1, 'insult_cat'] = 0.4
    return X

def threat_to_cat(X):
    """Add a ``threat_cat`` weight column to ``X`` in place.

    Rows whose ``threat`` value equals 1 get the weight 0.6; every other row
    gets 0.0.

    Args:
        X: DataFrame containing a ``threat`` column.

    Returns:
        The same ``X`` object (mutated), so calls can be chained.
    """
    # Create the column as float from the start: writing 0.6 into an integer
    # column relies on silent upcasting, which pandas has deprecated (2.1+).
    X['threat_cat'] = 0.0
    X.loc[X['threat'] == 1, 'threat_cat'] = 0.6
    return X

def obscene_to_cat(X):
    """Add an ``obscene_cat`` weight column to ``X`` in place.

    Rows whose ``obscene`` value equals 1 get the weight 0.8; every other row
    gets 0.0.

    Args:
        X: DataFrame containing an ``obscene`` column.

    Returns:
        The same ``X`` object (mutated), so calls can be chained.
    """
    # Create the column as float from the start: writing 0.8 into an integer
    # column relies on silent upcasting, which pandas has deprecated (2.1+).
    X['obscene_cat'] = 0.0
    X.loc[X['obscene'] == 1, 'obscene_cat'] = 0.8
    return X

def severe_toxic_to_cat(X):
    """Add a ``severe_toxic_cat`` weight column to ``X`` in place.

    The new column is 1 where ``severe_toxic`` equals 1 and 0 elsewhere.
    The mutated ``X`` itself is returned.
    """
    is_severe = X['severe_toxic'].eq(1)
    X['severe_toxic_cat'] = 0
    X.loc[is_severe, 'severe_toxic_cat'] = 1
    return X

def toxic_to_cat(X):
    """Store the sum of the five per-label weight columns in ``toxic_cat``.

    Adds ``severe_toxic_cat``, ``obscene_cat``, ``threat_cat``,
    ``insult_cat`` and ``identity_hate_cat`` (in that order) and writes the
    result to ``X['toxic_cat']`` in place. Returns the mutated ``X``.
    """
    combined = X['severe_toxic_cat']
    # Accumulate left to right so the floating-point result matches a plain
    # chained addition of the columns.
    for weight_column in ('obscene_cat', 'threat_cat', 'insult_cat', 'identity_hate_cat'):
        combined = combined + X[weight_column]
    X['toxic_cat'] = combined
    return X

In [None]:
# Run every weighting helper over `train`; each one adds its column to the
# frame in place and hands the same frame on, so the steps can be chained.
(
    train
    .pipe(identity_hate_to_cat)
    .pipe(insult_to_cat)
    .pipe(threat_to_cat)
    .pipe(obscene_to_cat)
    .pipe(severe_toxic_to_cat)
    .pipe(toxic_to_cat)
)
train

In [None]:
# Number of rows per value of the combined 'toxic_cat' score.
train['toxic_cat'].value_counts()

In [None]:
# Number of rows per value of the 'toxic' column.
train['toxic'].value_counts()

In [None]:
# Feature frame: every column of `train` except the 'toxic' target.
x = train.drop(columns='toxic')
x

In [None]:
# Target: the 'toxic' column.
# NOTE(review): the derived 'toxic_cat' score is not used as the target here —
# confirm that is intended.
y = train['toxic']
y

In [None]:
# Split the data into train and test parts (fixed random_state so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Build a simple pipeline; it needs a small class that selects the required field.
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that returns ``X[column]`` for a configured ``column``."""

    def __init__(self, column):
        # Key used to index the input in transform().
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is ignored."""
        return X[self.column]

# Text-classification pipeline: select the 'text' column, turn it into TF-IDF
# features, then fit a logistic regression on those features.
model = Pipeline([
    ('comment_text_selector', FeatureSelector(column='text')),
    ('comment_text_tfidf', TfidfVectorizer(sublinear_tf=True,
                                           strip_accents='unicode',
                                           analyzer='word',
                                           token_pattern=r'\w{1,}',
                                           stop_words='english',
                                           ngram_range=(1, 1),
                                           max_features=6000)),
    # The 'sag' solver shuffles the data, so without a seed the fitted
    # coefficients (and every score computed from them) vary between runs.
    # Use the same seed as the train/test split to keep results reproducible.
    ('clf', LogisticRegression(C=0.1, solver='sag', random_state=0)),
])

# Run 3-fold cross-validation on the training split, scored by ROC AUC,
# and report the mean of the fold scores.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=3,
)
cv_score = cv_scores.mean()
print('CV score is {}'.format(cv_score))



**Обучение**

In [None]:
# Fit the pipeline on the training split, then keep the second predict_proba
# column for every row of the test split.
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

# beta of the F-beta score; 1 weights precision and recall equally.
b = 1

precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)

# F-beta at every point of the curve. Where precision and recall are both 0
# the formula is 0/0; plain division would give NaN there, and np.argmax
# returns the position of the first NaN instead of the real maximum. Define
# the score as 0 at those points instead.
denominator = b**2 * precision + recall
fscore = np.divide((1 + b**2) * (precision * recall),
                   denominator,
                   out=np.zeros_like(denominator, dtype=float),
                   where=denominator > 0)

# locate the index of the largest f score
# precision/recall have one more entry than thresholds (the final
# precision=1, recall=0 point has no threshold), so search only the points
# that do have one; ix is then always a valid index into thresholds.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

In [None]:
import seaborn as sns
import itertools
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(font_scale=1.5)
sns.set_color_codes("muted")

# ROC curve of the current model (TF-IDF with max_features=6000) on the test
# split. Earlier runs with max_features of 100, 1000, 2000 and 4000 were
# plotted the same way; only the current run is drawn here.
fig, ax = plt.subplots(figsize=(10, 8))
fpr4, tpr4, thresholds_ = roc_curve(y_test, y_score, pos_label=1)  # max_features=6000
lw = 2  # kept for parity with the earlier experiments; the curve below uses lw=1
ax.plot(fpr4, tpr4, lw=1, color='r', label='ROC curve max_features=6000')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1])
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve')
ax.legend()
fig.savefig("ROC.png")
plt.show()

In [None]:
# Score the comments in `test` with the fitted pipeline, keeping the second
# predict_proba column.
predicts = model.predict_proba(test)[:, 1]
predicts

In [None]:
# Load the sample submission file to use as the template for the output.
submit=pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv', sep=',')
submit.head()

In [None]:
# Write the predictions into the 'score' column.
# NOTE(review): assigned positionally — assumes the sample submission rows are
# in the same order as the rows of `test`; confirm.
submit['score'] = predicts

In [None]:
# Save the submission to the working directory without the index column.
submit.to_csv('submission.csv', index=False)