In [1]:
# !wget 'https://drive.google.com/uc?export=download&id=1oPtTtVbkSEdiNwjcSHeEkv_C9g0yfLVO' -O train.csv

--2022-04-20 14:20:26--  https://drive.google.com/uc?export=download&id=1oPtTtVbkSEdiNwjcSHeEkv_C9g0yfLVO
Resolving drive.google.com (drive.google.com)... 142.250.152.139, 142.250.152.138, 142.250.152.100, ...
Connecting to drive.google.com (drive.google.com)|142.250.152.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-04-c0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/svqfplkoe4c6sebkelvq8g4m2r8ihrd6/1650464400000/14904333240138417226/*/1oPtTtVbkSEdiNwjcSHeEkv_C9g0yfLVO?e=download [following]
--2022-04-20 14:20:29--  https://doc-04-c0-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/svqfplkoe4c6sebkelvq8g4m2r8ihrd6/1650464400000/14904333240138417226/*/1oPtTtVbkSEdiNwjcSHeEkv_C9g0yfLVO?e=download
Resolving doc-04-c0-docs.googleusercontent.com (doc-04-c0-docs.googleusercontent.com)... 142.250.152.132, 2607:f8b0:4001:c56::84
Connecting to doc-04-c0-docs.googleusercontent.com (doc-04-c0

## Step_1 Обучаем модель

In [29]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Names of the six toxicity label columns selected from `train` further down.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the data, replace missing values with a single space and shuffle the rows.
# random_state fixes the shuffle so every Restart & Run All sees the same row
# order (without it each run produced a different permutation).
train = pd.read_csv('train.csv').fillna(' ').sample(frac=1, random_state=0)
train.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
92475,f744b95831a09bcd,"""\n\nPeg Entwistle Suicide\n\nIn the article i...",0,0,0,0,0,0
151334,7b6ceddc66f8eb9f,Have you got a reliable source ? Bear in mind...,0,0,0,0,0,0
78832,d2f0c46908a9a910,Good call! The statements would have to be pre...,0,0,0,0,0,0


Здесь много разных вариантов, но мы сведем все к бинарному классу - toxic/nontoxic

In [30]:
# Binary target: row-wise maximum over the label columns listed in class_names.
train['y'] = train[class_names].max(axis=1).values
# Show how many rows fall into each target value.
train['y'].value_counts()

0    143346
1     16225
Name: y, dtype: int64

Сделаем тренировочную и тестовую выборки

In [31]:
# Split into train/test parts: 33% of the rows are held out, random_state=0 seeds the split.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='y'),
    train['y'],
    test_size=0.33,
    random_state=0,
)

# Save the test part first, then the train part, each as a CSV without the index.
for csv_name, part in [
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
]:
    part.to_csv(csv_name, index=None)

Посмотрим на распределение классов в выборках

In [32]:
# Relative frequency of each target value in the training part.
y_train.value_counts(normalize=True)

0    0.898103
1    0.101897
Name: y, dtype: float64

In [33]:
# Relative frequency of each target value in the test part.
y_test.value_counts(normalize=True)

0    0.898764
1    0.101236
Name: y, dtype: float64

Построим простое векторное представление текстов и обучим логистическую регрессию


In [34]:
%%time
# The simple pipeline is assembled below; it first needs a transformer that picks one field.
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a single field from its input by key.

    Parameters
    ----------
    column : key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but not used."""
        selected = X[self.column]
        return selected

# Three steps: select the text column -> TF-IDF over word tokens -> logistic regression.
classifier = Pipeline(steps=[
    ('comment_text', FeatureSelector(column='comment_text')),
    ('comment_text_tfidf', TfidfVectorizer(
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        max_features=10000,
    )),
    ('clf', LogisticRegression(C=0.1)),
])

# Run 3-fold cross-validation on the training part, scored by ROC AUC.
cv_scores = cross_val_score(classifier, X_train, y_train, scoring='roc_auc', cv=3)
cv_score = cv_scores.mean()
print(f'CV score is {cv_score}')

# Fit the pipeline on the whole training set, then take column 1 of
# predict_proba for the held-out rows.
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)[:, 1]

CV score is 0.9535864064597713
CPU times: user 37 s, sys: 4.26 s, total: 41.3 s
Wall time: 36.6 s


In [35]:
# Display the (name, estimator) pairs that make up the pipeline.
classifier.steps

[('comment_text', FeatureSelector(column='comment_text')),
 ('comment_text_tfidf',
  TfidfVectorizer(max_features=10000, stop_words='english',
                  strip_accents='unicode', token_pattern='\\w{1,}')),
 ('clf', LogisticRegression(C=0.1))]

Сохраним модель (пайплайн)

In [36]:
import dill
# Serialize the pipeline object to disk with dill so later cells can reload it.
with open("logreg_pipeline.dill", mode="wb") as model_file:
    dill.dump(classifier, model_file)

# Step 2 - PREDICT

### Проверка работоспособности и качества пайплайна

Здесь мы еще не запускаем никакое API, а загружаем модель (pipeline) напрямую и проверяем на отложенной (тестовой) выборке

In [37]:
# Reload the held-out split from the CSV files written earlier in the notebook.
X_test = pd.read_csv("X_test.csv")
# NOTE(review): read_csv returns a DataFrame, so y_test is now a frame rather
# than the Series that was saved — confirm downstream code accepts that.
y_test = pd.read_csv("y_test.csv")

In [38]:
# Peek at the first three reloaded test rows.
X_test.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,075e64b2bd0ff46f,"""\n\nFlora: Yes...but I think I can find a way...",0,0,0,0,0,0
1,3b20462c2f25635d,Regarding Your Question\nRegarding your questi...,0,0,0,0,0,0
2,77a723e8d6a1de64,Fuck you you dumb sack of shit,1,1,1,0,1,0


In [39]:
# Restore the saved pipeline from disk.
# NOTE: dill.load can run arbitrary code from the file — load only files you trust.
with open('logreg_pipeline.dill', mode='rb') as model_file:
    pipeline = dill.load(model_file)

In [40]:
# Display the reloaded pipeline to check that its steps were restored.
pipeline

Pipeline(steps=[('comment_text', FeatureSelector(column='comment_text')),
                ('comment_text_tfidf',
                 TfidfVectorizer(max_features=10000, stop_words='english',
                                 strip_accents='unicode',
                                 token_pattern='\\w{1,}')),
                ('clf', LogisticRegression(C=0.1))])

In [41]:
# Column 1 of predict_proba for every test row
# (presumably P(y=1) — confirm via pipeline.classes_).
preds = pipeline.predict_proba(X_test)[:, 1]

# Store the scores in a one-column CSV, written without the index.
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [42]:
# First ten predicted scores.
preds[:10]

array([0.10218604, 0.0374502 , 0.99745876, 0.11436142, 0.98736012,
       0.02526285, 0.07090281, 0.26632446, 0.2491152 , 0.0284733 ])

In [43]:
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
# precision and recall hold one more element than thresholds: the final
# (precision=1, recall=0) point has no threshold attached to it.
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 for every point of the curve. Where precision + recall == 0 the plain
# formula is 0/0 = NaN, and np.argmax would then select that NaN; define the
# score as 0 there instead.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
# locate the index of the largest f score, looking only at points that have a
# threshold so that thresholds[ix] can never be out of range
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.19427859889924132, F-Score=0.742, Precision=0.798, Recall=0.694


# Step 3 - FLASK

In [44]:
# !pip install flask-ngrok

In [45]:
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify
import pandas as pd

https://dashboard.ngrok.com/get-started/setup

In [46]:
# !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
# !tar -xvf /content/ngrok-stable-linux-amd64.tgz
# !./ngrok authtoken <YOUR_NGROK_AUTHTOKEN>  # never commit a real token; revoke any token that was published and read it from an environment variable instead
# !./ngrok http 80

In [47]:
# # Пробный запуск Flask

# app = Flask(__name__)
# run_with_ngrok(app)  # Start ngrok when app is run

# @app.route("/a")
# def hello():
#     return "Hello World!"

# if __name__ == '__main__':
#     app.run()

### **Создаем сервис для обработки запросов к модели**

In [48]:
# Load the trained pipeline that the service uses for scoring.
# NOTE: dill.load can run arbitrary code from the file — load only files you trust.
with open('logreg_pipeline.dill', mode='rb') as model_file:
    model = dill.load(model_file)

In [49]:
# Reload the held-out split from disk.
# NOTE(review): the Flask handlers visible below only use `model`; these frames
# look unused by the service — confirm before removing.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

Запустить сервис и не глушить его, пока работаем 

In [53]:
# Request handlers and Flask startup
app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    """Landing endpoint for GET /: returns a fixed greeting string."""
    return "Welcome to prediction process"


@app.route('/predict', methods=['POST'])
def predict():
    """Score one comment with the loaded model.

    Expects a JSON body of the form ``{"comment_text": "<text>"}``.

    Returns a JSON object with:
      * ``success``      - True when a prediction was produced,
      * ``predictions``  - column 1 of ``model.predict_proba`` for the comment,
      * ``comment_text`` - the text that was scored.

    A body that is not a JSON object, lacks ``comment_text`` or carries a
    truthy non-string value yields ``success: false`` plus an ``error``
    message with HTTP status 400. Previously these cases raised inside the
    handler and surfaced as a 500.
    """
    data = {"success": False}

    # silent=True gives None for a missing or invalid JSON body instead of raising
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict) or 'comment_text' not in request_json:
        data["error"] = 'Expected a JSON object with a "comment_text" field'
        return jsonify(data), 400

    # Falsy values (empty string, null) are scored as an empty comment, as before.
    comment_text = request_json['comment_text'] or ""
    if not isinstance(comment_text, str):
        data["error"] = '"comment_text" must be a string'
        return jsonify(data), 400

    print(comment_text)
    preds = model.predict_proba(pd.DataFrame({"comment_text": [comment_text]}))
    # Cast the numpy scalar to a plain float so the JSON encoder always accepts it.
    data["predictions"] = float(preds[:, 1][0])
    data["comment_text"] = comment_text
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start the Flask server only when this code runs as __main__.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://916f-34-123-104-113.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
