In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
train = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
print(train.shape)
train.head(5)

less_data = train[['worker', 'less_toxic']]
less_data['sentiment'] = list(np.repeat(1, len(less_data)))
less_data.rename(columns = {'less_toxic' : 'comments'}, inplace = True)

more_data = train[['worker', 'more_toxic']]
more_data['sentiment'] = list(np.repeat(0, len(more_data)))
more_data.rename(columns = {'more_toxic' : 'comments'}, inplace = True)

final_data = pd.concat([less_data, more_data], axis = 0)
final_data.reset_index(inplace = True, drop = True)

final_data['comments'] = final_data['comments'].apply(lambda x : re.sub("[^a-zA-Z]", " " , x))
final_data.reset_index(drop = True, inplace = True)

counted = pd.DataFrame(final_data['comments'].value_counts()).reset_index()
counted.rename(columns = {'comments':'counts', 'index' : 'comments'}, inplace = True)

final_data.drop_duplicates(['comments'], inplace = True)
new_final_data = pd.merge(final_data, counted, left_on = 'comments', right_on = 'comments', how = 'left')

X = new_final_data[['worker', 'comments', 'counts']]
Y = new_final_data['sentiment']
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.3, random_state=156)

# 스톱 워드는 English, filtering, ngram은 (1,2)로 설정해 CountVectorization수행. 
# LogisticRegression의 C는 10으로 설정. 
pipeline = Pipeline([
    ('cnt_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression(C=10))])

# Pipeline 객체를 이용하여 fit(), predict()로 학습/예측 수행. predict_proba()는 roc_auc때문에 수행.  
pipeline.fit(X_train['comments'], y_train)
pred = pipeline.predict(X_test['comments'])
pred_probs = pipeline.predict_proba(X_test['comments'])[:,1]

print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test ,pred),
                                         roc_auc_score(y_test, pred_probs)))

In [None]:
test = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')
print(test.shape)
test.head(5)

test['text'] = test['text'].apply(lambda x : re.sub("[^a-zA-Z]", " " , x))
test.reset_index(drop = True, inplace = True)

final_pred = pipeline.predict(test['text'])

submission = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv')
submission['score'] = list(final_pred)

submission.to_csv('./submission.csv', index = False)