In [None]:
import pandas as pd
import random 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Seed Python's built-in RNG so any later use of `random` is repeatable.
random.seed(0)

# Label columns of the training file; y is 1 when their row-wise sum is
# positive and 0 otherwise.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the CSV, derive the binary target, then keep only the comment text
# (renamed to `text`) and the target.
df = (
    pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
    .assign(y=lambda frame: frame[label_columns].sum(axis=1).gt(0).astype(int))
    .loc[:, ['comment_text', 'y']]
    .rename(columns={'comment_text': 'text'})
)
df.head(4)

## Feature Extraction

In [None]:
# Fit the TF-IDF vectorizer on the training comments and encode those same
# comments as the feature matrix X in one call.
vec = TfidfVectorizer()
train_texts = df['text']
X = vec.fit_transform(train_texts)

In [None]:
# Forest hyper-parameters gathered in one place: 15 trees split on entropy,
# with a fixed random_state so repeated runs build the same forest.
forest_params = {'n_estimators': 15, 'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**forest_params)

# Train on the TF-IDF matrix against the binary target; the fitted estimator
# is the cell's displayed value.
classifier.fit(X, df['y'])

In [None]:
# Mean accuracy on the TRAINING data: X and df['y'] are the same matrix and
# labels the classifier was fitted on, so this is not a held-out test score
# and will typically overstate how well the model generalises.
score = classifier.score(X, df['y'])
score

In [None]:
# Validation file; its `less_toxic` and `more_toxic` text columns are the
# ones read further down in the notebook.
validation_csv = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
df_val = pd.read_csv(validation_csv)

In [None]:
# Encode both text columns of the validation frame with the vectorizer that
# is already fitted (transform only, no re-fitting), in the same order as
# before: `less_toxic` first, then `more_toxic`.
X_less_toxic, X_more_toxic = (
    vec.transform(df_val[side]) for side in ('less_toxic', 'more_toxic')
)

In [None]:
# Class-probability estimates for each side of the validation pairs:
# p1 for the `less_toxic` matrix, p2 for the `more_toxic` matrix.
p1, p2 = map(classifier.predict_proba, (X_less_toxic, X_more_toxic))

In [None]:
# Validation accuracy: fraction of rows where the `more_toxic` comment gets a
# strictly higher value in probability column 1 than its `less_toxic`
# partner. Ties count as misses.
# NOTE(review): column 1 is presumably the y = 1 class — confirm via classifier.classes_.
less_scores, more_scores = p1[:, 1], p2[:, 1]
(more_scores > less_scores).mean()

## Submission

In [None]:
# Load the comments to be scored, encode their text with the fitted
# vectorizer, and get class-probability estimates from the fitted forest.
comments_csv = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
df_sub = pd.read_csv(comments_csv)
submission_texts = df_sub['text']
X_test = vec.transform(submission_texts)
p3 = classifier.predict_proba(X_test)

In [None]:
# Use probability column 1 of the predictions as each comment's score.
positive_proba = p3[:, 1]
df_sub['score'] = positive_proba

In [None]:
# Sanity check, one value per line: how many non-null scores there are,
# then how many distinct score values.
score_column = df_sub['score']
print(score_column.count(), score_column.nunique(), sep="\n")

In [None]:
# Write the submission file with just the id and score columns, without the
# index. `to_csv` returns None when given a path, so the result is not bound
# to a name (the former `sub` variable was always None).
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)