# Import

In [None]:
import numpy as np
import pandas as pd
import re
import string
import time


# Read data

In [None]:
# Load the training comments (with their label columns) from the zipped CSV.
df_train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# Report (rows, columns), then show the first rows as the cell output.
print(df_train.shape)
df_train.head()

In [None]:
# Load the comments to be scored from the zipped CSV.
df_test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
# Report (rows, columns), then show the first rows as the cell output.
print(df_test.shape)
df_test.head()

In [None]:
# Names of the label columns in the training frame; each one is modelled
# separately and becomes a column of the submission.
cols_target = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]

In [None]:
# Number of test comments that will need a prediction.
len(df_test['comment_text'])

# Pre-processing

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, punctuation-free text.

    The steps run in this order:

    1. lowercase the text;
    2. drop square-bracketed spans such as ``[citation needed]``;
    3. drop URLs that start with ``http://``, ``https://`` or ``www.``;
    4. drop angle-bracketed spans such as HTML tags;
    5. drop every character listed in ``string.punctuation``;
    6. replace each newline with a single space;
    7. drop every word that contains a digit.

    Args:
        text: The comment to clean; must be a ``str``.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is
        not collapsed, so runs of spaces may remain.
    """
    text = text.lower()

    # All patterns are raw strings so that regex escapes (\[, \S, \w, \d)
    # are not parsed as invalid Python string escapes.

    # '[' ... ']' with the shortest possible content in between.
    text = re.sub(r'\[.*?\]', '', text)

    # http:// or https:// followed by non-whitespace, or www. followed by
    # non-whitespace.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # '<', the shortest possible content, then one or more '>'.
    text = re.sub(r'<.*?>+', '', text)

    # Any single punctuation character.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Newlines become spaces rather than being deleted: deleting them would
    # glue the last word of one line to the first word of the next, and a
    # word glued to a number would then be removed by the step below.
    text = text.replace('\n', ' ')

    # Any run of word characters that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Add a 'clean_text' column to both frames: every comment is cast to str
# first, then normalised by clean_text.
for frame in (df_train, df_test):
    frame['clean_text'] = frame['comment_text'].apply(str).apply(clean_text)

# Tf-idf vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the cleaned comments, with English stop words removed
# and the vocabulary capped at 10000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)
# The vocabulary and IDF weights are learned from the training comments only;
# the test comments are projected onto that same fitted vocabulary.
X_vec_train = vectorizer.fit_transform(df_train['clean_text'])
X_vec_test = vectorizer.transform(df_test['clean_text'])

# Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# A single estimator object is refit once per label in the loop below.
logreg = LogisticRegression(C=12.0, solver='liblinear')

# Start from the sample submission and overwrite one column per label.
df_submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

for label in cols_target:
    print('> Processing ' + label)
    y = df_train[label]
    # Fit this label's model on the TF-IDF training matrix.
    logreg.fit(X_vec_train, y)
    # Training accuracy has to be measured on the training matrix: predictions
    # for the test matrix have no correspondence with the training labels.
    y_pred_X = logreg.predict(X_vec_train)
    print('Training accuracy is ' + str(accuracy_score(y, y_pred_X)))
    print('-'*10)
    # Column 1 of predict_proba is taken as the positive-class probability
    # (assumes the label column holds 0/1 values - confirm against the data).
    test_y_prob = logreg.predict_proba(X_vec_test)[:,1]
    df_submission[label] = test_y_prob

# Submit

In [None]:
# Show the first rows of the submission frame as the cell output.
df_submission.head()

In [None]:
# Write the submission to 'submission.csv', leaving out the DataFrame index.
df_submission.to_csv('submission.csv',index=False)