In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [3]:
data_train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
print(data_train)
data_test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

test_id = data_test['id']
y_train = data_train.iloc[:,2:8]
y_train

                      id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::And for the second time of asking, when ...   
159567  ffea4adeee384e90  You should be ashamed of yourself \n\nThat is ...   
159568  ffee36eab5c267c9  Spitzer \n\nUmm, theres no actual article for ...   
159569  fff125370e4aaaf3  And it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nAnd ... I really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [4]:
StopWords = set(stopwords.words('english'))

def text_preprocess(text):
    trans = str.maketrans('','',string.punctuation)
    text = text.translate(trans)
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in StopWords])
    return text

data_train['comment_text'] = data_train['comment_text'].apply(text_preprocess)
data_test['comment_text'] = data_test['comment_text'].apply(text_preprocess)
X_train = data_train['comment_text']
X_test = data_test['comment_text']
print(X_test.head())
X_train.head()

0    yo bitch ja rule succesful youll ever whats ha...
1                                   rfc title fine imo
2                        sources zawe ashton lapland —
3    look back source information updated correct f...
4                       dont anonymously edit articles
Name: comment_text, dtype: object


0    explanation edits made username hardcore metal...
1    daww matches background colour im seemingly st...
2    hey man im really trying edit war guy constant...
3    cant make real suggestions improvement wondere...
4                  sir hero chance remember page thats
Name: comment_text, dtype: object

In [5]:
X_train = X_train.tolist()
X_test = X_test.tolist()

def lemmatize(data):
    lemmatizer = WordNetLemmatizer()
    data_lemm = []
    for text in data:
        lem_text = ''
        for word in text.split():
            lem_word = lemmatizer.lemmatize(word)
            lem_word = lemmatizer.lemmatize(lem_word, pos='v')
            lem_text = lem_text + ' ' + lem_word
        data_lemm.append(lem_text)
    return data_lemm

In [6]:
X_train_lemm = lemmatize(X_train)
X_test_lemm = lemmatize(X_test)

In [7]:
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)
X_train_tfidf = tfidf.fit_transform(X_train_lemm)
X_test_tfidf = tfidf.transform(X_test_lemm)

In [8]:
clf = OneVsRestClassifier(LogisticRegression(penalty='l2',C=1)).fit(X_train_tfidf, y_train)
clf.predict(X_test_tfidf)
y_pred = clf.predict_proba(X_test_tfidf)
print(y_pred)
y_pred[:,0]

[[0.99333458 0.10118083 0.98028368 0.02105219 0.89822238 0.17105504]
 [0.00723269 0.00307916 0.00567371 0.00182755 0.00893976 0.00316744]
 [0.02836608 0.00345581 0.01158567 0.00145261 0.01233125 0.00291858]
 ...
 [0.00571376 0.00197853 0.00578735 0.00156742 0.00470868 0.00281136]
 [0.02919319 0.00318303 0.0168498  0.00251755 0.01618661 0.01561673]
 [0.75827128 0.00327636 0.33121754 0.00347637 0.13387219 0.00708002]]


array([0.99333458, 0.00723269, 0.02836608, ..., 0.00571376, 0.02919319,
       0.75827128])

In [9]:
output_df = pd.DataFrame()
output_df['id'] = test_id
output_df['toxic'] = y_pred[:,0]
output_df['severe_toxic'] = y_pred[:,1]
output_df['obscene'] = y_pred[:,2]
output_df['threat'] = y_pred[:,3]
output_df['insult'] = y_pred[:,4]
output_df['identity_hate'] = y_pred[:,5]
print(output_df)
output_df.to_csv('Submission.csv', index=False)

                      id     toxic  severe_toxic   obscene    threat  \
0       00001cee341fdb12  0.993335      0.101181  0.980284  0.021052   
1       0000247867823ef7  0.007233      0.003079  0.005674  0.001828   
2       00013b17ad220c46  0.028366      0.003456  0.011586  0.001453   
3       00017563c3f7919a  0.006967      0.002436  0.004244  0.001255   
4       00017695ad8997eb  0.051816      0.002357  0.015378  0.001442   
...                  ...       ...           ...       ...       ...   
153159  fffcd0960ee309b5  0.065785      0.002803  0.011572  0.001613   
153160  fffd7a9a6eb32c16  0.089606      0.006121  0.027707  0.004525   
153161  fffda9e8d6fafa9e  0.005714      0.001979  0.005787  0.001567   
153162  fffe8f1340a79fc2  0.029193      0.003183  0.016850  0.002518   
153163  ffffce3fb183ee80  0.758271      0.003276  0.331218  0.003476   

          insult  identity_hate  
0       0.898222       0.171055  
1       0.008940       0.003167  
2       0.012331       0.002919  