In [None]:
# libraries
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV paths read later in this
# cell all live under that mount point.
drive.mount('/content/gdrive')

import numpy as np 
import pandas as pd 

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords                 
from nltk.stem.wordnet import WordNetLemmatizer    
from nltk import word_tokenize, pos_tag           

# stopwords for preprocessing
# The corpus has to be downloaded before stopwords.words() is called.
nltk.download('stopwords') 
# English stop words; preprocess() filters tokens against this list.
stopwords_english = stopwords.words('english') 

import seaborn as sns
import matplotlib.pyplot as plt
import os

# data
# replace your file path here -- DATA_DIR is the only value that needs editing
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks/Data'
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test_label = pd.read_csv(os.path.join(DATA_DIR, 'test_labels.csv'))

# data info
train.info()
# A bare expression is only rendered when it is the last statement of a cell,
# so use display() (provided by the IPython/Colab kernel) to actually show it.
display(train.head(10))

# the fraction of comments carrying each label
# Rows whose 'toxic' value is -1 are dropped first
# (presumably unlabelled rows -- confirm against the dataset description).
test_label = test_label.loc[test_label['toxic'] != -1]
# Drop only the 'id' column so that every label column, including the last
# one (identity_hate), is part of the result; the mean of a 0/1 column is
# the fraction of rows flagged.
display(test_label.drop(columns='id').mean(axis=0))

# preprocess the comments : make text lowercase, remove hyperlinks, punctuation, digits, and stopwords.
def preprocess(comment, stop_words=None):
    """Lazily clean an iterable of raw comment strings.

    Each comment is lower-cased, stripped of http(s) links, ASCII
    punctuation and digits, and then filtered token by token against a
    stop-word collection.

    Parameters
    ----------
    comment : iterable of str
        The raw comments to clean (one string per item).
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, the module-level
        ``stopwords_english`` list is used.

    Yields
    ------
    str
        The cleaned comment, with the surviving tokens re-joined by single
        spaces.

    Notes
    -----
    Tokens are produced by splitting on the space character only, so words
    separated solely by a newline stay glued together and are compared to
    the stop words as one token.
    """
    if stop_words is None:
        stop_words = stopwords_english
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_set = frozenset(stop_words)

    # Compile the patterns once instead of once per comment.
    link_re = re.compile(r'https?://[^\s\n\r]+')
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))
    digit_re = re.compile(r'\d')

    for text in comment:
        text = text.lower()              # lowercase
        text = link_re.sub('', text)     # remove links
        text = punct_re.sub('', text)    # remove punctuation
        text = digit_re.sub('', text)    # remove digits

        # Compare each *token* with the stop words. The previous version
        # compared the whole comment, so stop words were never removed
        # unless the entire comment was itself a single stop word.
        yield ' '.join(tok for tok in text.split(' ') if tok not in stop_set)

# Drain the lazy cleaner over every training comment and keep the results as a list
clean_comments = [*preprocess(train['comment_text'])]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


'\n# logistic regression\nlabels = [\'toxic\', \'severe_toxic\', \'obscene\', \'threat\',\'insult\', \'identity_hate\']\n\nvectorizer = TfidfVectorizer(analyzer=\'word\',\n                            stop_words=\'english\',\n                            ngram_range=(1, 3),\n                            max_features=30000,\n                            sublinear_tf=True)\nX_train = vectorizer.fit_transform(train.comment_text)\nX_test = vectorizer.transform(test.comment_text)\nY_train = train[labels]\n\nsubmission = pd.DataFrame.from_dict({\'id\': test[\'id\']})\n\nscores = []\n\nfor label in labels:\n    #build classifier\n    LR = LogisticRegression(solver=\'saga\', n_jobs=-1, C=0.5)\n    \n    #compute cv score\n    cv_score = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring=\'roc_auc\'))\n    scores.append(cv_score)\n    print("CV score for class {} is {}".format(label, cv_score))\n    \n    #re-learn & predict\n    LR.fit(X_train, Y_train[label])  \n    sub

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
# Keep only the test rows whose 'toxic' value is not -1
# (presumably the rows that actually carry labels -- confirm against the dataset description).
test_label = test_label.loc[test_label['toxic'] != -1]
# Fraction of the remaining rows flagged for each label. Only the 'id' column
# is dropped, so the last label column (identity_hate) is included; the
# previous iloc[:, 1:-1] slice silently left it out.
test_label.drop(columns='id').mean(axis=0)

toxic           0.095189
severe_toxic    0.005736
obscene         0.057692
threat          0.003298
insult          0.053565
dtype: float64

In [None]:
# Print the first ten cleaned comments. Iterating over a slice avoids the
# index bookkeeping and does not raise if fewer than ten comments exist.
for cleaned in clean_comments[:10]:
    print(cleaned)

explanation
why edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired 
daww matches background colour im seemingly stuck thanks  talk  january   utc
hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info

more
i cant make real suggestions improvement  wondered section statistics later subsection types accidents  think references may need tidying exact format ie date format etc later noone else first  preferences formatting style references want please let know

there appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport  
sir hero chance remember page thats


congratulations well use tools well  · talk 
cocksucker piss around work
vandalism matt shirvington article reverted  please dont banned
sorry word nonsense offen

In [None]:
# Column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [None]:
# Summary of the test labels after the rows with toxic == -1 were filtered out.
test_label.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63978 entries, 5 to 153156
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             63978 non-null  object
 1   toxic          63978 non-null  int64 
 2   severe_toxic   63978 non-null  int64 
 3   obscene        63978 non-null  int64 
 4   threat         63978 non-null  int64 
 5   insult         63978 non-null  int64 
 6   identity_hate  63978 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 3.9+ MB


In [None]:
# Preview the first 10 rows of the test frame.
test.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


In [None]:
# Preview the first 10 rows of the filtered test labels.
test_label.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,0,0,0,0,0,0
7,000247e83dcc1211,0,0,0,0,0,0
11,0002f87b16116a7f,0,0,0,0,0,0
13,0003e1cccfd5a40a,0,0,0,0,0,0
14,00059ace3e3e9a53,0,0,0,0,0,0
16,000663aff0fffc80,0,0,0,0,0,0
17,000689dd34e20979,0,0,0,0,0,0
19,000844b52dee5f3f,0,0,0,0,0,0
21,00091c35fa9d0465,1,0,0,0,0,0
22,000968ce11f5ee34,0,0,0,0,0,0
