In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the validation pairs; the cells below read its 'less_toxic' and
# 'more_toxic' text columns.
validation_csv = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
data = pd.read_csv(validation_csv)
data.head()

In [None]:
# Copy each side of the pair into its own single-column frame. Both frames use
# the same column name ('context') so they can be stacked into one frame later.
data_less = pd.DataFrame({'context': data['less_toxic']})
data_more = pd.DataFrame({'context': data['more_toxic']})

In [None]:
def labels(data,column,label):
    """Return the class label to attach to every row of ``data``.

    The value returned is the scalar ``label`` itself. Assigning it to a
    DataFrame column (``df['labels'] = labels(...)``) lets pandas broadcast it
    to all rows.

    The previous implementation looped over ``range(data[column].count())`` but
    returned on the first iteration. Because ``count()`` ignores missing
    values, a column that was empty or entirely NaN skipped the loop and the
    function returned ``None`` instead of the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that will receive the label column.
    column : str
        Name of a column expected to exist in ``data``; it is only looked up,
        never modified.
    label : object
        Label value to return.

    Returns
    -------
    object
        ``label``, unchanged.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``data``.
    """
    # Look the column up only to fail fast on a misspelled column name, which
    # the original ``data[column].count()`` call also did.
    _ = data[column]
    return label
# Tag each frame with its class name; pandas broadcasts the scalar returned by
# labels() to every row of the frame.
data_less['labels'] = labels(data=data_less,column='context',label='less_toxic')
data_more['labels'] = labels(data=data_more,column='context',label='more_toxic')

# Stack both classes into one frame and shuffle the rows. The fixed seed makes
# the shuffled order identical on every run (it was unseeded before, so each
# run produced a different order).
data = pd.concat([data_less,data_more])
data = data.sample(frac=1,random_state=0)
data.head()

In [None]:
import re 
def preprocessor(text):
    """Normalise a raw comment into lowercase words followed by its emoticons.

    Steps:
      1. Remove anything that looks like a markup tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:-P`` from the
         tag-stripped text.
      3. Lowercase the text and collapse every run of non-word characters into
         a single space.
      4. Append the collected emoticons, with the ``-`` "nose" removed, as
         space-separated tokens at the end.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. When no emoticon is found this is exactly the output
        of step 3.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by the ``re`` functions).
    """
    text = re.sub(r'<[^>]*>', '', text)
    # The groups must be non-capturing: with capturing groups re.findall
    # returns tuples, and ' '.join() then raises TypeError on the first match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    faces = ' '.join(emoticons).replace('-', '')
    # Keep the first emoticon from being glued onto the last word, without
    # doubling a space that is already there.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + faces

# Run every comment through preprocessor() and overwrite the column with the
# cleaned text; the last expression previews the result.
data['context'] = data['context'].map(preprocessor)
data.head()

In [None]:
from nltk.stem.porter import PorterStemmer

# Single shared Porter stemmer instance, used by tokenizer_port() below.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

def tokenizer_port(text):
    """Whitespace-tokenise ``text`` and stem each token with ``porter``.

    Returns a list containing ``porter.stem(token)`` for every
    whitespace-delimited token, in the original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned comment texts; targets are the class labels.
X = data['context']
y = data['labels']

# Hold out 25% of the rows for testing. The fixed seed means the same input
# rows always produce the same partition (the split was unseeded before, so
# the reported accuracies changed from run to run).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; offered to the vectoriser as one of the
# `vect__stop_words` options in the parameter grids below.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available
# in the environment — confirm.
stop = stopwords.words("english")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Vectoriser with built-in lowercasing disabled and no accent stripping or
# custom preprocessor; the text has already been cleaned by preprocessor().
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

# Settings searched in both sub-grids: unigrams only, with/without stop words,
# plain vs. stemming tokenizer, L1/L2 penalty and three regularisation values.
_lr_common = {
    "vect__ngram_range": [(1, 1)],
    "vect__stop_words": [stop, None],
    "vect__tokenizer": [tokenizer, tokenizer_port],
    "clf__penalty": ['l1', 'l2'],
    "clf__C": [1.0, 10.0, 100.0],
}

param_grid = [
    # Sub-grid 1: vectoriser weighting left at its defaults.
    dict(_lr_common),
    # Sub-grid 2: same search with IDF weighting and normalisation switched off.
    {**_lr_common, "vect__norm": [None], "vect__use_idf": [False]},
]

# Vectoriser followed by a liblinear logistic-regression classifier.
lr_tfidf = Pipeline(steps=[
    ("vect", tfidf),
    ("clf", LogisticRegression(random_state=0, solver='liblinear')),
])

# 5-fold cross-validated search scored by accuracy, using all available cores.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated grid search (cv=5, scoring='accuracy', as configured
# on gs_lr_tfidf) on the training split only.
gs_lr_tfidf.fit(X_train,y_train)

In [None]:
# Take the best pipeline found by the grid search and report its accuracy on
# the training split and on the held-out test split.
model = gs_lr_tfidf.best_estimator_

train_accuracy = model.score(X_train, y_train)
print("Train accuracy: ", train_accuracy)

test_accuracy = model.score(X_test, y_test)
print("Test accuracy: ", test_accuracy)

In [None]:
from sklearn.svm import SVC

# Search space for an SVM on top of the same vectoriser.
# Note: this rebinds `param_grid`, replacing the logistic-regression grid, and
# `grid_svm_tf` is only constructed here; no fit call appears in the cells shown.
_svm_common = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [None, stop],
    'vect__tokenizer': [tokenizer, tokenizer_port],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['auto', 'scale'],
}

param_grid = [
    # Sub-grid 1: vectoriser weighting left at its defaults, C from 1 to 5.
    {**_svm_common,
     'svm__C': [1.0, 2.0, 3.0, 4.0, 5.0]},
    # Sub-grid 2: IDF weighting and normalisation switched off, C from 1 to 6.
    {**_svm_common,
     'vect__norm': [None],
     'vect__use_idf': [False],
     'svm__C': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]},
]

# Vectoriser followed by a support-vector classifier.
ps_svm_tf = Pipeline(steps=[
    ("vect", tfidf),
    ("svm", SVC(random_state=1)),
])

# 5-fold cross-validated search scored by accuracy, using all available cores.
grid_svm_tf = GridSearchCV(
    estimator=ps_svm_tf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Load the comments that have to be scored; the cells below read its 'text'
# column.
comments_csv = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
test_data = pd.read_csv(comments_csv)
test_data.head()

In [None]:
# Clean the comments with the same preprocessor() that was applied to the
# training text, overwriting the column; the last expression previews it.
test_data['text'] = test_data['text'].map(preprocessor)
test_data.head()

In [None]:
# Score each comment with the model's probability for the 'more_toxic' class.
# predict() would only yield the two string labels, which cannot rank comments
# by severity; a probability gives a continuous value in [0, 1].
# NOTE(review): the submission 'score' column is presumably expected to be
# numeric — confirm against the competition's submission format.
more_toxic_col = list(model.classes_).index('more_toxic')
predictions = model.predict_proba(test_data['text'])[:, more_toxic_col]
test_data['score'] = predictions

In [None]:
# Keep every column except the comment text and write the result to
# submission.csv without the index column.
submission = test_data.drop(columns=['text'])
submission.to_csv("submission.csv", index=False)

In [None]:
# List the current working directory (presumably to check that submission.csv
# was written).
!ls