In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### This is a baseline model. The next step is to improve its score by trying new models or changing the pre-processing steps.

### If you have a different approach, please suggest it and I will try it out.

In [None]:
## For feature engineering
import pandas as pd
import numpy as np

## Cleaning text
import string
from nltk.corpus import stopwords
import random

## For vector creation and modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

In [None]:
## Load the training comments of the Jigsaw toxic-comment classification challenge
df = pd.read_csv(
    filepath_or_buffer="../input/jigsaw-toxic-comment-classification-challenge/train.csv"
)

In [None]:
## Preview the first five training rows
df.head(n=5)

In [None]:
## Collapse the six label columns into one binary target: 1 when any label is set, else 0
label_totals = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
df['y'] = label_totals.gt(0).astype(int)
## Keep only the comment text (renamed to `text`) and the target
df = df.loc[:, ['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(n=5)

In [None]:
## Reading file - validation data of the severity-rating competition
val = pd.read_csv(filepath_or_buffer='../input/jigsaw-toxic-severity-rating/validation_data.csv')
## Reading file - comments to score for the submission
cs = pd.read_csv(filepath_or_buffer='../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
## Preview the first five validation rows
val.head(n=5)

In [None]:
## Preview the first five comments to score
cs.head(n=5)

In [None]:
## Class balance check: number of rows for each value of y
df.loc[:, "y"].value_counts()

Class 0 (no toxicity label) has far more rows than class 1 (at least one toxicity label), so we undersample class 0 by random sampling until both classes have the same number of rows.

In [None]:
## Undersample the majority class (y == 0) down to the size of the minority class (y == 1).
## The target size is computed from the data instead of being hard-coded, and the fixed
## random_state makes the sample identical on every Restart & Run All.
negatives = df[df["y"] == 0]
n_positive = int((df["y"] == 1).sum())
## min() guards against asking for more rows than exist when y == 0 is not the larger class
df_sampled = negatives.sample(n=min(n_positive, len(negatives)), random_state=42)

In [None]:
## Balanced frame: every y == 1 row followed by the sampled y == 0 rows
positives = df.loc[df["y"] == 1]
df = pd.concat([positives, df_sampled])

In [None]:
## Re-check the class counts after undersampling
df.loc[:, "y"].value_counts()

### Cleaning and Text Processing

##### Removing Punctuations

In [None]:
## Characters that remove_punctuation deletes: the punctuation set from the `string` module
PUNCT_REMOVAL = string.punctuation
## Display the set so the reader can see exactly which characters are stripped
PUNCT_REMOVAL

In [None]:
## Function to remove punctuations
def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT_REMOVAL deleted.

    The input is coerced with ``str()`` first, the same way ``remove_stopwords``
    does, so a non-string cell (for example a NaN from a missing comment) is
    cleaned instead of raising ``AttributeError``.
    """
    # The third argument of str.maketrans lists the characters to delete
    delete_table = str.maketrans("", "", PUNCT_REMOVAL)
    return str(text).translate(delete_table)

##### Remove Stopwords

In [None]:
## English stopword list from NLTK, held in a set for fast membership tests
STOPWORDS = {word for word in stopwords.words('english')}

In [None]:
## Function to remove stopwords
def remove_stopwords(text):
    """Return `text` with stopwords dropped and the remaining words joined by single spaces.

    Each word is compared in lower case because the NLTK English stopword list is
    lower case; a case-sensitive test let capitalised stopwords such as "The" or
    "And" slip through. Words that are kept retain their original casing.
    """
    return " ".join(word for word in str(text).split() if word.lower() not in STOPWORDS)

In [None]:
## Strip punctuation first, then drop stopwords from the punctuation-free text.
## The second step must call remove_stopwords; calling remove_punctuation again
## leaves every stopword in the "without stopwords" column.
df["without punc"] = df["text"].apply(remove_punctuation)
df["without stopwords"] = df["without punc"].apply(remove_stopwords)

### Converting text to number format - (Tf-idf)

In [None]:
## Tf-idf vectoriser over single words (unigrams); the default n-gram range is spelled out
vec = TfidfVectorizer(ngram_range=(1, 1))

In [None]:
## Creating features
## The vectoriser must be fitted on text cleaned the same way as the text it later scores
## (`cs` is passed through remove_punctuation and then remove_stopwords before vec.transform);
## fitting on the raw column makes the learned vocabulary and idf weights mismatch the inference input.
train_text = df['text'].apply(remove_punctuation).apply(remove_stopwords)
X = vec.fit_transform(train_text)
y = df['y']

### Fitting the model

In [None]:
## Hold out 20% of the rows to check how the model performs on unseen data.
## random_state pins the shuffle so the reported metrics are reproducible;
## stratify=y keeps the class ratio identical in the train and test parts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
## Building/defining the model
## - gamma is dropped: it is a coefficient of the rbf/poly/sigmoid kernels and has no effect on a linear kernel.
## - probability=True is required for predict_proba later on; it fits an internal cross-validated
##   calibration that shuffles the data, so random_state is fixed to make the scores reproducible.
SVM = svm.SVC(kernel='linear', probability=True, random_state=42)

In [None]:
## Train the classifier on the training part of the split
SVM.fit(X=X_train, y=y_train)

In [None]:
## Predicted class labels for the held-out rows
pred = SVM.predict(X=X_test)

In [None]:
## Classification report on the held-out rows
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

### Important Logs
1. The model is trained on unigrams.
2. The model takes a long time to run because it is an SVM.

### Submission

In [None]:
## Preview the comments that will be scored
cs.head(n=5)

In [None]:
## Add cleaned versions of the text: punctuation stripped, then stopwords dropped
cs["no_punct"] = cs["text"].apply(remove_punctuation)
cs["no_stopwords"] = cs["no_punct"].apply(remove_stopwords)
cs.head(n=5)

In [None]:
## Vectorise the cleaned comments with the already-fitted Tf-idf vocabulary (no re-fitting)
X = vec.transform(raw_documents=cs['no_stopwords'])

In [None]:
## Class probabilities for every comment to score
pred_proba = SVM.predict_proba(X=X)

In [None]:
## Display the full probability array returned by predict_proba
pred_proba

In [None]:
## Display only the column at index 1 - presumably the probability of y == 1,
## assuming SVM.classes_ is ordered [0, 1]; confirm before relying on it
pred_proba[:, 1]

In [None]:
## Use the column at index 1 of the probability array as the submission score
positive_proba = pred_proba[:, 1]
cs['score'] = positive_proba

In [None]:
## Write the id and score columns, without the index, as the submission file
submission = cs[['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)