# TFIDF + RIDGE Method

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Imports

This time I'm going to try the TF-IDF + Ridge technique, using the data from the toxic comment classification challenge.

In [None]:
import random

from pathlib import Path

# for vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# for regression model
from sklearn.linear_model import Ridge

## Data Pre-Processing

**Read the Files**

In [None]:
# File locations inside the Kaggle input directory. The training set comes
# from the toxic-comment classification challenge; the comments to score, the
# validation data and the sample submission come from the severity-rating
# competition.
train_path, test_path, valid_path, submission_path = (
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv',
    '../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
    '../input/jigsaw-toxic-severity-rating/validation_data.csv',
    '../input/jigsaw-toxic-severity-rating/sample_submission.csv',
)

In [None]:
# Load each CSV into its own DataFrame, in the order:
# train, test, valid, submission.
train, test, valid, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, valid_path, submission_path)
)

**ADD Scoring**

In [None]:
# Inspect the column names of the training frame before weighting the labels.
train.columns

Manually assign a score weight to each type of toxic comment.

In [None]:
# Hand-picked weight for each toxicity label column; a comment's final score
# is built from its labels multiplied by these weights.
scoring = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

In [None]:
# Scale every label column of the training data by its weight from `scoring`.
# NOTE: the columns are overwritten in place, so running this cell a second
# time multiplies them by the weights again.
for cat, weight in scoring.items():
    train[cat] = train[cat].mul(weight)

In [None]:
# Add all the weighted label scores into one 'score' column.
# The label columns are picked by name (every column that has a weight in
# `scoring`) instead of the positional slice 'toxic':'identity_hate', so the
# total no longer depends on those columns being contiguous or on which ones
# come first and last. Iterating over train.columns keeps the frame's own
# column order, so the values are summed in the same order as before.
train['score'] = train[
    [col for col in train.columns if col in scoring]
].sum(axis=1)

In [None]:
# Preview only the rows whose combined score is strictly positive.
train.loc[train['score'].gt(0)]

## Data Processing

Get the labels (scores) and the comment text from the training data, then vectorize the text so that it can be used for training.

In [None]:
# Regression targets and raw comment text, both taken from the training frame.
scores, comments = train['score'], train['comment_text']

In [None]:
# TF-IDF over character n-grams of length 3 to 5, using the 'char_wb'
# analyzer; min_df / max_df bound how rare or how common an n-gram may be
# across documents before it is dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)

# Fit the vectorizer on the training comments and vectorize them in one step.
comments_vec = vectorizer.fit_transform(comments)

In [None]:
# Display the vectorized training comments returned by fit_transform.
comments_vec

## Model

We will create a Ridge model to predict the scores for the comments. Since predicting the scores is a regression problem, we can use a regression model.

In [None]:
# Ridge regression with regularization strength alpha=0.8 and a fixed seed.
regressor = Ridge(alpha=0.8, random_state=99)

## Training

Fit the regression model on the toxic comments scoring data. We will use the vectorized comments and corresponding scores.

In [None]:
# Train the regressor: vectorized comments are the features, scores the targets.
regressor.fit(X=comments_vec, y=scores)

## Inference

**Process Test data**

Here we will make predictions on the test data (this is what we are asked for). First we have to vectorize the comments.

In [None]:
# Vectorize the comments to score with the already-fitted vectorizer
# (transform only, no refitting).
comments_to_predict = vectorizer.transform(raw_documents=test['text'])

In [None]:
# Display the vectorized test comments produced by vectorizer.transform.
comments_to_predict

**Prediction**

In [None]:
# Score the vectorized test comments with the trained regressor.
preds = regressor.predict(X=comments_to_predict)

In [None]:
# Show the predictions together with how many there are.
preds, len(preds)

Now, create a file to submit scores.

In [None]:
# Overwrite the 'score' column of the submission frame with our predictions.
submission = submission.assign(score=preds)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Read the written file back and preview its first two rows as a sanity check.
ans = pd.read_csv(filepath_or_buffer="./submission.csv")
ans.head(n=2)