In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import svm
from sklearn.metrics import mean_absolute_error
from time import time
from scipy.stats import rankdata
from sklearn import preprocessing
from sklearn.linear_model import PassiveAggressiveRegressor
from scipy.stats import rankdata

In [None]:
# Read the scored Ruddit comments CSV and separate the regression target.
path = '../input/ruddit-papers-comments-scored/ruddit_comments_score.csv'
df = pd.read_csv(filepath_or_buffer=path)
# pop() removes the 'score' column from df in place and returns it as a Series.
y = df.pop(item='score')
print('Ruddit Dataset - Scores Loaded! Shape: ', y.shape)

In [None]:
# Load the Ruddit comment bodies, already encoded in sentence2vec format (.npy).
# NOTE(review): rows are presumably aligned one-to-one with the scores in `y` - confirm.
path = '../input/ruddit-comments-scored-raw-s2vnpy/ruddit_scored_body_s2v.npy'
X_s2v = np.load(file=path)
print('Ruddit Dataset - Comments Loaded! Shape: ', X_s2v.shape)

In [None]:
# Read the Jigsaw validation CSV; its less_toxic / more_toxic columns are
# referenced by the (currently commented-out) sentence-encoding cell below.
validation_data = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/validation_data.csv'
)
print('Validation Dataset Loaded! Shape: ', validation_data.shape)

In [None]:
# Hold out 10% of the vectorized Ruddit comments for evaluation.
# random_state is fixed so the shuffled split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_s2v,
    y,
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

# Report the shape of each partition, in the order named by the header line.
print('X_train.shape, X_test.shape, y_train.shape, y_test.shape')
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Train an XGBoost regressor on the sentence2vec features and report its MAE
# on the hold-out split.
# Early stopping monitors the hold-out split and stops after 5 rounds without
# improvement.
# NOTE(review): the same hold-out split drives early stopping and the MAE
# reported below, so that MAE is optimistic - consider a separate validation split.
xgb_regressor = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
try:
    # Legacy API: early_stopping_rounds passed directly to fit().
    xgb_regressor.fit(X_train, y_train, early_stopping_rounds=5,
                      eval_set=[(X_test, y_test)], verbose=False)
except TypeError:
    # Newer XGBoost releases reject that fit() keyword; there it is an
    # estimator parameter, so set it on the model and fit again.
    xgb_regressor.set_params(early_stopping_rounds=5)
    xgb_regressor.fit(X_train, y_train,
                      eval_set=[(X_test, y_test)], verbose=False)
predictions = xgb_regressor.predict(X_test)
# mean_absolute_error is documented as (y_true, y_pred).
mae = mean_absolute_error(y_test, predictions)
print('XGB Regressor - MAE: ', mae)

In [None]:
# Train a NuSVR (default hyperparameters) on the sentence2vec features and
# report its MAE on the hold-out split.
# This is the model used to score the submission at the end of the notebook.
svm_s2v = svm.NuSVR()
svm_s2v.fit(X_train, y_train)
predictions = svm_s2v.predict(X_test)
# mean_absolute_error is documented as (y_true, y_pred); MAE is symmetric, so
# the reported value is unchanged, but the call now matches the signature.
mae = mean_absolute_error(y_test, predictions)
print('SVM NuSVR - MAE: ', mae)

In [None]:
# # Train our model using PassiveAggressiveRegressor
# par_s2v = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
# par_s2v.fit(X_train, y_train)
# predictions = par_s2v.predict(X_test)
# mae = mean_absolute_error(y_test, predictions)
# print('PassiveAgressiveRegressor - MAE: ', mae)

In [None]:
# # Scale our dataset for use in our models
# scl_s2v = preprocessing.StandardScaler()
# scl_s2v.fit(X_train)
# X_train_scl = scl_s2v.transform(X_train)
# X_test_scl = scl_s2v.transform(X_test)
# print('Scaled X_Train and scaled X_test:', X_train_scl.shape, X_test_scl.shape)

In [None]:
# # Train our model
# # Using XGBoost trained on sentence2vector in scaled dataset
# xgb_regressor_scl = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
# xgb_regressor_scl.fit(X_train_scl, y_train, early_stopping_rounds=5,
#              eval_set=[(X_test_scl, y_test)], verbose=False)
# predictions = xgb_regressor_scl.predict(X_test_scl)
# mae = mean_absolute_error(predictions, y_test)
# print('XGB Regressor Escaled - MAE: ', mae)

In [None]:
# # Train our model
# # Using SVM trained on sentence2vector in scaled dataset
# svm_s2v_scl = svm.NuSVR()
# svm_s2v_scl.fit(X_train_scl, y_train)
# predictions = svm_s2v_scl.predict(X_test_scl)
# mae = mean_absolute_error(predictions, y_test)
# print('SVM NuSVR Scaled - MAE: ', mae)

In [None]:
# # Train our model using PassiveAggressiveRegressor on scaled dataset
# par_s2v_scl = PassiveAggressiveRegressor(max_iter=10000, random_state=0)
# par_s2v_scl.fit(X_train_scl, y_train)
# predictions = par_s2v_scl.predict(X_test_scl)
# mae = mean_absolute_error(y_test, predictions)
# print('PassiveAgressiveRegressor Scaled - MAE: ', mae)

In [None]:
# Transform our validation data into sentence2vec
# from sentence_transformers import SentenceTransformer
# st = SentenceTransformer('all-mpnet-base-v2')
# less_toxic_s2v = st.encode(validation_data.less_toxic, convert_to_numpy=True, show_progress_bar=True)
# more_toxic_s2v = st.encode(validation_data.more_toxic, convert_to_numpy=True, show_progress_bar=True)
# np.save('less_toxic_s2v', less_toxic_s2v)
# np.save('more_toxic_s2v', more_toxic_s2v)
# print('Less Toxic s2v Shape: ', less_toxic_s2v.shape)
# print('More Toxic s2v Shape: ', more_toxic_s2v.shape)

In [None]:
# # Test our XGB model in validation data
# xgb_less_toxic_pred = xgb_regressor.predict(less_toxic_s2v)
# xgb_more_toxic_pred = xgb_regressor.predict(more_toxic_s2v)
# print('Less Toxic Shape:', xgb_less_toxic_pred.shape, xgb_less_toxic_pred[:5])
# print('More Toxic Shape:', xgb_more_toxic_pred.shape, xgb_more_toxic_pred[:5])
# xgb_score = np.array(xgb_less_toxic_pred < xgb_more_toxic_pred).mean()
# print('XGB Score on Validation Data:', xgb_score)

In [None]:
# # Test our SVM NuSVR model in validation data
# svm_less_toxic_pred = svm_s2v.predict(less_toxic_s2v)
# svm_more_toxic_pred = svm_s2v.predict(more_toxic_s2v)
# print('Less Toxic Shape:', svm_less_toxic_pred.shape, svm_less_toxic_pred[:5])
# print('More Toxic Shape:', svm_more_toxic_pred.shape, svm_more_toxic_pred[:5])
# svm_score = np.array(svm_less_toxic_pred < svm_more_toxic_pred).mean()
# print('SVM NuSVR Score on Validation Data:', svm_score)

In [None]:
# # Test our PassiveAgressive model in validation data
# par_less_toxic_pred = par_s2v.predict(less_toxic_s2v)
# par_more_toxic_pred = par_s2v.predict(more_toxic_s2v)
# print('Less Toxic Shape:', par_less_toxic_pred.shape, par_less_toxic_pred[:5])
# print('More Toxic Shape:', par_more_toxic_pred.shape, par_more_toxic_pred[:5])
# par_score = np.array(par_less_toxic_pred < par_more_toxic_pred).mean()
# print('PassiveAgressive Score on Validation Data:', par_score)

In [None]:
# Load comments to score
c2s = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# # Transform using sentence2vec Transformer
# c2s_s2v = st.encode(c2s.text, convert_to_numpy=True, show_progress_bar=True)

# Load pre-transformed comments to score
c2s_s2v = np.load('../input/comments-to-score/comments_to_score_s2v.npy')

# Fail fast if the precomputed embeddings do not line up with the CSV rows,
# before spending time on prediction.
if len(c2s_s2v) != len(c2s):
    raise ValueError(
        f'Embedding rows ({len(c2s_s2v)}) do not match comments to score ({len(c2s)})'
    )

# Score every comment with the NuSVR model trained above.
scores = svm_s2v.predict(c2s_s2v)

# Convert raw scores to ranks; 'ordinal' gives every comment a distinct rank,
# with ties broken by order of appearance.
rank_ord = rankdata(scores, method='ordinal')

# Create our output
output = pd.DataFrame({'comment_id': c2s.comment_id, 'score': rank_ord})
output.to_csv('submission.csv', index=False)