In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

PRE_TRAIN_DATA_PATH = os.getenv("PRE_TRAIN_DATA_PATH")
PRE_TEST_DATA_PATH = os.getenv("PRE_TEST_DATA_PATH")
TEST_LABELS_PATH = os.getenv("TEST_LABELS_PATH")
LOG_REG_PATH = os.getenv("LOG_REG_PATH")
DECISION_TREE_PATH = os.getenv("DECISION_TREE_PATH")
VECTORIZER_PATH = os.getenv("VECTORIZER_PATH")
RANDOM_FOREST_PATH = os.getenv("RANDOM_FOREST_PATH")

In [None]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

In [4]:
train_df = pd.read_csv(str(PRE_TRAIN_DATA_PATH))
test_df = pd.read_csv(str(PRE_TEST_DATA_PATH))

In [5]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X_train = train_df['lemmatized'].fillna("").astype(str)
y_train = train_df[labels]

X_test = test_df['lemmatized'].fillna("").astype(str)
y_test = test_df[labels]

In [6]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
log_clf = MultiOutputClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
log_clf.fit(X_train_vec, y_train)

In [8]:
y_pred_log = log_clf.predict(X_test_vec)
print(classification_report(y_test, y_pred_log, target_names=labels))

               precision    recall  f1-score   support

        toxic       0.41      0.92      0.57      6090
 severe_toxic       0.10      0.92      0.19       367
      obscene       0.42      0.91      0.57      3691
       threat       0.12      0.87      0.22       211
       insult       0.34      0.90      0.50      3427
identity_hate       0.17      0.86      0.28       712

    micro avg       0.34      0.91      0.49     14498
    macro avg       0.26      0.90      0.39     14498
 weighted avg       0.37      0.91      0.52     14498
  samples avg       0.06      0.09      0.07     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
svd = TruncatedSVD(n_components=50,random_state=42)
X_train_svd = svd.fit_transform(X_train_vec)
X_test_svd = svd.transform(X_test_vec)

In [10]:
tree_clf = MultiOutputClassifier(DecisionTreeClassifier(class_weight='balanced', max_depth=15))
tree_clf.fit(X_train_vec, y_train)

In [11]:
y_pred_tree = tree_clf.predict(X_test_vec)
print(classification_report(y_test, y_pred_tree, target_names=labels))

               precision    recall  f1-score   support

        toxic       0.56      0.59      0.57      6090
 severe_toxic       0.08      0.83      0.15       367
      obscene       0.48      0.74      0.58      3691
       threat       0.05      0.76      0.10       211
       insult       0.41      0.68      0.51      3427
identity_hate       0.12      0.71      0.20       712

    micro avg       0.33      0.66      0.44     14498
    macro avg       0.28      0.72      0.35     14498
 weighted avg       0.46      0.66      0.52     14498
  samples avg       0.04      0.06      0.04     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
forest_clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=10,n_jobs=-1))
forest_clf.fit(X_train_svd, y_train)

In [13]:
y_pred_forest = forest_clf.predict(X_test_svd)
print(classification_report(y_test, y_pred_forest, target_names=labels))

               precision    recall  f1-score   support

        toxic       0.67      0.51      0.58      6090
 severe_toxic       0.25      0.14      0.18       367
      obscene       0.72      0.54      0.62      3691
       threat       0.22      0.02      0.03       211
       insult       0.62      0.40      0.49      3427
identity_hate       0.46      0.06      0.11       712

    micro avg       0.66      0.45      0.54     14498
    macro avg       0.49      0.28      0.34     14498
 weighted avg       0.64      0.45      0.53     14498
  samples avg       0.05      0.04      0.04     14498



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
joblib.dump(vectorizer,str(VECTORIZER_PATH))
joblib.dump(log_clf,str(LOG_REG_PATH))
joblib.dump(tree_clf,str(DECISION_TREE_PATH))
joblib.dump(forest_clf,str(RANDOM_FOREST_PATH))

['/Users/ronakpanchal/Desktop/College/Artificial intelligence /AI Project/models/random_forest.pkl']