In [21]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline


In [2]:
# Load the labelled toxic-comment training set from the project's data directory.
TRAIN_CSV_PATH = '../data/trainToxic.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:

# The stop-word corpus is read below, but nothing else in the notebook installs
# it; on a fresh environment `stopwords.words(...)` raises LookupError.
# Download it only when it is missing so repeat runs stay offline-friendly.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# English stop words held in a set for constant-time membership tests.
STOPWORDS = set(stopwords.words('english'))

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Matches one punctuation character, ASCII digit, carriage return, tab or newline.
PUNCT_DIGIT_REGEX = re.compile(f"[{re.escape(string.punctuation)}0-9\r\t\n]")

In [4]:
from typing import List

def Tokenization(text) -> List[str]:
    """Turn a raw comment into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character matched by ``PUNCT_DIGIT_REGEX`` with a space;
      3. split on whitespace and strip non-ASCII characters from each word;
      4. discard words found in ``STOPWORDS``;
      5. lemmatize what is left;
      6. keep only lemmas longer than two characters.

    Args:
        text: The comment to tokenize (must support ``.lower()``).

    Returns:
        The surviving lemmas, in their original order.
    """
    cleaned = PUNCT_DIGIT_REGEX.sub(" ", text.lower())

    tokens: List[str] = []
    for raw_word in cleaned.split():
        # Dropping non-ASCII characters may leave an empty string; the length
        # check below removes it.
        ascii_word = raw_word.encode('ascii', 'ignore').decode('ascii')
        if ascii_word in STOPWORDS:
            continue
        lemma = lemmatizer.lemmatize(ascii_word)
        if len(lemma) > 2:
            tokens.append(lemma)

    return tokens

In [26]:
# Column holding the raw comment text, and the six binary target columns.
TEXT_COL = "comment_text"
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
comments = data[TEXT_COL]
targets = data[LABELS]
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    targets,
    test_size=0.3,
    random_state=42,
)


In [27]:
# Fetch the WordNet corpus; the WordNetLemmatizer used inside `Tokenization` relies on it.
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
# TF-IDF features over 1- to 3-grams of the tokens produced by `Tokenization`.
tfidf = TfidfVectorizer(
    tokenizer=Tokenization,
    # `token_pattern` is ignored when a custom tokenizer is supplied; setting it
    # to None avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    ngram_range=(1, 3),
    min_df=5,             # drop terms seen in fewer than 5 documents
    max_df=0.9,           # drop terms seen in more than 90% of documents
    max_features=100000   # cap on vocabulary size
)

# Learn vocabulary and IDF weights from the training split only, then apply the
# fitted vectorizer to the test split.
# NOTE: this rebinds X_train / X_test from raw text to TF-IDF matrices, so the
# train/test split cell must be re-run before this cell is executed again.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)



Modelling: training and comparing candidate classifiers


In [29]:
# Multi-label scorers passed to cross_validate under these keys.
scoring = dict(
    f1_micro=make_scorer(f1_score, average="micro"),
    f1_macro=make_scorer(f1_score, average="macro"),
    precision_micro=make_scorer(precision_score, average="micro"),
    recall_micro=make_scorer(recall_score, average="micro"),
)

# Shuffled 5-fold split with a fixed seed, shared by every candidate model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers, each wrapped one-vs-rest for the multi-label targets.
models = {
    "Logistic Regression": OneVsRestClassifier(
        LogisticRegression(
            penalty="l2",
            C=1.0,
            solver="saga",
            max_iter=2000,
            random_state=42,
        )
    ),
    "Naive Bayes": OneVsRestClassifier(MultinomialNB(alpha=0.1)),
    "LinearSVC": OneVsRestClassifier(
        LinearSVC(
            C=1.0,
            class_weight='balanced',
            max_iter=2000,
            random_state=42,
        )
    ),
}

# Report column -> key under which cross_validate stores that metric's fold scores.
metric_columns = {
    "F1 Micro": "test_f1_micro",
    "F1 Macro": "test_f1_macro",
    "Precision Micro": "test_precision_micro",
    "Recall Micro": "test_recall_micro",
}

# NOTE(review): X_train has already been vectorised by a TF-IDF fitted on the
# whole training split, so each validation fold contributed to the vocabulary
# and IDF weights. The fold scores may therefore be somewhat optimistic --
# consider fitting the vectorizer inside the Pipeline instead.
res = []
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    pipeline = Pipeline(steps=[("clf", model)])

    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        scoring=scoring,
        cv=kf,
        n_jobs=-1,
    )

    # Average each metric over the folds to get one summary row per model.
    row = {"Model": model_name}
    for column, score_key in metric_columns.items():
        row[column] = scores[score_key].mean()
    res.append(row)

# Best micro-F1 first.
results_df = pd.DataFrame(res).sort_values("F1 Micro", ascending=False)

print(results_df)


Training Logistic Regression...

Training Naive Bayes...

Training LinearSVC...
                 Model  F1 Micro  F1 Macro  Precision Micro  Recall Micro
2            LinearSVC  0.690667  0.563386         0.648123      0.739293
0  Logistic Regression  0.631242  0.424177         0.893295      0.488184
1          Naive Bayes  0.618259  0.394948         0.791440      0.507293


Only LinearSVC reaches a reasonable F1 here (micro ≈ 0.69, macro ≈ 0.56), and even that is not great.

These are not great numbers, just average for now.
