# Make Dataset

In [1]:
import pandas as pd

# Load dataset from train.csv
df_train = pd.read_csv('data/train.csv')

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [2]:
# Toxicity columns
toxicity_columns = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]

# 1. Removal of Stop Words

Stop words are the words which are commonly used and removed from the sentence as pre-step in different Natural Language Processing (NLP) tasks. Example of stop words are: ‘a’, ‘an’, ‘the’, ‘this’, ‘not’ etc. Every tool uses a bit different set of stop words list that it removes but this technique is avoided in cases where phrase structure matters like in this case of Sentiment Analysis.

In [3]:
from gensim.parsing.preprocessing import remove_stopwords

df_train['text_stopwords_removed'] = df_train['comment_text'].apply(remove_stopwords)

# 2. Tokenization

Tokenization is the process in which the sentence/text is split into array of words called tokens. This helps to do transformations on each words separately and this is also required to transform words to numbers. There are different ways of performing tokenization. I have explained these ways in my previous post under Tokenization section, so if you are interested you can check it out.

In [4]:
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_train['tokenized_text_without_stopwords'] = [simple_preprocess(line, deacc=True) for line in df_train['text_stopwords_removed']]

# Create a is_toxic column: 1 if any label is 1, else 0
df_train['result'] = df_train[toxicity_columns].max(axis=1)

df_train[['tokenized_text_without_stopwords', 'text_stopwords_removed']].head(10)

Unnamed: 0,tokenized_text_without_stopwords,text_stopwords_removed
0,"[explanation, why, edits, username, hardcore, ...",Explanation Why edits username Hardcore Metall...
1,"[aww, he, matches, background, colour, seeming...",D'aww! He matches background colour I'm seemin...
2,"[hey, man, trying, edit, war, it, guy, constan...","Hey man, I'm trying edit war. It's guy constan..."
3,"[more, can, real, suggestions, improvement, wo...",""" More I can't real suggestions improvement - ..."
4,"[you, sir, hero, any, chance, remember, page, ...","You, sir, hero. Any chance remember page that'..."
5,"[congratulations, well, use, tools, well, talk]",""" Congratulations well, use tools well. · talk """
6,"[cocksucker, before, you, piss, around, on, my...",COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
7,"[your, vandalism, matt, shirvington, article, ...",Your vandalism Matt Shirvington article revert...
8,"[sorry, word, nonsense, offensive, you, anyway...","Sorry word 'nonsense' offensive you. Anyway, I..."
9,"[alignment, subject, contrary, dulithgow]",alignment subject contrary DuLithgow


# 3. Stemming

Stemming process reduces the words to its’ root word. Unlike Lemmatization which uses grammar rules and dictionary for mapping words to root form, stemming simply removes suffixes/prefixes. Stemming is widely used in the application of SEOs, Web search results, and information retrieval since as long as the root matches in the text somewhere it helps to retrieve all the related documents in the search.

In [5]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()

df_train['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df_train['tokenized_text_without_stopwords'] ]

df_train[['stemmed_tokens', 'tokenized_text_without_stopwords']].head(10)

Unnamed: 0,stemmed_tokens,tokenized_text_without_stopwords
0,"[explan, why, edit, usernam, hardcor, metallic...","[explanation, why, edits, username, hardcore, ..."
1,"[aww, he, match, background, colour, seemingli...","[aww, he, matches, background, colour, seeming..."
2,"[hei, man, try, edit, war, it, gui, constantli...","[hey, man, trying, edit, war, it, guy, constan..."
3,"[more, can, real, suggest, improv, wonder, sec...","[more, can, real, suggestions, improvement, wo..."
4,"[you, sir, hero, ani, chanc, rememb, page, tha...","[you, sir, hero, any, chance, remember, page, ..."
5,"[congratul, well, us, tool, well, talk]","[congratulations, well, use, tools, well, talk]"
6,"[cocksuck, befor, you, piss, around, on, my, w...","[cocksucker, before, you, piss, around, on, my..."
7,"[your, vandal, matt, shirvington, articl, reve...","[your, vandalism, matt, shirvington, article, ..."
8,"[sorri, word, nonsens, offens, you, anywai, in...","[sorry, word, nonsense, offensive, you, anyway..."
9,"[align, subject, contrari, dulithgow]","[alignment, subject, contrary, dulithgow]"


# Split Dataset into Train and Test

In [6]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
# Train Test Split Function
def split_train_test(df_train, test_size=0.3, shuffle_state=True):
    # 1. Split first
    X = df_train.drop(columns=['result'])
    y = df_train['result']
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=15, shuffle=True)

    # 2. Resample only the training set
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, Y_train_resampled = ros.fit_resample(X_train, Y_train)
    print("Value counts for Train sentiments before resampling")
    print(Y_train.value_counts())
    print("Value counts for Train Resampled sentiments")
    print(Y_train_resampled.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train_resampled))
    print(type(Y_train_resampled))
    X_train_resampled = X_train_resampled.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    Y_train_resampled = Y_train_resampled.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)
    print(X_train_resampled.head())
    return X_train_resampled, X_test, Y_train_resampled, Y_test 

# Call the train_test_split
X_train, X_test, Y_train, Y_test = split_train_test(df_train)

X_train['stemmed_tokens'].tail(60)

Value counts for Train sentiments before resampling
result
0    100315
1     11384
Name: count, dtype: int64
Value counts for Train Resampled sentiments
result
0    100315
1    100315
Name: count, dtype: int64
Value counts for Test sentiments
result
0    43031
1     4841
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
                 id                                       comment_text  toxic  \
0  52772c567777757c  and being a female you are acting so cruel whi...      0   
2  828d5c69c2f283a0  "\n\nNoitall. Like I have said all I am trying...      0   
3  6161150e0239aaff  "\nOh, look how sweet he is. And what happened...      0   
4  176fc566031ab49a  "\n\n Alleged Moon Landing \n\nUntil it's prov...      0   

   severe_toxic  obscene  threat  insult  identity_hate  \
0             0        0       0       0              0   
1             0        0       0       0              0   
2             0        0       0       0     

200570                       [cunt, cunt, cunt, cunt, cunt]
200571    [what, dumbass, slimvirgin, which, make, reali...
200572    [rape, kati, perri, peni, night, dildo, fun, s...
200573                          [hi, wanna, suck, ur, cock]
200574    [februari, utc, perhap, quit, remov, comment, ...
200575    [and, sourc, suffici, reliabl, accus, amount, ...
200576              [want, an, answer, you, son, of, bitch]
200577                                                   []
200578                         [onorem, is, still, faugott]
200579    [reckon, stand, fred, phelp, never, stand, hom...
200580                   [fuck, you, re, drunk, son, bitch]
200581    [flai, aliv, fking, stalker, shall, enjoi, scr...
200582    [you, gayreek, don, open, mouth, stink, sperm,...
200583    [import, note, charl, vandal, troll, malcont, ...
200584    [guyzero, bite, me, go, ahead, and, block, the...
200585    [rick, the, dick, check, out, the, common, pag...
200586    [anim, feel, free, eras, becau

# Train Word2Vec Model

In [8]:
from gensim.models import Word2Vec
import time
# Skip-gram model (sg = 1)
size = 500
window = 3
min_count = 1
workers = 8
sg = 1

OUTPUT_FOLDER = 'data/'

word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(size) + '.model'
start_time = time.time()
stemmed_tokens = pd.Series(df_train['stemmed_tokens']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = size, workers = workers, window = window, sg = sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 29.058432817459106


# Load Model

In [9]:
import numpy as np
from gensim.models import Word2Vec

# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)

# Unique ID of the word
print("Index of the word 'action':")
print(sg_w2v_model.wv.get_index("action"))

print("Vector generated for the word 'action':")
print(sg_w2v_model.wv["action"])

# Total Number of the words
print("Total number of unique words in the vocabulary:")
print(len(sg_w2v_model.wv.index_to_key))

# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(sg_w2v_model.vector_size)
# Get the mean for the vectors for an example review
# print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model.wv[token] for token in df_train['stemmed_tokens'][0]], axis=0))

Index of the word 'action':
311
Vector generated for the word 'action':
[-7.02268556e-02  2.64004111e-01  3.49453501e-02  8.36413875e-02
 -1.04161307e-01 -3.36617976e-01  3.51412803e-01  3.34654212e-01
  1.80916145e-01 -9.83745530e-02  2.29382336e-01 -1.83328748e-01
  2.50296026e-01 -4.66472208e-02  6.54150844e-02 -4.82346527e-02
  7.50428587e-02 -2.22390518e-01 -1.73915878e-01 -1.66600481e-01
  4.00683843e-03 -2.67991781e-01 -1.91588089e-01 -2.29298130e-01
  9.22734737e-02 -3.40974003e-01  2.24473774e-02  1.63289700e-02
 -2.97569901e-01  2.46230006e-01  1.34298891e-01  1.36645615e-01
 -2.29113802e-01  8.81111547e-02 -1.41383726e-02 -5.15145436e-02
  1.14522144e-01 -3.69274765e-01 -8.27937052e-02 -1.35492876e-01
 -4.29192364e-01 -8.82483050e-02 -6.59475505e-01  1.59609213e-01
 -1.00657500e-01 -6.68454766e-02  5.95811792e-02 -1.47030249e-01
 -1.77758053e-01 -1.89619020e-01 -5.32316081e-02  7.62736946e-02
 -8.84717479e-02  7.46044703e-03 -1.23321831e-01  8.97745565e-02
 -5.47409318e-02 -

# We now have the Word2Vec model trained on our dataset. We can load this model whenever we want to use it for generating word vectors for our text data.

We can make use of our X_train and X_test datasets to create the Word2Vec representations for our training and testing data respectively. We will create two separate CSV files to store these representations.

In [10]:
# Store the vectors for train data in following file
word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'
# with open(word2vec_filename, 'w+') as word2vec_file:
#     for index, row in X_train.iterrows():
#         model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)).tolist()
#         if index == 0:
#             header = ",".join(str(ele) for ele in range(size))
#             word2vec_file.write(header)
#             word2vec_file.write("\n")
#         # Check if the line exists else it is vector of zeros
#         if type(model_vector) is list:  
#             line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
#         else:
#             line1 = ",".join([str(0) for i in range(size)])
#         word2vec_file.write(line1)
#         word2vec_file.write('\n')

## Train Sentiment Classifier via Logistic Regression

We'll use scikit-learn's `LogisticRegression` for fast, interpretable binary classification.

In [11]:
from sklearn.linear_model import LogisticRegression
import time

# Load the training data from the CSV file
word2vec_df = pd.read_csv(OUTPUT_FOLDER + 'train_review_word2vec.csv')

print("Shape of the word2vec training data:", word2vec_df.shape)
print("Shape of Y_train:", Y_train.shape)

# Initialize Logistic Regression with parallel jobs
logreg = LogisticRegression(max_iter=200, n_jobs=-1, random_state=42, class_weight="balanced")

start_time = time.time()
logreg.fit(word2vec_df, Y_train)
print("Time taken to train Logistic Regression: " + str(time.time() - start_time))

Shape of the word2vec training data: (200630, 500)
Shape of Y_train: (200630,)
Time taken to train Logistic Regression: 4.107938051223755


In [12]:
# Evaluate Logistic Regression on test set
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

test_features_word2vec = []
vec_size = sg_w2v_model.vector_size  # should be 500

for _, row in X_test.iterrows():
    vectors = [sg_w2v_model.wv[t] for t in row["stemmed_tokens"] if t in
sg_w2v_model.wv]
    test_features_word2vec.append(np.mean(vectors, axis=0) if vectors else
np.zeros(vec_size))

test_features_word2vec = pd.DataFrame(test_features_word2vec,
columns=word2vec_df.columns)
test_predictions_logreg = logreg.predict(test_features_word2vec)
print(classification_report(Y_test, test_predictions_logreg, zero_division=0))
# # Evaluate Logistic Regression on test set
# from sklearn.metrics import classification_report

# test_features_word2vec = []
# for index, row in X_test.iterrows():
#     model_vector = np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)
#     if type(model_vector) is list:
#         test_features_word2vec.append(model_vector)
#     else:
#         test_features_word2vec.append(np.array([0 for i in range(500)]))

# test_features_word2vec = pd.DataFrame(test_features_word2vec, columns=word2vec_df.columns)
# test_predictions_logreg = logreg.predict(test_features_word2vec)
# print(classification_report(Y_test, test_predictions_logreg))

              precision    recall  f1-score   support

           0       0.99      0.63      0.77     43031
           1       0.23      0.96      0.37      4841

    accuracy                           0.67     47872
   macro avg       0.61      0.80      0.57     47872
weighted avg       0.92      0.67      0.73     47872



# Train Sentiment Classifier via Random Forest

In [None]:
# import time
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score

# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
# # Load the training data from the CSV file
# word2vec_df = pd.read_csv(OUTPUT_FOLDER + 'train_review_word2vec.csv')

# start_time = time.time()
# # Fit the Random Forest Classifier
# rf_classifier.fit(word2vec_df, Y_train)
# print("Time taken to train Random Forest Classifier: " + str(time.time() - start_time))

Time taken to train Random Forest Classifier: 36.495548486709595


In [None]:
# from sklearn.metrics import classification_report
# test_features_word2vec = []
# for index, row in X_test.iterrows():
#     model_vector = np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)
#     if type(model_vector) is list:
#         test_features_word2vec.append(model_vector)
#     else:
#         test_features_word2vec.append(np.array([0 for i in range(size)]))
# test_predictions_word2vec = rf_classifier.predict(test_features_word2vec)
# print(classification_report(Y_test,test_predictions_word2vec))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00     43031
           1       0.10      1.00      0.18      4841

    accuracy                           0.10     47872
   macro avg       0.05      0.50      0.09     47872
weighted avg       0.01      0.10      0.02     47872



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Try with Decision Tree Classifier

In [None]:
# import time
# #Import the DecisionTreeeClassifier
# from sklearn.tree import DecisionTreeClassifier
# # Load from the filename
# word2vec_df = pd.read_csv(word2vec_filename)
# #Initialize the model
# clf_decision_word2vec = DecisionTreeClassifier()

# start_time = time.time()
# # Fit the model
# clf_decision_word2vec.fit(word2vec_df, Y_train)
# print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

In [None]:
# from sklearn.metrics import classification_report
# test_features_word2vec = []
# for index, row in X_test.iterrows():
#     model_vector = np.mean([sg_w2v_model[token] for token in row['stemmed_tokens']], axis=0)
#     if type(model_vector) is list:
#         test_features_word2vec.append(model_vector)
#     else:
#         test_features_word2vec.append(np.array([0 for i in range(500)]))
# test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
# print(classification_report(Y_test['sentiment'],test_predictions_word2vec))