In [1]:
# Basic Python package
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string 
import re

In [2]:
# File paths
train_file = r"C:\Users\prits\Downloads\Data\cleaned_ghc_train.csv"
test_file = r'C:\Users\prits\Downloads\Data\cleaned_ghc_test.csv'

# Load CSV files into pandas DataFrames
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [3]:
print(train_df)

                                                    text  hd  cv  vo label
0      he most likely converted to islam due to his n...   0   0   0     h
1      so ford lied about being a psychologist record...   0   0   0     h
2      jobs education ending abuse of nation californ...   0   0   0    nh
3      i share a lot of your values  like many who do...   0   0   0     h
4      i am so ready to get back to blogging  recipes...   0   0   0    nh
...                                                  ...  ..  ..  ..   ...
21771  im a fan of western civilization and one bedro...   0   0   0     h
21772  or  is she saying that muslims dont know how t...   0   0   0     h
21773  thank you to all my followers that follow me e...   0   0   0     h
21774                                   wednesday music    0   0   0    nh
21775                    this is a really big surprise     0   0   0     h

[21776 rows x 5 columns]


In [4]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
#word tokenisation
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

train_df['text'] = train_df['text'].astype(str)
# Download the necessary NLTK data files (only need to do this once)
nltk.download('punkt')

# Tokenize each text in the DataFrame
train_df['tokens'] = train_df['text'].apply(word_tokenize)

# Print the DataFrame with tokens
print(train_df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prits\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                    text  hd  cv  vo label  \
0      he most likely converted to islam due to his n...   0   0   0     h   
1      so ford lied about being a psychologist record...   0   0   0     h   
2      jobs education ending abuse of nation californ...   0   0   0    nh   
3      i share a lot of your values  like many who do...   0   0   0     h   
4      i am so ready to get back to blogging  recipes...   0   0   0    nh   
...                                                  ...  ..  ..  ..   ...   
21771  im a fan of western civilization and one bedro...   0   0   0     h   
21772  or  is she saying that muslims dont know how t...   0   0   0     h   
21773  thank you to all my followers that follow me e...   0   0   0     h   
21774                                   wednesday music    0   0   0    nh   
21775                    this is a really big surprise     0   0   0     h   

                                                  tokens  
0   

TF-IDF stands for Term Frequency Inverse Document Frequency of records. It can be defined as the calculation of how relevant a word in a series or corpus is to a text. The meaning increases proportionally to the number of times in the text a word appears but is compensated by the word frequency in the corpus (data-set).

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
text = train_df['text'].tolist()
label = train_df['label'].tolist()

def tfidf_embedding(text):
    vectorizer = TfidfVectorizer()
    embeddings = vectorizer.fit_transform(text)
    return embeddings

embeddings_tfidf = tfidf_embedding(text)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(embeddings_tfidf, label, test_size=0.2, random_state=42)


print(embeddings_tfidf)
print(embeddings_tfidf.shape)

  (0, 24880)	0.31710913639027155
  (0, 33748)	0.12467565088586496
  (0, 20751)	0.29949577210948686
  (0, 24714)	0.31179399174676026
  (0, 9590)	0.29949577210948686
  (0, 16715)	0.2388844980949804
  (0, 12447)	0.09547613390817968
  (0, 30263)	0.3310896501311571
  (0, 3841)	0.1626644731914859
  (0, 21095)	0.2508446739516041
  (0, 14985)	0.14531281938493537
  (0, 10012)	0.22911410847152885
  (0, 16704)	0.20776951929183204
  (0, 31641)	0.14079737944241671
  (0, 7356)	0.31710913639027155
  (0, 18500)	0.23549026573540505
  (0, 20598)	0.17471219660660267
  (0, 14575)	0.13770558967852184
  (1, 34475)	0.2195790341458713
  (1, 21519)	0.16041445402598734
  (1, 30069)	0.2967681272991259
  (1, 17319)	0.1530944619562374
  (1, 28127)	0.1991329040848775
  (1, 16045)	0.375163973280694
  (1, 27694)	0.2761296673148862
  :	:
  (21773, 22552)	0.21671870343794195
  (21773, 20868)	0.13371450055069928
  (21773, 16784)	0.07049797403608575
  (21773, 3706)	0.23748991109226655
  (21773, 21176)	0.11602063728705314

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def train_evaluate_rf(X_train_emb, X_test_emb, y_train, y_test):
    rf = RandomForestClassifier()
    rf.fit(X_train_emb, y_train)
    y_pred = rf.predict(X_test_emb)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

In [10]:
accuracy_tfidf, report_tfidf = train_evaluate_rf(X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf)
print(f'TF-IDF Accuracy: {accuracy_tfidf}')
print(f'TF-IDF Classification Report:\n{report_tfidf}')

TF-IDF Accuracy: 0.720615243342516
TF-IDF Classification Report:
              precision    recall  f1-score   support

           h       0.78      0.19      0.30      1407
          nh       0.72      0.97      0.83      2949

    accuracy                           0.72      4356
   macro avg       0.75      0.58      0.57      4356
weighted avg       0.74      0.72      0.66      4356



Word2Vec is an effort to map words to high-dimensional vectors to capture the semantic relationships between words.Words with similar meanings should have similar vector representations, according to the main principle of Word2Vec.

In [11]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


# Extract texts from the training dataframe
texts_train = train_df['text'].values

# Handle any NaN values in the text column
texts_train = np.where(pd.isnull(texts_train), '', texts_train)

# Tokenize the texts
tokenized_train = [word_tokenize(text) for text in texts_train]

# Train a Word2Vec model on the training texts
w2v_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=4)

# Function to create average Word2Vec embeddings for each text
def get_average_word2vec(tokens_list, model, vocabulary, num_features):
    feature_vec = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for token in tokens_list:
        if token in vocabulary:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[token])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Create embeddings for the training texts
vocabulary = set(w2v_model.wv.index_to_key)
embeddings_train = np.array([get_average_word2vec(tokens, w2v_model, vocabulary, 100) for tokens in tokenized_train])

print(embeddings_train)
# Split the embeddings and labels into training and testing sets (80% train, 20% test)
X_train_w2v2, X_test_w2v2, y_train_w2v2, y_test_w2v2 = train_test_split(embeddings_train, label, test_size=0.2, random_state=42)


[[-0.04996056  0.3125958  -0.10564645 ... -0.36408773 -0.11938274
  -0.25820625]
 [-0.15601371  0.3611218  -0.04419656 ... -0.28170615 -0.12261149
  -0.41080427]
 [-0.10266089  0.341702   -0.04805031 ... -0.19066872  0.06783475
  -0.10286985]
 ...
 [-0.29624313  0.6246821   0.17197324 ... -0.3617536  -0.04014009
  -0.28844056]
 [-0.04265551  0.3442541  -0.10895921 ... -0.37010425 -0.01953383
  -0.16869847]
 [-0.43342534  0.43683624 -0.14489152 ... -0.5334764  -0.06727689
  -0.47319642]]


In [12]:

accuracy_w2v2, report_w2v2 = train_evaluate_rf(X_train_w2v2, X_test_w2v2, y_train_w2v2, y_test_w2v2)
print(f'Word2Vec Accuracy: {accuracy_w2v2}')


Word2Vec Accuracy: 0.8266758494031221


OneHotEncoding:One hot encoding is a technique that we use to represent categorical variables as numerical values

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

df = train_df
# Initialize OneHotEncoder
encoder = OneHotEncoder()

# Encode categorical columns
encoded_data = encoder.fit_transform(df[['hd', 'cv', 'vo', 'label']])

# Convert the encoded data into a DataFrame with appropriate column names
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names(['hd', 'cv', 'vo', 'label']))

# Concatenate the original 'text' column with the encoded DataFrame
final_df = pd.concat([df[['text']], encoded_df], axis=1)
print(final_df)

# Assuming 'label' column is the target variable
X = final_df.drop(columns=['text'])  # Features
y = final_df[['label_h', 'label_nh']]  # Target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict labels for test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"OneHot accuracy: {accuracy:.2f}")



                                                    text  hd_0  hd_1  cv_0  \
0      he most likely converted to islam due to his n...   1.0   0.0   1.0   
1      so ford lied about being a psychologist record...   1.0   0.0   1.0   
2      jobs education ending abuse of nation californ...   1.0   0.0   1.0   
3      i share a lot of your values  like many who do...   1.0   0.0   1.0   
4      i am so ready to get back to blogging  recipes...   1.0   0.0   1.0   
...                                                  ...   ...   ...   ...   
21771  im a fan of western civilization and one bedro...   1.0   0.0   1.0   
21772  or  is she saying that muslims dont know how t...   1.0   0.0   1.0   
21773  thank you to all my followers that follow me e...   1.0   0.0   1.0   
21774                                   wednesday music    1.0   0.0   1.0   
21775                    this is a really big surprise     1.0   0.0   1.0   

       cv_1  vo_0  vo_1  label_h  label_nh  
0       0.0   1.0 

Label Encoding:Label Encoding is a technique that is used to convert categorical columns into numerical ones so that they can be fitted by machine learning models which only take numerical data

In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Fit the encoder on the categorical features
encoded_data = label_encoder.fit_transform(train_df[['label']])

print(encoded_data)

[0 0 1 ... 0 1 0]


  y = column_or_1d(y, warn=True)
