In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [3]:
data = pd.read_csv('Preprocessed_Final_dataset.csv',encoding = 'latin1')
data.head()

Unnamed: 0,comment,label,clean_text,tokens,text
0,dalits are lowlives,N,dalits are lowlives,"['dalits', 'lowlives']",dalits lowlives
1,gay people are a burden to our society,N,gay people are a burden to our society,"['gay', 'people', 'burden', 'society']",gay people burden society
2,Arabs are not welcome,N,arabs are not welcome,"['arab', 'welcome']",arab welcome
3,I'm not saying we should actually eliminate he...,N,i am not saying we should actually eliminate h...,"['say', 'actually', 'eliminate', 'heebs', 'wis...",say actually eliminate heebs wish naturally be...
4,bananas are for black people,N,bananas are for black people,"['bananas', 'black', 'people']",bananas black people


# TF-IDF Vectorizer

### TF-IDF is the importance of a term is inversely related to its frequency across documents.TF gives us information on how often a term appears in a document and IDF gives us information about the relative rarity of a term in the collection of documents. By multiplying these values together we can get our final TF-IDF value.The higher the TF-IDF score the more important or relevant the term is; as a term gets less relevant, its TF-IDF score will approach 0.

In [4]:
# Drop rows with NaN values in the 'clean_text' column
data = data.dropna(subset=['text'])

# Remove leading and trailing whitespace from the 'clean_text' column
data['text'] = data['text'].str.strip()

In [5]:
# Transform text to TF-IDF features with a limit on max features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(data['text'])

# Label Encoding:

### LabelEncoder converts the categorical label into numeric labels.This is useful for classification tasks where the target variable needs to be in numerical format.

In [6]:

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df['label'] = y


In [7]:
print(tfidf_df.head())

   able  absolute  absolutely  abuse  accept  accord  account  across  act  \
0   0.0       0.0         0.0    0.0     0.0     0.0      0.0     0.0  0.0   
1   0.0       0.0         0.0    0.0     0.0     0.0      0.0     0.0  0.0   
2   0.0       0.0         0.0    0.0     0.0     0.0      0.0     0.0  0.0   
3   0.0       0.0         0.0    0.0     0.0     0.0      0.0     0.0  0.0   
4   0.0       0.0         0.0    0.0     0.0     0.0      0.0     0.0  0.0   

   action  ...  would  wow  write  wrong  yeah  year  yes  yet  young  label  
0     0.0  ...    0.0  0.0    0.0    0.0   0.0   0.0  0.0  0.0    0.0      0  
1     0.0  ...    0.0  0.0    0.0    0.0   0.0   0.0  0.0  0.0    0.0      0  
2     0.0  ...    0.0  0.0    0.0    0.0   0.0   0.0  0.0  0.0    0.0      0  
3     0.0  ...    0.0  0.0    0.0    0.0   0.0   0.0  0.0  0.0    0.0      0  
4     0.0  ...    0.0  0.0    0.0    0.0   0.0   0.0  0.0  0.0    0.0      0  

[5 rows x 1001 columns]


In [8]:
data.to_csv('tf_idf.csv', index=False)

# Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=1000)

# Train the model
logistic_regression_model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = logistic_regression_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")

Accuracy: 0.6639493855700207
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.71      0.70      4466
           1       0.64      0.61      0.62      3753

    accuracy                           0.66      8219
   macro avg       0.66      0.66      0.66      8219
weighted avg       0.66      0.66      0.66      8219



# SVM Model

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
svm_model = SVC(kernel='linear', max_iter=1000)

# Train the model
svm_model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = svm_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")



Accuracy: 0.5584621000121669
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.56      0.58      4466
           1       0.52      0.55      0.53      3753

    accuracy                           0.56      8219
   macro avg       0.56      0.56      0.56      8219
weighted avg       0.56      0.56      0.56      8219



# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = random_forest_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the classification report
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")