In [1]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Load the dataset
df = pd.read_csv('/content/cleaned_dataset_combined (2).csv')

# Display the first few rows to verify
print("Original DataFrame:")
print(df.head())

Original DataFrame:
   hate_speech  offensive_language  neither  class  \
0            0                   0        3      2   
1            0                   3        0      1   
2            0                   3        0      1   
3            0                   2        1      1   
4            0                   6        0      1   

                                               tweet  \
0  retwet as a woman you should not complain abou...   
1  retwet boy dats coldtyga dwn bad for cufin dat...   
2  retwet dawg retwet you ever fuck a bitch and s...   
3                        retwet she lok like a trany   
4  retwet the shit you hear about me might be tru...   

                                        tweet_tokens  
0  ['retwet', 'woman', 'complain', 'cleaning', 'h...  
1  ['retwet', 'boy', 'dat', 'coldtyga', 'dwn', 'b...  
2  ['retwet', 'dawg', 'retwet', 'ever', 'fuck', '...  
3                 ['retwet', 'lok', 'like', 'trany']  
4  ['retwet', 'shit', 'hear', 'might', 'tru

In [3]:
# Convert string representation of list to actual list
df['tweet_tokens'] = df['tweet_tokens'].apply(ast.literal_eval)

# Display the first few rows to verify
print("\nDataFrame after converting tweet_tokens to lists:")
print(df.head())


DataFrame after converting tweet_tokens to lists:
   hate_speech  offensive_language  neither  class  \
0            0                   0        3      2   
1            0                   3        0      1   
2            0                   3        0      1   
3            0                   2        1      1   
4            0                   6        0      1   

                                               tweet  \
0  retwet as a woman you should not complain abou...   
1  retwet boy dats coldtyga dwn bad for cufin dat...   
2  retwet dawg retwet you ever fuck a bitch and s...   
3                        retwet she lok like a trany   
4  retwet the shit you hear about me might be tru...   

                                        tweet_tokens  
0  [retwet, woman, complain, cleaning, house, man...  
1  [retwet, boy, dat, coldtyga, dwn, bad, cufin, ...  
2  [retwet, dawg, retwet, ever, fuck, bitch, star...  
3                         [retwet, lok, like, trany]  
4  [retwet, 

In [4]:
# Define feature and target variables
X = df[['tweet_tokens']]
y = df['class']

# Define a function to join tokens into a single string
X['tweet_tokens'] = X['tweet_tokens'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['tweet_tokens'] = X['tweet_tokens'].apply(lambda x: ' '.join(x))


In [5]:
# Use TfidfVectorizer for the 'tweet_tokens' column
column_transformer = ColumnTransformer(
    transformers=[
        ('tweet_tokens', TfidfVectorizer(), 'tweet_tokens')
    ],
    remainder='passthrough'  # This keeps the other columns as is
)

In [6]:
# Create a Random Forest classifier pipeline
pipeline = make_pipeline(column_transformer, RandomForestClassifier(random_state=42))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the training data
print("\nTraining Data:")
print(X_train.head())
print(y_train.head())





Training Data:
                                            tweet_tokens
15272  retwet wel else wil white people get u forget ...
9351   funy thing isit people people see pic judge bi...
20323  retwet niga mesed wrong bitch loudlycryingface...
3638                                          bitch niga
20579                                         real bitch
15272    0
9351     2
20323    1
3638     1
20579    1
Name: class, dtype: int64


In [7]:
# Display the testing data
print("\nTesting Data:")
print(X_test.head())
print(y_test.head())


Testing Data:
                                            tweet_tokens
2281                              got mised cal yo bitch
15914       retwet fuck bad bitch go ned money lil homie
18943  retwet laugh loud credit near god know right m...
16407  retwet wipe cum fagot retwet contact lens wild...
13326  nigas cheat bitch expect pay back whatsoever a...
2281     1
15914    1
18943    2
16407    1
13326    1
Name: class, dtype: int64


In [8]:
# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
accuracy = pipeline.score(X_test, y_test)
print(f'\nAccuracy: {accuracy}')

# Predict the test set results
y_pred = pipeline.predict(X_test)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8846076255799878

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.14      0.22       290
           1       0.91      0.96      0.93      3832
           2       0.81      0.79      0.80       835

    accuracy                           0.88      4957
   macro avg       0.74      0.63      0.65      4957
weighted avg       0.87      0.88      0.87      4957

