In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from scipy import stats
import re
import nltk

# Download necessary NLTK data
# (stopwords and wordnet are the corpora read by preprocess_text below)
nltk.download('stopwords')
nltk.download('wordnet')

# Load the TSV files
train_file_path = r'F:\moVies\Springboard\data\ghc_train.tsv'
test_file_path = r'F:\moVies\Springboard\data\ghc_test.tsv'
train_df = pd.read_csv(train_file_path, sep='\t')
test_df = pd.read_csv(test_file_path, sep='\t')

# Display initial data summary
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
print("Initial Train DataFrame Info:")
train_df.info()
print(train_df.describe())
print("Initial Test DataFrame Info:")
test_df.info()
print(test_df.describe())

# Text preprocessing function
# Patterns are compiled once here instead of being looked up for every row.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')

# Filled on the first preprocess_text call that needs it, so the stop-word
# corpus is read once rather than once per row.
_NLP_RESOURCES = {}


def _load_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use only."""
    if not _NLP_RESOURCES:
        # update() evaluates both values before storing either, so a failed
        # corpus load never leaves the cache half-populated.
        _NLP_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one document for the downstream keyword rules.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, stripped of digit runs, split on
    whitespace, filtered against NLTK's English stop-word list and
    lemmatised with ``WordNetLemmatizer`` (default part of speech).

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing cells, or ``None``) is
            treated as an empty document.

    Returns:
        The remaining lemmas joined by single spaces; ``''`` when nothing
        remains or when ``text`` is not a string.
    """
    # Missing cells reach this function as NaN/None via Series.apply; calling
    # .lower() on them would raise AttributeError and abort the whole apply.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase for consistency
    text = text.lower()
    # Remove punctuation and numbers to clean the text
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Remove stopwords and lemmatize words to reduce noise
    stop_words, lemmatizer = _load_nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean the 'text' column of the train split, then the test split, in place
for split_df in (train_df, test_df):
    split_df['text'] = split_df['text'].apply(preprocess_text)

# Define high-frequency stereotypical words list for hate speech detection
stereotypical_words = ['word1', 'word2', 'word3']  # Add actual high-frequency stereotypical words here

# Group nouns that count as "unnecessary labeling" when they follow an
# article ("a Jew", "a Muslim"). Seeded with the two examples the rule was
# written for; extend it together with stereotypical_words.
labeling_terms = ['jew', 'muslim']

# One pattern per regex-driven category. They are case-insensitive because
# the text is lower-cased before it gets here, which made the capitalised
# 'Muslim' alternative unmatchable under a case-sensitive search.
# NOTE(review): preprocess_text also drops stop words and lemmatises, so
# multi-word phrases such as "thrown off a roof" and plural forms such as
# "africans" probably cannot occur in preprocessed text - confirm and adjust
# the phrases or run the detector on the raw text.
_CATEGORY_PATTERNS = {
    'CV': re.compile(r'\b(deported|thrown off a roof)\b', re.IGNORECASE),
    'VO': re.compile(r'\b(muzzie)\b', re.IGNORECASE),
    'SXO': re.compile(r'\b(sexual orientation)\b', re.IGNORECASE),
    'RAE': re.compile(r'\b(black|Muslim|middle easterner|africans)\b', re.IGNORECASE),
    'EX': re.compile(r'\b(hate)\b', re.IGNORECASE),
}


def _contains_whole_word(term, text):
    """Return True when ``term`` occurs in ``text`` as a whole word, ignoring case."""
    # Look-arounds instead of \b so terms that start or end with a non-word
    # character are still delimited correctly.
    pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
    return re.search(pattern, text, re.IGNORECASE) is not None


# Function to detect hate speech based on the provided definitions
def detect_hate_speech(text):
    """Label a document as hateful (``'h'``) or not (``'nh'``) by keyword rules.

    A document is hateful as soon as any single category below fires:

    * ``HD``  - contains an entry of ``stereotypical_words`` as a whole word,
      or an article followed by an entry of ``labeling_terms``.
    * ``CV``, ``VO``, ``SXO``, ``RAE``, ``EX`` - the matching pattern in
      ``_CATEGORY_PATTERNS`` is found.

    All matching is case-insensitive. ``stereotypical_words`` and
    ``labeling_terms`` are read on every call, so editing those lists takes
    effect immediately.

    Args:
        text: The document to classify. Non-string values (NaN, ``None``)
            are classified as not hateful instead of raising.

    Returns:
        ``'h'`` if at least one category fired, otherwise ``'nh'``.
    """
    if not isinstance(text, str):
        return 'nh'

    # Initialize variables to track hate-based rhetoric
    hate_based_rhetoric = {
        'HD': False,  # Hate-Based Derogatory Language
        'CV': False,  # Calls for Violence
        'VO': False,  # Vulgarity/Offensive Language directed at an individual
        'SXO': False, # Sexual Orientation
        'RAE': False, # Racial or Ethnicity-based
        'EX': False   # Expressions of Hate
    }

    # Check for high-frequency stereotypical words. Whole-word matching: a
    # plain substring test also fired inside longer, unrelated words.
    if any(word and _contains_whole_word(word, text) for word in stereotypical_words):
        hate_based_rhetoric['HD'] = True

    # Check for unnecessary labeling (example: "a Jew", "a Muslim", etc.).
    # The article must be followed by a listed group noun; matching any word
    # after "a" flagged nearly every sentence containing the article.
    label_alternatives = '|'.join(re.escape(term) for term in labeling_terms if term)
    if label_alternatives and re.search(
        r'\ban?\s+(?:' + label_alternatives + r')\b', text, re.IGNORECASE
    ):
        hate_based_rhetoric['HD'] = True

    # Check for other hate-based rhetoric
    for category, pattern in _CATEGORY_PATTERNS.items():
        if pattern.search(text):
            hate_based_rhetoric[category] = True

    # Classify the text based on the presence of hate-based rhetoric
    if any(hate_based_rhetoric.values()):
        return 'h'  # Hate speech detected
    return 'nh'  # Not hateful

# Label every row of the train split, then the test split, with the rule-based detector
for split_df in (train_df, test_df):
    split_df['label'] = split_df['text'].apply(detect_hate_speech)

# Preview the first rows of each labelled split
print("Data after classification:")
for split_df in (train_df, test_df):
    print(split_df[['text', 'label']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22036 entries, 0 to 22035
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    22036 non-null  object
 1   hd      22036 non-null  int64 
 2   cv      22036 non-null  int64 
 3   vo      22036 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 688.8+ KB
None
                 hd            cv            vo
count  22036.000000  22036.000000  22036.000000
mean       0.084271      0.005945      0.062579
std        0.277800      0.076875      0.242210
min        0.000000      0.000000      0.000000
25%        0.000000      0.000000      0.000000
50%        0.000000      0.000000      0.000000
75%        0.000000      0.000000      0.000000
max        1.000000      1.000000      1.000000
Initial Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5510 entries, 0 to 5509
Data columns (total 4 columns):
 #   Column  Non-Null 