In [None]:
import pandas as pd

# Load the training CSV into a DataFrame (absolute path; adjust it when the
# file lives somewhere else)
train_df = pd.read_csv('/content/train_E6oV3lV.csv')

# Preview the first rows; the recorded output shows id, label and tweet columns
train_df.head()


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Count missing values per column before imputing
print(train_df.isnull().sum())

# Replace missing tweets with a placeholder string.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning on pandas 2.x and leaves train_df unchanged once Copy-on-Write
# is enabled.
train_df['tweet'] = train_df['tweet'].fillna("No text available")


In [None]:
import numpy as np

# Drop rows whose value in one column lies outside the 1.5 * IQR fences
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``.
    Rows with a missing value in ``column`` fail the range check and are
    dropped as well.

    Args:
        df: DataFrame to filter.
        column: Name of a numeric column in ``df``.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    fence = 1.5 * (q3 - q1)
    inside = values.between(q1 - fence, q3 + fence)
    return df[inside]

# Example usage (only applicable if the dataset has a suitable numeric column)
# train_df = remove_outliers(train_df, 'column_name')


In [12]:
from scipy import stats

# Word count of each tweet (whitespace-delimited tokens)
train_df['tweet_length'] = train_df['tweet'].apply(lambda x: len(x.split()))

# Keep only tweets whose word count is within 3 standard deviations of the mean.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it afterwards does not raise SettingWithCopyWarning.
length_z = np.abs(stats.zscore(train_df['tweet_length']))
train_df = train_df[length_z < 3].copy()


In [None]:
import re

def clean_text(text):
    """Strip mentions, URLs and non-letter characters from a tweet.

    The substitutions run in order: ``@name`` mentions, ``http...`` URLs,
    every character that is neither an ASCII letter nor whitespace, and
    finally runs of whitespace (collapsed to one space). The result is
    returned with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'@\w+', ''),           # @mentions
        (r'http\S+', ''),        # URLs
        (r'[^A-Za-z\s]', ''),    # digits, punctuation, symbols
        (r'\s+', ' '),           # whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Run every raw tweet through clean_text and keep the result in a new column
train_df['cleaned_tweet'] = train_df['tweet'].map(clean_text)
train_df.head()


Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm  # Download small English model


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load spaCy's small English pipeline (the "en_core_web_sm" package must be
# installed in this environment). The resulting `nlp` object is the
# module-level pipeline that preprocess_text calls on each tweet.
nlp = spacy.load("en_core_web_sm")


In [None]:
import re

def preprocess_text(text):
    """Normalize a tweet into a space-separated string of lemmas.

    ``@name`` mentions and ``http...`` URLs are stripped, the text is
    lowercased and passed through the module-level spaCy pipeline ``nlp``.
    Only alphabetic tokens that are not stop words are kept, and each is
    replaced by its lemma.
    """
    # Mentions go first, then URLs; case is folded before tokenizing
    without_mentions = re.sub(r'@\w+', '', text)
    without_urls = re.sub(r'http\S+', '', without_mentions)
    normalized = without_urls.lower()

    lemmas = []
    for token in nlp(normalized):
        # Skip stop words and anything that is not purely alphabetic
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas)

# Sanity check on a made-up tweet containing a mention, a hashtag, mixed case,
# punctuation and a URL; the recorded output shows only lowercase content
# lemmas remaining
sample_text = "@user This is an example of #hate speech detection using spaCy! Visit http://example.com"
processed_text = preprocess_text(sample_text)
print(processed_text)


example hate speech detection spacy visit


In [None]:
# Lemmatize every raw tweet with preprocess_text and keep the result in a new
# column (one spaCy call per row)
train_df['processed_tweet'] = train_df['tweet'].map(preprocess_text)
train_df.head()


Unnamed: 0,id,label,tweet,cleaned_tweet,processed_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...,father dysfunctional selfish drag kid dysfunct...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...,thank lyft credit use cause offer wheelchair v...
2,3,0,bihday your majesty,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur,model love u u time
4,5,0,factsguide: society now #motivation,factsguide society now motivation,factsguide society motivation


In [9]:
# Persist the processed frame as CSV, omitting the index column
train_df.to_csv('/content/processed_train.csv', index=False)


In [10]:
# Share of each label value; the recorded output shows a strong imbalance
# (about 93% label 0 versus 7% label 1)
train_df['label'].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
0,0.929854
1,0.070146


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features from the lemmatized tweets, limited to 5000 features and
# with scikit-learn's built-in English stop-word list applied.
# NOTE(review): the vectorizer is fitted on all of train_df; if a validation
# split is made later, fit on the training portion only — confirm.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_tweet'])


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term-count (bag-of-words) features from the lemmatized tweets, using the
# same settings as the TF-IDF vectorizer: 5000 features, English stop words.
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_count = count_vectorizer.fit_transform(train_df['processed_tweet'])
