# DATA PREPROCESSING

In [63]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from abbreviations import abbreviations


# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Loading Dataset


In [64]:
def load_data(filepath):
    """
    Load the dataset from a CSV file.

    """
    df = pd.read_csv(filepath)
    print("Initial Data (First 5 rows):")
    print(df.head())
    return df

input_filepath = 'labeled_data.csv' 
df = load_data(input_filepath)

Initial Data (First 5 rows):
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [65]:
# Custom stop words list from NLTK
stop_words = set(stopwords.words('english'))


# Function to expand abbreviations
def expand_abbreviations(text):
    words = text.split()
    expanded_words = [abbreviations.get(word, word) for word in words if word != "rt"]
    return ' '.join(expanded_words)

### PREPROCESSING
    - removing rows with missing values and reset index
    

In [66]:
def preprocess_data(df):
    """
    Preprocess the dataset by performing basic cleaning.

    """
    # Remove any rows with missing values
    df.dropna(inplace=True)
    # Reset the index after dropping rows
    df.reset_index(drop=True, inplace=True)
    print("Data after preprocessing (First 5 rows):")
    print(df.head())
    return df

df = preprocess_data(df)

Data after preprocessing (First 5 rows):
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


### CLEANING
    - remove urls, usernames, special characters, and extra spaces
    -lowercasing
    -stopwords removal
    -lemmatization
    

In [67]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove usernames
    text = re.sub(r'@\w+', '', text)
    # Remove tweets starting with hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove special characters, numbers, punctuations
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def clean_data(df, text_column):

    df[text_column] = df[text_column].apply(clean_text)
    # Remove rows where the 'tweet' column is empty
    df = df[df[text_column].str.strip() != ""]
    # Reset the index after dropping rows
    df.reset_index(drop=True, inplace=True)
    return df

text_column = 'tweet'  
df = clean_data(df, text_column)


def further_clean_text(text):
    text = text.lower()
    text = expand_abbreviations(text)
    words = text.split()
    cleaned_words = [WordNetLemmatizer().lemmatize(word) for word in words if word not in stop_words and len(word) > 1]
    

    return ' '.join(cleaned_words)

def further_clean_data(df, text_column):
 
    df[text_column] = df[text_column].apply(further_clean_text)
    print("Data after further cleaning (First 5 rows):")
    print(df.head())
    return df


df = further_clean_data(df, text_column)

Data after further cleaning (First 5 rows):
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1           boy dat coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  


### Expanding Abbreviation


In [68]:
def expand_data(df, text_column):

    df[text_column] = df[text_column].apply(expand_abbreviations)
    print("Data after expanding abbreviations (First 5 rows):")
    print(df.head())
    return df

df = expand_data(df, text_column)

Data after expanding abbreviations (First 5 rows):
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1          boy that coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  


### SAVING THE CLEANED DATA

In [69]:
def save_data(df, output_filepath):
    df.to_csv(output_filepath, index=False)

output_filepath = 'cleaned_data.csv' 
save_data(df, output_filepath)


# ML MODEL TRAINING
    -after testing out with various encoding method TF-IDF is found to be giving best results, so it is used here
    
    - SMOTE is used for handling imbalances

    - Conducted hyper parameter tuning on lr, svm and rf.
    - svm was more consistent and accurate
    -so svm is selected as model
    

In [70]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

In [71]:
# Load the cleaned data
df = pd.read_csv('../Data-Preprocessing/cleaned_data.csv')
text_column = 'tweet'

# Ensure the text column is of string type
df[text_column] = df[text_column].astype(str)


In [72]:
# Split the data into features (X) and labels (y)

X = df[text_column]
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=4000)  # Adjust max_features as needed

# Fit and transform the training data, then transform the test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Shape of TF-IDF matrix for training data:", X_train_tfidf.shape)
print("Shape of TF-IDF matrix for test data:", X_test_tfidf.shape)

Shape of TF-IDF matrix for training data: (19812, 4000)
Shape of TF-IDF matrix for test data: (4954, 4000)


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [74]:
tfidf_vectorizer = TfidfVectorizer(max_features=4000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [75]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)
print(f"After SMOTE, the number of samples in each class:\n{y_train_smote.value_counts()}")

After SMOTE, the number of samples in each class:
class
1    15384
0    15384
2    15384
Name: count, dtype: int64


In [76]:
best_params_svm = {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
model = SVC(**best_params_svm)
model.fit(X_train_smote, y_train_smote)

In [77]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Model: SVM SMOTE")
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n {conf_matrix}")

# Cell 8: Save the Best Model
joblib.dump(model, "SVM_SMOTE.joblib")

# Cell 9: Save the TF-IDF Vectorizer
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

Model: SVM SMOTE
Accuracy: 0.8861526039563988
Confusion Matrix:
 [[  46  209   27]
 [  50 3645  103]
 [   6  169  699]]


['tfidf_vectorizer.joblib']