## Import Libraries and Load Data

In [59]:
#general packages for data manipulation
import os
import pandas as pd
import numpy as np
#visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#consistent sized plot 
from pylab import rcParams
rcParams['figure.figsize']=12,5
rcParams['axes.labelsize']=12
rcParams['xtick.labelsize']=12
rcParams['ytick.labelsize']=12
#handle the warnings in the code
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)
#text preprocessing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
#import texthero
#import texthero as hero
#regular expressions
import re
#display pandas dataframe columns 
pd.options.display.max_columns = None

load csv file as pandas dataframe

In [60]:
data = pd.read_csv(r'C:\VSCode\NLP4B_Football\Own_model\labeled_normalized_data.csv')

#drop the first column because it is not necessary
data.drop(['Unnamed: 0'], axis=1, inplace=True)
print(data.head())
print(data.shape)

#copy to a new dataframe
df = data.copy()

                                             comment  source  label
0  As a woman you shouldn't complain about cleani...  kaggle      0
1  boy dats cold...tyga dwn bad for cuffin dat ho...  kaggle      1
2  Dawg!!!! You ever fuck a bitch and she start t...  kaggle      1
3  The shit you hear about me might be true or it...  kaggle      1
4  The shit just blows me..claim you so faithful ...  kaggle      1
(2398, 3)


# Text Cleaning

Find things to remove and how often they appear

In [61]:
# Function to check for pattern
def check_for_pattern(regex, dataframe, column_name):
    '''Function to check for how often a pattern appears in a dataframe column and returns a list of all the items found'''
    pattern = re.compile(regex)
    result = []
    for i in range(len(dataframe[column_name])):
        phrase = (re.findall(pattern, dataframe[column_name][i]))
        if phrase != []:
            result.append(phrase[0])
    print("Total items found:", len(result), result)
    return result

# Check for user handles
user_handles = check_for_pattern(r'@[\w]*', df, 'comment')

# Check for hashtags
hashtags = check_for_pattern(r'#[\w]*', df, 'comment')

# Check for URLs
urls = check_for_pattern(r'https?://[A-Za-z0-9./]+', df, 'comment')

# Check for punctuations
punctuations = check_for_pattern(r'[^\w\s]', df, 'comment')

# Check for numbers 18, 88, 1312
numbers = check_for_pattern(r'18|88|1312', df, 'comment')
#print rows with numbers 18, 88, 1312, show only the comment column and show the whole comment
print(numbers)
# --> no need to worry



Total items found: 5 ['@soccerboy_04', '@bluprint_4', '@KingCuh', '@WestYourMajesty', '@']
Total items found: 49 ['#Shots', '#2MW', '#HappyHumpDay', '#Eaglesnation', '#EarlyChristmas', '#CowboysNation', '#TehGodClan', '#Yankees', '#FreeMoneyMelle', '#oomf', '#scally', '#fixed', '#KingOfTheHill', '#UCFPINKPARTY', '#bum', '#hoesaintloyal', '#real', '#Pisces', '#GerrysHalloweenParty', '#MTVHottest', '#Yankees', '#ProtectTheAnimals', '#Damn', '#', '#blondeproblems', '#scally', '#History', '#frenchscally', '#YoureNotMyType', '#shitmybosssays', '#shitallysays', '#FreshRhymes', '#128514', '#JT2020Tour', '#50centmovie', '#ThankYouPaulForConfirmingLarry', '#NottingHill', '#8230', '#233', '#8230', '#RIPTALLT', '#65292', '#Tupac', '#afterearth', '#SNL', '#hoes', '#ShitFahdSays', '#redskins', '#1']
Total items found: 2 ['https://x.com/dfb', 'https://youtu.be/8dIQ56YACvE']
Total items found: 1829 ["'", '.', '!', '.', '.', ':', '.', '?', '.', "'", "'", '.', '#', '#', ',', '#', "'", '"', '.', "'", '"

Remove these patterns

In [62]:

# Remove user handles
df.replace(r'@[\w]*', '', regex=True, inplace=True)

# Remove hashtags
df.replace(r'#[\w]*', '', regex=True, inplace=True)

# Remove URLs
df.replace(r'https?://[A-Za-z0-9./]+', '', regex=True, inplace=True)

# Remove punctuations
df.replace(r'[^\w\s]', '', regex=True, inplace=True)

# Remove digits
df.replace(r'\d+', '', regex=True, inplace=True)


# show random 5 rows
df.sample(5, random_state=1)

Unnamed: 0,comment,source,label
1347,I meanhow good is Bellingham Crazy watching hi...,youtube,0
468,Stop being a pussy son and shove that needle i...,kaggle,1
1462,Unbelievable penalty given How on earth did th...,youtube,0
2265,Id like to reaffirm that Rice has been the be...,reddit,0
943,How often does Darren Fletcher say And theres ...,youtube,0


Decapitalize

In [63]:
df["comment"] = df["comment"].str.lower()

Tokenize

In [64]:
#tokenize using Tokenizer
tokenizer = WordPunctTokenizer()
df['comment'] = df['comment'].apply(tokenizer.tokenize)

#show random 5 rows
df.sample(5, random_state=2)


Unnamed: 0,comment,source,label
1649,"[shouldve, played, at, least, one, academy, pl...",youtube,0
1387,"[i, know, utd, blundered, a, goal, lead, but, ...",youtube,1
1766,"[a, lot, of, nigeria, fans, at, wembley]",youtube,0
1805,"[welbeck, and, ali, should, be, in, nigeria, t...",youtube,0
1861,"[awakenbeerus, is, a, true, sports, savant, hi...",youtube,0


Remove Stopwords

In [65]:

#remove stopwords
#nltk.download()
stopwords = stopwords.words('english')
df['comment'] = df['comment'].apply(lambda x: [item for item in x if item not in stopwords])

#show random 5 rows
df.sample(5, random_state=1)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Unnamed: 0,comment,source,label
1347,"[meanhow, good, bellingham, crazy, watching, l...",youtube,0
468,"[stop, pussy, son, shove, needle, heart]",kaggle,1
1462,"[unbelievable, penalty, given, earth, ref, give]",youtube,0
2265,"[id, like, reaffirm, rice, best, player, pitch]",reddit,0
943,"[often, darren, fletcher, say, theres, chance]",youtube,0


Spelling Corrections

In [66]:
from textblob import TextBlob

# apply spelling corrections on dataframe
def correct_spelling(tokens):
    textblob = TextBlob(' '.join(tokens))
    corrected_text = textblob.correct()
    return corrected_text.split()

# Apply the correction function to the 'comment' column
df['comment'] = df['comment'].apply(lambda tokens: correct_spelling(tokens))

KeyboardInterrupt: 

Remove special characters

In [None]:
def rem_nonalpha(text):
    '''Function to remove the non-alphanumeric characters from the text'''
    text = [word for word in text if word.isalpha()]
    return text

# Apply the function to the 'comment' column
df['comment'] = df['comment'].apply(rem_nonalpha(x))

Check for data balance

In [None]:
sns.countplot(df['label'])
plt.title('Count of Hate vs Non Hate Tweet')
plt.grid()
plt.show()