In [40]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the TSV files
train_file_path = r'C:\Users\prits\Downloads\Data\ghc_train.tsv'
test_file_path = r'C:\Users\prits\Downloads\Data\ghc_test.tsv'

train_df = pd.read_csv(train_file_path, sep='\t')
test_df = pd.read_csv(test_file_path, sep='\t')

# Display the first few rows of the data
print("Train DataFrame Head:")
display(train_df.head())
print("Test DataFrame Head:")
display(test_df.head())

# Inspect the data
print("Train DataFrame Info:")
train_df.info()
print("Train DataFrame Description:")
display(train_df.describe())

print("Test DataFrame Info:")
test_df.info()
print("Test DataFrame Description:")
display(test_df.describe())

# Handle missing values
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
# Or fill missing values (example: fill with mean for numerical columns)
# train_df.fillna(train_df.mean(), inplace=True)
# test_df.fillna(test_df.mean(), inplace=True)



Train DataFrame Head:


Unnamed: 0,text,hd,cv,vo
0,He most likely converted to islam due to his n...,0,0,0
1,So Ford lied about being a psychologist. Recor...,0,0,0
2,Jobs. Education. Ending abuse of Nation. CA43.,0,0,0
3,"I share a lot of your values, & like many who ...",0,0,0
4,I am so ready to get back to blogging! www.ben...,0,0,0


Test DataFrame Head:


Unnamed: 0,text,hd,cv,vo
0,https://www.youtube.com/watch?v=kACWpKAKtak A ...,0,0,0
1,Very nice! I tend to get tired of the constant...,0,0,0
2,Watch today. https://circumcisionmovie.com/,0,0,0
3,""" Thinking Venues "" First Color Layer blocking...",0,0,0
4,What about death penalty for perpetrators and...,0,0,0


Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22036 entries, 0 to 22035
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    22036 non-null  object
 1   hd      22036 non-null  int64 
 2   cv      22036 non-null  int64 
 3   vo      22036 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 688.8+ KB
Train DataFrame Description:


Unnamed: 0,hd,cv,vo
count,22036.0,22036.0,22036.0
mean,0.084271,0.005945,0.062579
std,0.2778,0.076875,0.24221
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5510 entries, 0 to 5509
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5510 non-null   object
 1   hd      5510 non-null   int64 
 2   cv      5510 non-null   int64 
 3   vo      5510 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 172.3+ KB
Test DataFrame Description:


Unnamed: 0,hd,cv,vo
count,5510.0,5510.0,5510.0
mean,0.089111,0.004356,0.066969
std,0.284929,0.06586,0.249991
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


# REMOVING URL

In [41]:

def clean_data(dataframe):
#replace URL of a text
    test_df['text'] = test_df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')

clean_data(test_df)
print(test_df['text']);


0                                  A talk on natural law.
1       Very nice! I tend to get tired of the constant...
2                                          Watch today.  
3       " Thinking Venues " First Color Layer blocking...
4       What about death penalty for perpetrators  and...
                              ...                        
5505    Trump To "Counter" DNC Lawsuit; Seeks Servers,...
5506    i guess eu is gonna have to back track a littl...
5507    A good read here....         Well worth a few ...
5508    The only way to change things is to have compa...
5509    And Tel Aviv, Jerusalem, New York, LA, Berlin,...
Name: text, Length: 5510, dtype: object


  test_df['text'] = test_df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [42]:
def clean_data(dataframe):
#replace URL of a text
    train_df['text'] = train_df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')

clean_data(train_df)
print(train_df['text']);

0        He most likely converted to islam due to his n...
1        So Ford lied about being a psychologist. Recor...
2           Jobs. Education. Ending abuse of Nation. CA43.
3        I share a lot of your values, & like many who ...
4        I am so ready to get back to blogging! www.ben...
                               ...                        
22031    I'm a fan of western civilization, and one bed...
22032    Or ... is she saying that Muslims don't know h...
22033    Thank you to all my followers that follow me e...
22034                                   Wednesday music.  
22035                    This is a really Big Surprise!   
Name: text, Length: 22036, dtype: object


  train_df['text'] = train_df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


# Abbreviation Treatment 

In [43]:
import re
import pandas as pd

# Dictionary of abbreviations and their full forms
abbreviations = {
    'u': 'you',
    'r': 'are',
    'omg': 'oh my god',
    'lol': 'laugh out loud',
    'brb': 'be right back',
    'idk': 'I don’t know',
    'tbh': 'to be honest',
    'btw': 'by the way',
    'afaik': 'as far as I know',
    'bbl': 'be back later',
    'bfn': 'bye for now',
    'bff': 'best friends forever',
    'cya': 'see you',
    'ftw': 'for the win',
    'fyi': 'for your information',
    'gtg': 'got to go',
    'imo': 'in my opinion',
    'imho': 'in my humble opinion',
    'irl': 'in real life',
    'jk': 'just kidding',
    'lmao': 'laughing my ass off',
    'lmk': 'let me know',
    'nvm': 'never mind',
    'omw': 'on my way',
    'rofl': 'rolling on the floor laughing',
    'smh': 'shaking my head',
    'tba': 'to be announced',
    'tbd': 'to be decided',
    'ttyl': 'talk to you later',
    'txt': 'text',
    'w/e': 'whatever',
    'w/o': 'without',
    'w8': 'wait',
    'yolo': 'you only live once',
    'plz': 'please',
    'thx': 'thanks',
    'xoxo': 'hugs and kisses',
    'NATO':'North Atlantic Treaty Organization.'
    # Add more abbreviations as needed
}

def chat_word(text):
    new_text=[]
    for word in text.split():
        if word.upper() in abbreviations :
            new_text.append(abbreviations [word.upper()])
        else:
            new_text.append(word)
            
    return " ".join(new_text)

test_df['text']=test_df['text'].apply(chat_word)
test_df.head()


Unnamed: 0,text,hd,cv,vo
0,A talk on natural law.,0,0,0
1,Very nice! I tend to get tired of the constant...,0,0,0
2,Watch today.,0,0,0
3,""" Thinking Venues "" First Color Layer blocking...",0,0,0
4,What about death penalty for perpetrators and ...,0,0,0


In [44]:
train_df['text']=train_df['text'].apply(chat_word)
train_df.head()

Unnamed: 0,text,hd,cv,vo
0,He most likely converted to islam due to his n...,0,0,0
1,So Ford lied about being a psychologist. Recor...,0,0,0
2,Jobs. Education. Ending abuse of Nation. CA43.,0,0,0
3,"I share a lot of your values, & like many who ...",0,0,0
4,I am so ready to get back to blogging! www.ben...,0,0,0


In [45]:

specific_text = test_df.loc[19, 'text']
print(specific_text )


At this weeks North Atlantic Treaty Organization. meeting while Sajjan and Freeland are applauding the speaker, Justin just has no idea what's going on.


# removing unwanted tags such as @icareviews

In [46]:
import pandas as pd
import re

def remove_pattern(text):
    # Define the pattern to match
    pattern = r'@\w+\b'  # Matches "@" followed by one or more word characters (\w+)

    # Use the sub() function from the re module to replace matched patterns with an empty string
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text



test_df['text'] = test_df['text'].apply(remove_pattern)
test_df.head()


Unnamed: 0,text,hd,cv,vo
0,A talk on natural law.,0,0,0
1,Very nice! I tend to get tired of the constant...,0,0,0
2,Watch today.,0,0,0
3,""" Thinking Venues "" First Color Layer blocking...",0,0,0
4,What about death penalty for perpetrators and ...,0,0,0


In [47]:
train_df['text'] = train_df['text'].apply(remove_pattern)
print(train_df)

                                                    text  hd  cv  vo
0      He most likely converted to islam due to his n...   0   0   0
1      So Ford lied about being a psychologist. Recor...   0   0   0
2         Jobs. Education. Ending abuse of Nation. CA43.   0   0   0
3      I share a lot of your values, & like many who ...   0   0   0
4      I am so ready to get back to blogging! www.ben...   0   0   0
...                                                  ...  ..  ..  ..
22031  I'm a fan of western civilization, and one bed...   0   0   0
22032  Or ... is she saying that Muslims don't know h...   0   0   0
22033  Thank you to all my followers that follow me e...   0   0   0
22034                                   Wednesday music.   0   0   0
22035                     This is a really Big Surprise!   0   0   0

[22036 rows x 4 columns]


In [48]:
specific_text = test_df.loc[72, 'text']
print(specific_text) 

 I just wanted to say I really liked all the artwork you posted in the Gab Draws section. You have a keen eye for art!


# labelling the text type

In [49]:
import pandas as pd

def label_content(text):
    # Define lists of inappropriate words and phrases
    inappropriate_keywords = [
        'fuck', 'shit', 'nigger', 'slut', 'bitch', 'pussy', 'dick', 'asshole', 'faggot', 
        'cunt', 'bastard', 'whore', 'rape', 'pedophile', 'nazi'
        # Add more keywords as needed
    ]
    
    # Convert text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Check if any inappropriate keywords are in the text
    if any(keyword in text_lower for keyword in inappropriate_keywords):
        return 'Inappropriate'
    else:
        return 'Appropriate'

# Apply the label_content function to the 'text' column
test_df['label'] = test_df['text'].apply(label_content)

# Display the DataFrame
print(test_df.head())


    

                                                text  hd  cv  vo        label
0                             A talk on natural law.   0   0   0  Appropriate
1  Very nice! I tend to get tired of the constant...   0   0   0  Appropriate
2                                       Watch today.   0   0   0  Appropriate
3  " Thinking Venues " First Color Layer blocking...   0   0   0  Appropriate
4  What about death penalty for perpetrators and ...   0   0   0  Appropriate


In [50]:
train_df['label'] = train_df['text'].apply(label_content)

# Display the DataFrame
print(train_df.head())

                                                text  hd  cv  vo        label
0  He most likely converted to islam due to his n...   0   0   0  Appropriate
1  So Ford lied about being a psychologist. Recor...   0   0   0  Appropriate
2     Jobs. Education. Ending abuse of Nation. CA43.   0   0   0  Appropriate
3  I share a lot of your values, & like many who ...   0   0   0  Appropriate
4  I am so ready to get back to blogging! www.ben...   0   0   0  Appropriate
