In [2]:
import pandas as pd 

In [3]:
df = pd.read_csv("WELFake_Dataset.csv")

In [4]:
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [5]:
df.dropna(inplace = True)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [7]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [9]:
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)

In [10]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure stopwords and lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    # 1. Convert text to lowercase
    text = text.lower()
    
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    
    # 3. Remove special characters, numbers, and keep only alphabets and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 4. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 5. Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # 6. Lemmatization (to get the root form of words)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
df['text'] = df['text'].apply(clean_text)

In [13]:
df.drop(['title'], axis=1, inplace=True)

In [14]:
df = df.sample(frac = 1)
df.head()
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)
df.head()

Unnamed: 0,text,label
0,tesla world safest car explodes like bomb dail...,1
1,google pinterest digg linkedin reddit stumbleu...,1
2,,1
3,london reuters british prime minister theresa ...,0
4,donald trump told audience fresno california f...,1


In [15]:
df.duplicated().sum()

9600

In [16]:
print(f"Number of rows before removing duplicates: {df.shape[0]}")

# Remove duplicate rows
df = df.drop_duplicates()

# Verify the changes
print(f"Number of rows after removing duplicates: {df.shape[0]}")

Number of rows before removing duplicates: 71537
Number of rows after removing duplicates: 61937


In [17]:
df.duplicated().sum()

0

In [18]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [19]:
df.head()

Unnamed: 0,text,label
0,tesla world safest car explodes like bomb dail...,1
1,google pinterest digg linkedin reddit stumbleu...,1
2,,1
3,london reuters british prime minister theresa ...,0
4,donald trump told audience fresno california f...,1


In [20]:
# Handle missing values
df['text'] = df['text'].fillna("")

# Convert all text data to string type
df['text'] = df['text'].astype(str)

In [21]:
df.shape

(61937, 2)

In [25]:
df = df.sample(frac = 1)
df.head()
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)
df.head()

Unnamed: 0,text,label
0,donald trump leader left world defending weste...,0
1,washington reuters u senator ben cardin top de...,0
2,near future north korea regime brink collapse ...,0
3,hysterical evidence available american voter h...,1
4,medium certain extent pollster got seriously w...,1


In [26]:
df.to_csv('WELFake_Dataset_Preprocessed.csv', index=False)

In [27]:
df.shape

(61937, 2)

In [28]:
df1 = pd.read_csv('WELFake_Dataset_Preprocessed.csv')

In [29]:
df1.head()

Unnamed: 0,text,label
0,donald trump leader left world defending weste...,0
1,washington reuters u senator ben cardin top de...,0
2,near future north korea regime brink collapse ...,0
3,hysterical evidence available american voter h...,1
4,medium certain extent pollster got seriously w...,1


In [30]:
df1.shape

(61937, 2)