<a href="https://colab.research.google.com/github/shivamanisuram/complete-pandas/blob/main/TEXT_DATA_PREPROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
#printing the stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

data preprocessing

In [7]:
# Original code
# news_dataset = pd.read_csv('/content/fake_news_dataset.csv')

# Modified code with error handling
news_dataset = pd.read_csv('/content/fake_news_dataset.csv',
                           on_bad_lines='skip', # Skips lines with parsing errors
                           engine='python',     # Uses the Python parser instead of the C parser
                           quotechar='"',       # Explicitly defines the quote character
                           escapechar='\\')      # Explicitly defines the escape character

In [8]:
news_dataset.shape

(19721, 5)

In [11]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1.0


In [12]:
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,531
author,1878
text,54
label,16


In [13]:
#replace misiing values with nullstring
news_dataset = news_dataset.fillna('')

In [14]:
#merging news title and author name
news_dataset['content'] = news_dataset['title']+' '+news_dataset['author']

In [15]:
news_dataset.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0.0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1.0,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1.0,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1.0,Iranian woman jailed for fictional unpublished...


In [17]:
#sepearing features and target

In [18]:
x=news_dataset.drop(columns='label',axis=1)
y=news_dataset['label']

In [19]:
print(x)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
19716  19702  Re: The Far Left Is Planning The Biggest Polit...   
19717  19703  Brooks Koepka Wins First Major at U.S. Open - ...   
19718  19704     100 Notable Books of 2016 - The New York Times   
19719  19705  Lessons of Syria: Russia to Form a 'Superlight...   
19720  19706  If Clinton Campaign Believes WikiLeaks Emails ...   

                   author                                               text  \
0           Darrell Lucus  House Dem Aide: We Didn’t Even See Comey’s Let...   
1         Daniel J. Flynn  Ever get

In [20]:
print(y)

0        1.0
1        0.0
2        1.0
3        1.0
4        1.0
        ... 
19716    1.0
19717    0.0
19718    0.0
19719    1.0
19720    1.0
Name: label, Length: 19721, dtype: object


In [21]:
#stemming is the process of reducing a word to itskeyboard
port_stemmer = PorterStemmer()

In [24]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stemmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
#

In [26]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

TypeError: expected string or bytes-like object

In [27]:
def stemming(content):
    # Convert 'content' to string to ensure re.sub works correctly
    content = str(content)
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stemmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [28]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [29]:
print(news_dataset['content'])

0        none
1        none
2        none
3        none
4        none
         ... 
19716    none
19717    none
19718    none
19719    none
19720    none
Name: content, Length: 19721, dtype: object


In [32]:
x=news_dataset['content'].values
y=news_dataset['label'].values

In [33]:
print(x)

['none' 'none' 'none' ... 'none' 'none' 'none']


In [34]:
print(y)

[1.0 0.0 1.0 ... 0.0 1.0 1.0]


In [35]:
y.shape

(19721,)

In [36]:
vectorizer=TfidfVectorizer()
vectorizer.fit(x)
x=vectorizer.transform(x)

In [37]:
print(x)

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (19696, 0)	1.0
  (19697, 0)	1.0
  (19698, 0)	1.0
  (19699, 0)	1.0
  (19700, 0)	1.0
  (19701, 0)	1.0
  (19702, 0)	1.0
  (19703, 0)	1.0
  (19704, 0)	1.0
  (19705, 0)	1.0
  (19706, 0)	1.0
  (19707, 0)	1.0
  (19708, 0)	1.0
  (19709, 0)	1.0
  (19710, 0)	1.0
  (19711, 0)	1.0
  (19712, 0)	1.0
  (19713, 0)	1.0
  (19714, 0)	1.0
  (19715, 0)	1.0
  (19716, 0)	1.0
  (19717, 0)	1.0
  (19718, 0)	1.0
  (19719, 0)	1.0
  (19720, 0)	1.0
