In [2]:
#Importing the dependencies for the project
import numpy as np      
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import scipy.sparse

In [3]:
#nltk stands for NATURAL LANGUAGE TOOL KIT
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raahiltekriwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

DATA PRE-PROCESSING

In [6]:
#FAKE_NEWS DATASET
dataset= pd.read_csv('fake-news/train.csv')
datasetf=pd.read_csv('fake-news/test.csv')
sub=pd.read_csv('fake-news/submit.csv')
# Extract the last column from 'sub'
label = sub.columns[-1]
last_column_sub = sub.pop(label)
# Insert the extracted column into 'test'
datasetf[label] = last_column_sub
Fake_news_dataset = pd.concat([dataset, datasetf], axis=0, sort=False)
# counting the number of missing values in the dataset
Fake_news_dataset.isnull().sum()
# replacing the null values with empty string
Fake_news_dataset = Fake_news_dataset.fillna('')
# merging the author name and news title
Fake_news_dataset['content'] = Fake_news_dataset['author']+' '+Fake_news_dataset['title']
Fake_news_dataset.pop('title')
Fake_news_dataset.pop('author')
Fake_news_dataset.rename(columns={"content": "title"}, inplace=True)
ff=Fake_news_dataset.pop('title')
#Fake_news_dataset[1]=ff
Fake_news_dataset.insert(1,'title',ff)


#Playing with fake or real news
fake_real_ds=pd.read_csv('fake_or_real_news.csv')

#WELFAKE_DATASET
welfake_ds=pd.read_csv('WELFake_Dataset.csv')


In [7]:
print("Shape of fake-news-dataset=",Fake_news_dataset.shape)
print("Shape of fake or real news dataset=",fake_real_ds.shape)
print("Shape of welfake dataset=",welfake_ds.shape)


Shape of fake-news-dataset= (26000, 4)
Shape of fake or real news dataset= (6335, 4)
Shape of welfake dataset= (72134, 4)


In [8]:
Fake_news_dataset.head(2)

Unnamed: 0,id,title,text,label
0,0,Darrell Lucus House Dem Aide: We Didn’t Even S...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...",Ever get the feeling your life circles the rou...,0


In [9]:
fake_real_ds.rename(columns={"unnamed:0":"id"})
fake_real_ds.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [10]:
welfake_ds.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1


Converting multiclass labels to two classes - Fake and True for exploratory data analysis

In [11]:
liar_dataset_train = pd.read_csv('liar_dataset/train.csv', delimiter='\t', quoting=3, header=None)
liar_dataset_test = pd.read_csv('liar_dataset/test.csv', delimiter='\t', quoting=3, header=None)
liar_dataset_valid = pd.read_csv('liar_dataset/valid.csv', delimiter='\t', quoting=3, header=None)
df_raw = pd.concat([liar_dataset_train, liar_dataset_test, liar_dataset_valid], axis=0, sort=False)
df_raw = df_raw.sample(frac=1).reset_index()
# naming the columns of the dataset
df_raw.columns=["index","ID", "label", "statement", "subject", "speaker", "job", "state", "party", "barely_true_cts",
        "false_cts", "half_true_cts", "mostly_true_cts", "pants_on_fire_cts", "context"]

In [12]:
#function for mapping labels "true, mostly-true, half-true" to TRUE and "false, barely-true, pants-fire" to FAKE.
def binary_class_dataset(data):
    data['title'] = data['speaker']+' '+data['subject']+' '+data['context']
    data.rename(columns={"index":"id"},inplace=True)
    data = data.iloc[:, [0,2, 3,15]]
    #data.insert(1,'title',data_title)
    data.columns = ['id','label', 'text', 'title']
    data_label=data.pop('label')
    data_stmt=data.pop('text')
    data_title=data.pop('title')
    data.insert(1,'title',data_title)
    data.insert(2,'text',data_stmt)
    data.insert(3,'label',data_label)
    Original_labels = {
        'true': 'True',
        'mostly-true': 'True',
        'half-true': 'True',
        'false': 'Fake',
        'barely-true': 'Fake',
        'pants-fire': 'Fake'
    }
    data['label'] = data['label'].map(Original_labels)

    return data

In [13]:
# running the function on the loaded dataframe
bi_class= binary_class_dataset(df_raw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].map(Original_labels)


In [14]:
liar_dataset_test.shape

(1283, 14)

In [15]:
bi_class.shape

(12836, 4)

In [16]:
# counting the number of missing values in the dataset
Fake_news_dataset.isnull().sum()
fake_real_ds.isnull().sum()
welfake_ds.isnull().sum()
bi_class.isnull().sum()
# replacing the null values with empty string
Fake_news_dataset = Fake_news_dataset.fillna('')
fake_real_ds = fake_real_ds.fillna('')
welfake_ds=welfake_ds.fillna('')
bi_class=bi_class.fillna('')



Stemming is the process of reducing a word to its Root word
example: actor, actress, acting --> act

In [17]:
port_stem = PorterStemmer()

In [18]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [19]:
Fake_news_dataset['text'] = Fake_news_dataset['text'].apply(stemming)
Fake_news_dataset['title'] = Fake_news_dataset['title'].apply(stemming)
fake_real_ds['text']=fake_real_ds['text'].apply(stemming)
fake_real_ds['title']=fake_real_ds['title'].apply(stemming)
welfake_ds['text']=welfake_ds['text'].apply(stemming)
welfake_ds['title']=welfake_ds['title'].apply(stemming)
bi_class['text']=bi_class['text'].apply(stemming)
bi_class['title']=bi_class['title'].apply(stemming)

In [20]:
Fake_news_dataset.shape

(26000, 4)

In [21]:
bi_class.shape

(12836, 4)

In [22]:
fake_real_ds.shape

(6335, 4)

In [23]:
new_columns = ['id', 'title', 'text']
unlabeled_data = pd.DataFrame(columns=new_columns)
#new_row_data = Fake_news_dataset.iloc[int(0.3 * len(Fake_news_dataset)), :2]
unlabeled_data = Fake_news_dataset[['id', 'title', 'text']].iloc[-int(0.7 * len(Fake_news_dataset)):]
unlabeled_data=unlabeled_data._append(fake_real_ds.iloc[-int(0.7 * len(fake_real_ds)):, :-1].reset_index(drop=True))
unlabeled_data = unlabeled_data.reset_index(drop=True)
if 'Unnamed: 0' in unlabeled_data.columns:
    unlabeled_data = unlabeled_data.drop('Unnamed: 0', axis=1)
#unlabeled_data=unlabeled_data.append(new_row_data,ignore_index=True)
#unlabeled_data=unlabeled_data.append(fake_real_ds.iloc[0.5*len(fake_real_ds):,:],ignore_index=True)
unlabeled_data=unlabeled_data._append(welfake_ds.iloc[-int(0.7*len(welfake_ds)):,:-1].reset_index(drop=True))
unlabeled_data = unlabeled_data.reset_index(drop=True)
if 'Unnamed: 0' in unlabeled_data.columns:
    unlabeled_data = unlabeled_data.drop('Unnamed: 0', axis=1)
unlabeled_data=unlabeled_data._append(bi_class.iloc[-int(0.7*len(bi_class)):,:-1].reset_index(drop=True))
unlabeled_data = unlabeled_data.reset_index(drop=True)
if 'Unnamed: 0' in unlabeled_data.columns:
    unlabeled_data = unlabeled_data.drop('Unnamed: 0', axis=1)



In [24]:
print(type(unlabeled_data))

<class 'pandas.core.frame.DataFrame'>


In [25]:
unlabeled_data.shape

(82112, 3)

In [26]:
unlabeled_data.head(2)

Unnamed: 0,id,title,text
0,7800.0,nathaniel popper wall st regul propos stricter...,regul releas rule thursday morn aim restrict b...
1,7801.0,luca nolan mark zuckerberg call univers basic ...,facebook ceo mark zuckerberg close commenc spe...


In [27]:
#dataset_Tfid=pd.DataFrame
#dataset_Tfid.columns['id','title','text','label']
#dataset_Tfid=dataset_Tfid.append(Fake_news_dataset.iloc[:0.3*len(Fake_news_dataset),:],ignore_index=True)
#dataset_Tfid=dataset_Tfid.append(fake_real_ds.iloc[:0.5*len(fake_real_ds),:],ignore_index=True)
#dataset_Tfid=dataset_Tfid.append(welfake_ds.iloc[:0.3*len(welfake_ds),:],ignore_index=True)
#dataset_Tfid=dataset_Tfid.append(bi_class.iloc[:0.3*len(bi_class),:],ignore_index=True)


column = ['id', 'title', 'text','label']
dataset_Tfid = pd.DataFrame(columns=column)
#new_row_data = Fake_news_dataset.iloc[int(0.3 * len(Fake_news_dataset)), :2]
dataset_Tfid = Fake_news_dataset[['id', 'title', 'text','label']].iloc[:int(0.3 * len(Fake_news_dataset)):]
dataset_Tfid=dataset_Tfid._append(fake_real_ds.iloc[:int(0.3 * len(fake_real_ds)):, :].reset_index(drop=True))
dataset_Tfid = dataset_Tfid.reset_index(drop=True)
if 'Unnamed: 0' in dataset_Tfid.columns:
    dataset_Tfid = dataset_Tfid.drop('Unnamed: 0', axis=1)
dataset_Tfid=dataset_Tfid._append(welfake_ds.iloc[:int(0.3*len(welfake_ds)):,:].reset_index(drop=True))
dataset_Tfid = dataset_Tfid.reset_index(drop=True)
if 'Unnamed: 0' in dataset_Tfid.columns:
    dataset_Tfid = dataset_Tfid.drop('Unnamed: 0', axis=1)
dataset_Tfid=dataset_Tfid._append(bi_class.iloc[:int(0.3*len(bi_class)):,:].reset_index(drop=True))
dataset_Tfid = dataset_Tfid.reset_index(drop=True)
if 'Unnamed: 0' in dataset_Tfid.columns:
    dataset_Tfid = dataset_Tfid.drop('Unnamed: 0', axis=1)
col=['label']
Y=pd.DataFrame(columns=col)
#Y=dataset_Tfid.pop('label')
dataset_Tfid.pop('id')


#dataset_Tfid=Fake_news_dataset.iloc[:,[0,1,2]]
# converting the textual data to numerical data
#vectorizer = TfidfVectorizer()
#vectorizer.fit(dataset_Tfid)

#dataset_Tfid = vectorizer.transform(dataset_Tfid)

0           0.0
1           1.0
2           2.0
3           3.0
4           4.0
          ...  
35185     456.0
35186    1001.0
35187    1353.0
35188    7619.0
35189     246.0
Name: id, Length: 35190, dtype: float64

In [28]:
dataset_Tfid.shape

(35190, 3)

In [29]:
dataset_Tfid.head()

Unnamed: 0,title,text,label
0,darrel lucu hous dem aid even see comey letter...,hous dem aid even see comey letter jason chaff...,1
1,daniel j flynn flynn hillari clinton big woman...,ever get feel life circl roundabout rather hea...,0
2,consortiumnew com truth might get fire,truth might get fire octob tension intellig an...,1
3,jessica purkiss civilian kill singl us airstri...,video civilian kill singl us airstrik identifi...,1
4,howard portnoy iranian woman jail fiction unpu...,print iranian woman sentenc six year prison ir...,1


In [30]:
dataset_Tfid['label'] = dataset_Tfid['label'].replace({'FAKE': 1, 'TRUE': 0, 'Fake':1, 'Real':0, 'True':0, 'REAL': 0})


In [31]:
import pandas as pd

# Assuming 'df' is your DataFrame
# Check and print rows where the 'label' column contains string values
column_name = 'label'  # Replace with the actual column name in your DataFrame

# Filter rows with string values in the 'label' column
string_rows = dataset_Tfid[dataset_Tfid[column_name].apply(lambda x: isinstance(x, str))]

# Count the number of rows with string values
count_string_rows = len(string_rows)

# Print the count
print(f"Number of rows with string values in the '{column_name}' column: {count_string_rows}")

# Print the rows
print("\nRows with string values:")
print(string_rows)


Number of rows with string values in the 'label' column: 0

Rows with string values:
Empty DataFrame
Columns: [title, text, label]
Index: []


In [32]:
Y=dataset_Tfid.pop('label')

In [33]:
Y.values

array([1, 0, 1, ..., 0, 0, 0])

In [34]:
col=['content']
dataset_Tfid_content=pd.DataFrame(columns=col)
dataset_Tfid_content['content']=dataset_Tfid['title']+' '+dataset_Tfid['text']
dataset_Tfid_content.shape


(35190, 1)

In [35]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
X=dataset_Tfid_content['content'].values
vectorizer.fit(X)
X_label = vectorizer.transform(X)
X_label.shape

(35190, 108627)

In [36]:
col=['content']
unlabeled_data_title=pd.DataFrame(columns=col)
unlabeled_data_title['content']=unlabeled_data['title']+' '+unlabeled_data['text']
unlabeled_data_title.shape

(82112, 1)

In [37]:
# converting the textual data to numerical data
vectorizer1 = TfidfVectorizer()
Z=unlabeled_data_title['content'].values
vectorizer1.fit(Z)
X_unlabel = vectorizer1.transform(Z)


In [38]:
X_unlabel.shape

(82112, 159011)

In [39]:
combined_data = pd.concat([dataset_Tfid_content['content'], unlabeled_data_title['content']], axis=0)
vectorizer = TfidfVectorizer()
X_combined = vectorizer.fit_transform(combined_data)

X_labeled = vectorizer.transform(dataset_Tfid_content['content'])
X_unlabeled = vectorizer.transform(unlabeled_data_title['content'])

In [40]:
Y_value=Y.values

In [41]:
print(X_labeled)

  (0, 170985)	0.05245377570557459
  (0, 170894)	0.020328659384524096
  (0, 168822)	0.04673577771364503
  (0, 168816)	0.0956702121867596
  (0, 168749)	0.042027721417381966
  (0, 168697)	0.012359299055411074
  (0, 167585)	0.01715251239728547
  (0, 166341)	0.017955875123165627
  (0, 166206)	0.013583619011229712
  (0, 166089)	0.031128629357328262
  (0, 165956)	0.013309998278248181
  (0, 165588)	0.012889180937954636
  (0, 164141)	0.025296211984276853
  (0, 164076)	0.03017286750063937
  (0, 162610)	0.017023237944112767
  (0, 160473)	0.06325933309983671
  (0, 159443)	0.03389216895591256
  (0, 159116)	0.017993446698815863
  (0, 158159)	0.0392000825046848
  (0, 157009)	0.011920998436672063
  (0, 156835)	0.0471151224820326
  (0, 156741)	0.12493332180520851
  (0, 156499)	0.07805222531207115
  (0, 154418)	0.03999893701201339
  (0, 153568)	0.02765455620535052
  :	:
  (35188, 119700)	0.1538423422307
  (35188, 114674)	0.09140722092623646
  (35188, 94754)	0.14833985354906232
  (35188, 88469)	0.1242543

In [42]:

X_unlabeled.shape

(82112, 174402)

In [43]:
from scipy.sparse import vstack
from sklearn.metrics import recall_score
X_train, X_test, Y_train, Y_test = train_test_split(X_labeled, Y_value, test_size = 0.2, stratify=Y, random_state=2)

#num_rows_half = X_test.shape[0] // 2

#X1_test,X2_test=X_test[:num_rows_half,:],X_test[num_rows_half:,:]
#Y1_test,Y2_test=Y_test[:num_rows_half],Y_test[num_rows_half:]

num_rows_third = X_unlabeled.shape[0] // 3

# Split the CSR matrix into three sets
X1_unlabeled = X_unlabeled[:num_rows_third, :]
X2_unlabeled = X_unlabeled[num_rows_third: 2 * num_rows_third, :]
X3_unlabeled = X_unlabeled[2 * num_rows_third:, :]

num_rows_test=X_test.shape[0]//3
X1_test,X2_test,X3_test=X_test[:num_rows_test,:],X_test[num_rows_test: 2*num_rows_test,:],X_test[2*num_rows_test:,:]
Y1_test,Y2_test,Y3_test=Y_test[:num_rows_test],Y_test[num_rows_test: 2*num_rows_test],Y_test[2*num_rows_test:]


# Train an initial model on the labeled data
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, Y_train)

# Use the trained model to predict on unlabeled data
pseudo_labels_1 = model1.predict(X1_unlabeled)

# Combine labeled and pseudo-labeled data
X1_combined=vstack([X_train, X1_unlabeled])
y1_combined = np.concatenate([Y_train, pseudo_labels_1])

# Retrain the model on the combined data
model1.fit(X1_combined, y1_combined)

# Evaluate the model on the validation set
y1_pred = model1.predict(X1_test)
accuracy_1 = accuracy_score(Y1_test, y1_pred)
print(f"Accuracy after self-training on 1st set: {accuracy_1}")

# Repeat for second half of unlabeled data
pseudo_labels_2 = model1.predict(X2_unlabeled)

# Combine labeled and pseudo-labeled data
X2_combined=vstack([X1_combined, X2_unlabeled])
y2_combined = np.concatenate([y1_combined, pseudo_labels_2])

# Retrain the model on the combined data
model1.fit(X2_combined, y2_combined)

# Evaluate the model on the validation set
y2_pred = model1.predict(X2_test)
accuracy_2 = accuracy_score(Y2_test, y2_pred)
print(f"Accuracy after self-training on 2nd set: {accuracy_2}")


# Repeat for second half of unlabeled data
pseudo_labels_3 = model1.predict(X3_unlabeled)

# Combine labeled and pseudo-labeled data
X3_combined=vstack([X2_combined, X3_unlabeled])
y3_combined = np.concatenate([y2_combined, pseudo_labels_3])

# Retrain the model on the combined data
model1.fit(X3_combined, y3_combined)

# Evaluate the model on the validation set
y3_pred = model1.predict(X3_test)
accuracy_3 = accuracy_score(Y3_test, y3_pred)
print(f"Accuracy after self-training on 3rd set: {accuracy_3}")

recall_1 = recall_score(Y1_test, y1_pred)
print(f"Recall after self-training on 1st set: {recall_1}")

# Calculate recall for the 2nd set
recall_2 = recall_score(Y2_test, y2_pred)
print(f"Recall after self-training on 2nd set: {recall_2}")

# Calculate recall for the 3rd set
recall_3 = recall_score(Y3_test, y3_pred)
print(f"Recall after self-training on 3rd set: {recall_3}")

Accuracy after self-training on 1st set: 0.8938618925831202
Accuracy after self-training on 2nd set: 0.8913043478260869
Accuracy after self-training on 3rd set: 0.8891730605285593
Recall after self-training on 1st set: 0.8959731543624161
Recall after self-training on 2nd set: 0.9117896522476675
Recall after self-training on 3rd set: 0.9051217464315701


In [44]:
X_train_supervised, X_test_supervised, Y_train_supervised, Y_test_supervised = train_test_split(X_labeled, Y_value, test_size = 0.2, stratify=Y, random_state=2)
model = LogisticRegression()
model.fit(X_train_supervised, Y_train_supervised)
# accuracy score on the training data
#X_train_prediction = model.predict(X_train)
#training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
# accuracy score on the test data
X_test_prediction = model.predict(X_test_supervised)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test_supervised)
print('Accuracy score of the test data : ', test_data_accuracy)
from sklearn.metrics import recall_score

# Calculate recall for the test data
recall_test = recall_score(Y_test_supervised, X_test_prediction)
print(f"Recall on the test data: {recall_test}")


Accuracy score of the test data :  0.8962773515203183
Recall on the test data: 0.9084783829309376
