In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Location of the preprocessed dataset CSV.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run where this exact file exists.
DATA_PATH = 'C:/Users/Rudra Thakar/Jupyter/preprocessed_IFND_dataset.csv'

df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell output.
df.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
print("Dataset Info:")
df.info()

# Give the 'Unnamed: 0' and 'Web' columns descriptive names in a single call.
# Reassigning the result (rather than inplace=True) keeps the cell safe to
# re-run: labels that are no longer present are simply ignored by rename().
df = df.rename(columns={'Unnamed: 0': 'Serial. No', 'Web': 'News Media'})

In [None]:
# Count the null entries in every column and report them as text.
missing_values = df.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Show the same counts as a bar chart, one bar per column.
fig, ax = plt.subplots(figsize=(8, 5))
missing_values.plot.bar(ax=ax, color='skyblue')
ax.set_title('Missing Values per Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
# Light dashed horizontal guides make the bar heights easier to read.
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Number of rows per value of the 'Label' column.
label_counts = df['Label'].value_counts()
print("\nLabel Distribution:")
print(label_counts)

# Pie chart of the label shares, annotated with percentages.
fig, ax = plt.subplots(figsize=(4, 4))  # deliberately small figure
ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#66b3ff', '#ff9999'],
)
ax.set_title('Label Distribution (Percentage)')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
fig.tight_layout()
plt.show()

In [None]:
# Number of whitespace-separated words in each statement.
# Missing statements are treated as empty text so they count as 0 words;
# str(NaN) would otherwise become the token 'nan' and be counted as one word.
df['Statement_Length'] = (
    df['Statement'].fillna('').astype(str).str.split().str.len()
)

# Kernel density estimate of statement length, one unfilled curve per label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=df, x='Statement_Length', hue='Label', fill=False, ax=ax)
ax.set_title('Statement Length Distribution by Label')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Density')
plt.show()

In [None]:
# 'Statement_Length' was already computed in the previous cell, so it is
# reused here instead of being recomputed.

# common_norm=False normalizes each label's density independently, so the
# curve shapes can be compared even when the labels have different numbers
# of statements.
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df, x='Statement_Length', hue='Label', common_norm=False)
plt.title('Normalized Statement Length Distribution by Label')
plt.xlabel('Number of Words')
plt.ylabel('Density')
plt.show()

Sources Publishing Real News

In [None]:
# Sources of rows labelled 'TRUE', ordered from most to least frequent.
real_mask = df['Label'] == 'TRUE'
real_source_counts = df.loc[real_mask, 'News Media'].value_counts()
real_order = real_source_counts.sort_values(ascending=False).index

In [None]:
# Horizontal bar chart: number of 'TRUE' rows per source, most frequent first.
real_news = df[df['Label'] == 'TRUE']

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=real_news,
    y='News Media',
    order=real_order,
    palette='summer',
    ax=ax,
)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Source', fontsize=12)
ax.set_title('Sources of Real News', fontsize=15)
plt.show()


Sources of Fake News

In [None]:
# Sources of rows labelled 'Fake', ordered from most to least frequent.
fake_mask = df['Label'] == 'Fake'
fake_source_counts = df.loc[fake_mask, 'News Media'].value_counts()
fake_order = fake_source_counts.sort_values(ascending=False).index

In [None]:
# Horizontal bar chart: number of 'Fake' rows per source, most frequent first.
fake_news = df[df['Label'] == 'Fake']

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=fake_news,
    y='News Media',
    order=fake_order,
    palette='autumn',
    ax=ax,
)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Source', fontsize=12)
ax.set_title('Sources of Fake News', fontsize=20)
plt.show()

Common Sources of Fake and Real News

In [None]:
# Sources that appear under both labels.
# The per-label source lists are computed once up front; filtering the whole
# DataFrame inside the loop repeated the same work for every fake source.
# Missing sources are dropped explicitly so NaN can never be reported as a
# "common" source.
fake_sources = df.loc[df['Label'] == 'Fake', 'News Media'].dropna().unique()
real_sources = set(df.loc[df['Label'] == 'TRUE', 'News Media'].dropna().unique())

# Keep the order in which the sources first appear among the fake rows.
new = [source for source in fake_sources if source in real_sources]
print(new)

In [None]:
def _source_if_common(source):
    """Return `source` when it is listed in `new`, otherwise the placeholder 0."""
    if source in new:
        return source
    return 0


# 'common' holds the source name for shared sources and 0 for all other rows.
df['common'] = df['News Media'].apply(_source_if_common)

In [None]:
# Keep only the rows whose 'common' value is not the placeholder 0.
is_common_source = df['common'].ne(0)
df1 = df[is_common_source]

In [None]:
# Per-source row counts for the shared sources, split by label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df1, y='common', hue='Label', palette='viridis', ax=ax)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Source', fontsize=12)
ax.legend(loc='best', title='Label', fontsize=10)
ax.set_title('Common Sources of Real and Fake News', fontsize=20)
plt.show()

In [None]:
def top_words(statements, n_words=15):
    """Return the most frequent non-stop-words in a collection of statements.

    Parameters
    ----------
    statements : pandas.Series
        Text documents. Missing entries are dropped and the remaining values
        are cast to str, because CountVectorizer raises on NaN documents.
    n_words : int, default 15
        Vocabulary size, passed to CountVectorizer as ``max_features``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Word' and 'Count', sorted by 'Count' in descending order.
    """
    documents = statements.dropna().astype(str)
    # English stop words are removed so the ranking shows content words.
    counter = CountVectorizer(stop_words='english', max_features=n_words)
    doc_term = counter.fit_transform(documents)
    # Sum each vocabulary column on the sparse matrix directly; there is no
    # need to materialise a dense documents-by-words array first.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    ranking = pd.DataFrame(
        {'Word': counter.get_feature_names_out(), 'Count': totals}
    )
    return ranking.sort_values('Count', ascending=False)


# CountVectorizer is imported from sklearn.feature_extraction.text in the
# notebook's import cell.
true_statements = df[df['Label'] == 'TRUE']['Statement']
fake_statements = df[df['Label'] == 'Fake']['Statement']

# Word-frequency tables, one per label.
true_df = top_words(true_statements)
fake_df = top_words(fake_statements)

# Print the top 15 words
print("Top 15 Words in TRUE News:")
print(true_df)
print("\nTop 15 Words in Fake News:")
print(fake_df)

# Plot both rankings side by side: TRUE on the left, Fake on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    (axes[0], true_df, 'Blues_d', 'Top 15 Words in TRUE News'),
    (axes[1], fake_df, 'Reds_d', 'Top 15 Words in Fake News'),
]
for ax, word_df, palette, title in panels:
    sns.barplot(x='Count', y='Word', data=word_df, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Word')

fig.tight_layout()
plt.show()