In [None]:
!pip install pandas

# Import pandas
# Define the paths of the files stored in the Data folder, then load them

import pandas as pd

# Locations of the train, validation, and test splits inside the Data folder
train_file, valid_file, test_file = (
    './Data/train.csv',
    './Data/valid.csv',
    './Data/test.csv',
)

# Read each split into its own pandas DataFrame (train first, then valid, then test)
train_df, valid_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, valid_file, test_file)
)

#Preprocess the csv file text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    Loading the stop-word list and constructing the lemmatizer do not depend
    on the text being cleaned, so they are created once and cached instead of
    being rebuilt for every row passed to ``preprocess_text``.
    """
    return frozenset(stopwords.words("english")), WordNetLemmatizer()


# Function to clean and preprocess text
def preprocess_text(text):
    """Clean one piece of text and return it as a space-joined token string.

    Steps: replace every character outside ``a-z``/``A-Z`` with a space,
    lowercase, split on whitespace, drop English stop words, lemmatize each
    remaining token with ``WordNetLemmatizer``, and join with single spaces.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning or
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; a missing value has no tokens.
        return ''
    stops, lemmatizer = _text_resources()
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stops)

# Clean the 'statement' column of every split and store the result
# in a new 'cleaned_text' column on the same DataFrame
for frame in (train_df, valid_df, test_df):
    frame['cleaned_text'] = frame['statement'].apply(preprocess_text)

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer capped at 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vocabulary on the training text only, then reuse the fitted
# vectorizer for the validation and test text; all results are dense arrays
X_train = vectorizer.fit_transform(train_df['cleaned_text']).toarray()
X_valid, X_test = (
    vectorizer.transform(frame['cleaned_text']).toarray()
    for frame in (valid_df, test_df)
)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Build the binary target from the numeric 'label' column: classes 0, 1 and 2
# become 1 and classes 3, 4 and 5 become 0. The earlier string-keyed mapping
# ({'truth': 1, 'disinformation': 0}) was removed because its result was
# always overwritten by this numeric mapping before being used.
# NOTE(review): the original comment described the target as
# 0: disinformation, 1: truth — confirm this grouping against the dataset's
# label definitions.
LABEL_TO_BINARY = {0: 1, 1: 1, 2: 1, 3: 0, 4: 0, 5: 0}

y_train = train_df['label'].map(LABEL_TO_BINARY)
y_valid = valid_df['label'].map(LABEL_TO_BINARY)
y_test = test_df['label'].map(LABEL_TO_BINARY)

# Series.map yields NaN for any label missing from the mapping; fail here with
# a clear message rather than later inside model fitting or scoring.
for split_name, labels in (('train', y_train), ('valid', y_valid), ('test', y_test)):
    if labels.isna().any():
        raise ValueError(
            f"Unmapped values in the 'label' column of the {split_name} split"
        )

# Fit a logistic-regression classifier on the training features
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression().fit(X_train, y_train)

# Score the validation split: accuracy first, then the per-class report
y_valid_pred = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy: {valid_accuracy}")
valid_report = classification_report(y_valid, y_valid_pred)
print(f"Classification Report on Validation Data:\n {valid_report}")

# Produce predictions for the held-out test split
y_test_pred = model.predict(X_test)

# Score the test split in the same way
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")
test_report = classification_report(y_test, y_test_pred)
print(f"Classification Report on Test Data:\n {test_report}")