In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of the Google Drive folder that holds the training CSV files.
folder_id = '13F9CFT1LSEWeMtNUScOYNRqndhpkYCiu'

# List every non-trashed file that sits directly inside that folder.
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

# Keep only the entries whose Drive title ends in ".csv".
csv_files = [file for file in file_list if file['title'].endswith('.csv')]

# Download each CSV into the working directory (named after its Drive title) and load it.
dfs = []
for file in csv_files:
    file_id = file['id']
    downloaded = drive.CreateFile({'id': file_id})
    local_path = downloaded['title']
    downloaded.GetContentFile(local_path)
    dfs.append(pd.read_csv(local_path))

# Stack all files into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows missing either the text or its label: a missing label would
# otherwise reach the classifier's fit() as NaN mixed in with string classes.
combined_df.dropna(subset=['Statement', 'Type of Fallacy'], inplace=True)

# 80/20 train/test split with a fixed seed so the split is repeatable.
X = combined_df['Statement']
y = combined_df['Type of Fallacy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# The weak learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` (old name removed in 1.4), but it
# is the first positional parameter under both names, so this form runs on either.
# random_state makes the boosted trees (and the printed accuracy) reproducible.
adaboost_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    learning_rate=0.3,
    random_state=42,
)
adaboost_classifier.fit(X_train_tfidf, y_train)

# Accuracy on the held-out 20%.
adaboost_accuracy = accuracy_score(y_test, adaboost_classifier.predict(X_test_tfidf))
print("AdaBoost Classifier Accuracy:", adaboost_accuracy)



AdaBoost Classifier Accuracy: 0.6587550380653829


In [None]:
# Function to predict fallacy type with probabilities
def predict_fallacy_probabilities(statement):
    """Return a ``{fallacy label: probability}`` dict for a single statement.

    Relies on the notebook-level ``vectorizer`` and ``adaboost_classifier``;
    both must come from the same training run so their feature spaces match.

    Args:
        statement: Raw text to score.

    Returns:
        dict keyed by the entries of ``adaboost_classifier.classes_`` (in that
        order), each mapped to the corresponding ``predict_proba`` value.
    """
    features = vectorizer.transform([statement])
    # predict_proba yields one row per input; only one statement was passed in.
    class_probabilities = adaboost_classifier.predict_proba(features)[0]
    return dict(zip(adaboost_classifier.classes_, class_probabilities))

# Demo: score one hand-written statement and list the probability of every label.
statement = "Your argument is invalid because you are too young to understand."
predicted_probabilities = predict_fallacy_probabilities(statement)

print("Predicted Fallacy Probabilities:")
for fallacy_type in predicted_probabilities:
    probability = predicted_probabilities[fallacy_type]
    print(f"{fallacy_type}: {probability}")

Predicted Fallacy Probabilities:
Anecdotal: 0.00014761598844381148
Appeal to Authority Fallacy: 5.6839695712700756e-05
Appeal to Nature Fallacy: 0.10029262941974744
Appeal to Worse Problems Fallacy: 0.0008673094117773975
Bandwagon Fallacy: 0.0027795042558456406
Circular Reasoning: 0.024002073726587848
False Cause: 0.007716197438610764
Genetic Fallacy: 0.2158381174223862
Irrelevant Authority: 0.049167108663016795
Loaded Question: 7.895870138441785e-06
Middle Ground Fallacy: 0.0006198682527265694
Non Sequitur: 0.1429043664773341
Personal Incredulity: 0.14481749939462085
Special Pleading: 0.2177566263313954
The Gambler's Fallacy: 0.09302634765165607


In [None]:
# Statement to classify.
new_statement = "Attacking an opposing individual instead of their argument. For example, dismissing climate activist Greta Thunberg due to her young age instead of her ideas."

# Vectorise with the fitted TF-IDF vocabulary, then ask the classifier for a label.
# NOTE: `vectorizer` and `adaboost_classifier` are rebound by several cells in this
# notebook; they must originate from the same training cell, otherwise predict()
# rejects the input with a feature-count ValueError.
new_statement_tfidf = vectorizer.transform([new_statement])
predicted_fallacy = adaboost_classifier.predict(new_statement_tfidf)

# predict() returns one label per input row; show the only one.
print("Predicted Fallacy Type:", predicted_fallacy[0])

ValueError: X has 4854 features, but AdaBoostClassifier is expecting 5467 features as input.

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of the Google Drive folder that holds the training CSV files.
folder_id = '13F9CFT1LSEWeMtNUScOYNRqndhpkYCiu'

# List every non-trashed file that sits directly inside that folder.
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

# Keep only the entries whose Drive title ends in ".csv".
csv_files = [file for file in file_list if file['title'].endswith('.csv')]

# Download each CSV into the working directory (named after its Drive title) and load it.
dfs = []
for file in csv_files:
    file_id = file['id']
    downloaded = drive.CreateFile({'id': file_id})
    local_path = downloaded['title']
    downloaded.GetContentFile(local_path)
    dfs.append(pd.read_csv(local_path))

# Stack all files into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows missing either the text or its label: a missing label would
# otherwise reach the classifier's fit() as NaN mixed in with string classes.
combined_df.dropna(subset=['Statement', 'Type of Fallacy'], inplace=True)

# No train/test split here: the vocabulary and the model are fit on every row,
# so this model has no held-out data to be evaluated against.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_df['Statement'])
y = combined_df['Type of Fallacy']

# The weak learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` (old name removed in 1.4), but it
# is the first positional parameter under both names, so this form runs on either.
# random_state makes the fitted ensemble (and its probabilities) reproducible.
adaboost_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    learning_rate=0.3,
    random_state=42,
)
adaboost_classifier.fit(X, y)

# Function to predict fallacy type with probabilities
def predict_fallacy_probabilities(statement):
    """Score one statement against every fallacy label.

    Reads the notebook-level ``vectorizer`` and ``adaboost_classifier``.

    Args:
        statement: Raw text to score.

    Returns:
        dict mapping each label in ``adaboost_classifier.classes_`` to the
        probability ``predict_proba`` assigns it for this statement.
    """
    # A single statement goes in, so only the first (and only) output row is needed.
    row = adaboost_classifier.predict_proba(vectorizer.transform([statement]))[0]
    return {label: row[position] for position, label in enumerate(adaboost_classifier.classes_)}

# Demo: score one hand-written statement with the model trained on the full dataset.
statement = "Your argument is invalid because you are too young to understand."
predicted_probabilities = predict_fallacy_probabilities(statement)

print("Predicted Fallacy Probabilities:")
for fallacy_type in predicted_probabilities:
    probability = predicted_probabilities[fallacy_type]
    print(f"{fallacy_type}: {probability}")




Predicted Fallacy Probabilities:
Anecdotal: 8.732886113761646e-05
Appeal to Authority Fallacy: 4.477275353950533e-05
Appeal to Nature Fallacy: 0.11287061738996047
Appeal to Worse Problems Fallacy: 0.0005016663735114283
Bandwagon Fallacy: 0.0017287259533690178
Circular Reasoning: 0.03060675344121273
False Cause: 0.008728520742472727
Genetic Fallacy: 0.1480376918452858
Irrelevant Authority: 0.035348529931279044
Loaded Question: 1.2885542666625987e-05
Middle Ground Fallacy: 0.0003564150659120162
Non Sequitur: 0.0665182115856579
Personal Incredulity: 0.10289331469760893
Special Pleading: 0.4185527994595116
The Gambler's Fallacy: 0.07371176635687457


In [None]:
# Re-vectorise the raw test statements with the vectorizer that is bound *now*.
# The previously cached X_test_tfidf was built by an earlier cell's vectorizer,
# whereas the cell above refits both `vectorizer` and `adaboost_classifier` on
# the full dataset; feeding the stale matrix to the new classifier mismatches
# the feature space (and fails outright when the vocabulary sizes differ).
# NOTE: when the classifier was trained on every row (as in the cell above),
# X_test is part of its training data, so these are not held-out predictions.
X_eval_tfidf = vectorizer.transform(X_test)

# Class-probability matrix: one row per statement, one column per entry of classes_.
y_pred_proba = adaboost_classifier.predict_proba(X_eval_tfidf)

# Show the probability rows of the first few samples.
print("Predicted Probabilities:")
print(y_pred_proba[:5])

# The predicted class of each row is the label whose column holds the largest probability.
predicted_classes = adaboost_classifier.classes_[y_pred_proba.argmax(axis=1)]

print("Predicted Classes:")
print(predicted_classes[:5])

# "Similarity percentage" here is simply the winning class's probability, scaled to 0-100.
similarity_percentage = y_pred_proba.max(axis=1) * 100

print("Similarity Percentage:")
print(similarity_percentage[:5])


Predicted Probabilities:
[[2.66038404e-04 9.25643894e-05 2.45131831e-02 2.35089155e-03
  6.83522852e-03 1.76143560e-02 3.50603559e-02 1.82139874e-01
  4.62438553e-02 1.21202226e-05 1.64130615e-03 2.56660558e-02
  1.91438583e-01 1.71921515e-01 2.94204073e-01]
 [8.31489597e-02 1.19967887e-04 2.28457794e-02 1.85182702e-03
  2.02704421e-03 2.87596265e-01 4.03697736e-02 1.11252856e-02
  1.11438261e-01 1.26996733e-05 2.28136508e-04 1.77810370e-01
  1.60356689e-01 9.54158269e-02 5.65311456e-03]
 [1.10916520e-04 5.99276973e-05 2.17475700e-03 4.20483582e-01
  8.41617444e-03 1.15103367e-03 1.04631154e-01 5.68584170e-02
  2.34176505e-01 6.96188314e-06 2.14548782e-03 8.80978801e-03
  7.13025698e-02 8.00307316e-02 9.64199344e-03]
 [2.22899981e-04 2.06234319e-05 7.38429761e-04 6.04827622e-01
  8.11286108e-04 2.28956928e-04 5.47897022e-02 2.73544545e-02
  2.52117472e-02 4.48528741e-06 7.50253170e-04 4.26293142e-03
  1.17285342e-01 1.51209067e-01 1.22821990e-02]
 [1.51820053e-01 4.05815265e-04 1.23993

In [None]:
# Import necessary libraries
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of the Google Drive folder that holds the training CSV files.
folder_id = '13F9CFT1LSEWeMtNUScOYNRqndhpkYCiu'

# List every non-trashed file that sits directly inside that folder.
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

# Keep only the entries whose Drive title ends in ".csv".
csv_files = [file for file in file_list if file['title'].endswith('.csv')]

# Download each CSV into the working directory (named after its Drive title) and load it.
dfs = []
for file in csv_files:
    file_id = file['id']
    downloaded = drive.CreateFile({'id': file_id})
    local_path = downloaded['title']
    downloaded.GetContentFile(local_path)
    dfs.append(pd.read_csv(local_path))

# Stack all files into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows missing either the text or its label: a missing label would
# otherwise reach the classifier's fit() as NaN mixed in with string classes.
combined_df.dropna(subset=['Statement', 'Type of Fallacy'], inplace=True)

# 80/20 train/test split with a fixed seed so the split is repeatable.
X = combined_df['Statement']
y = combined_df['Type of Fallacy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text.
# NOTE: the vectorizer sits outside the pipeline, so it is fit once on all of
# X_train rather than refit inside each cross-validation fold.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# scikit-learn renamed AdaBoost's weak-learner parameter from `base_estimator`
# to `estimator` and later removed the old name. Both the constructor keyword
# and the nested grid key depend on that name, so pick whichever the installed
# version exposes (preferring the new one when both exist).
adaboost_estimator_param = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)

# Pipeline: chi-squared feature selection followed by AdaBoost over decision trees.
# random_state makes the search results and the final accuracy reproducible.
pipeline = Pipeline([
    ('feature_selection', SelectKBest(chi2)),
    ('classifier', AdaBoostClassifier(
        random_state=42,
        **{adaboost_estimator_param: DecisionTreeClassifier()},
    )),
])

# Hyperparameter grid explored by the search.
param_grid = {
    'feature_selection__k': [100, 300, 500],  # Number of TF-IDF features kept
    f'classifier__{adaboost_estimator_param}__max_depth': [1, 2],  # Shallow trees as weak learners
    'classifier__n_estimators': [50, 100, 200],  # Number of boosting rounds
    'classifier__learning_rate': [0.01, 0.1, 0.3]  # Learning rates
}

# 5-fold cross-validated grid search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# The best parameter combination, refit on the whole training set.
best_model = grid_search.best_estimator_

# Accuracy of that model on the held-out 20%.
best_model_accuracy = accuracy_score(y_test, best_model.predict(X_test_tfidf))
print("Best Model Accuracy:", best_model_accuracy)



Best Model Accuracy: 0.8889386475593373


In [None]:
# Import necessary libraries
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK corpora used by the text preprocessing below.
nltk.download('stopwords')
nltk.download('wordnet')

# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of the Google Drive folder that holds the training CSV files.
folder_id = '13F9CFT1LSEWeMtNUScOYNRqndhpkYCiu'

# List every non-trashed file that sits directly inside that folder.
file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()

# Keep only the entries whose Drive title ends in ".csv".
csv_files = [file for file in file_list if file['title'].endswith('.csv')]

# Download each CSV into the working directory (named after its Drive title) and load it.
dfs = []
for file in csv_files:
    file_id = file['id']
    downloaded = drive.CreateFile({'id': file_id})
    local_path = downloaded['title']
    downloaded.GetContentFile(local_path)
    dfs.append(pd.read_csv(local_path))

# Stack all files into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows missing either the text or its label: a missing label would
# otherwise reach the classifier's fit() as NaN mixed in with string classes.
combined_df.dropna(subset=['Statement', 'Type of Fallacy'], inplace=True)

# Text preprocessing function
# Built once and shared by every call. The previous version re-read the NLTK
# stop-word list for each individual token and created a new lemmatizer per call.
# A frozenset also turns each membership test into a hash lookup.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Lower-case, stop-word-filter and lemmatize a piece of text.

    The text is split on whitespace; every token is lower-cased, tokens found
    in NLTK's English stop-word list are discarded, and the remaining tokens
    are passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(
        _LEMMATIZER.lemmatize(token)
        for token in lowered_tokens
        if token not in _ENGLISH_STOPWORDS
    )

# Clean every statement (lower-case, stop-word removal, lemmatization).
# This overwrites the 'Statement' column with the cleaned text.
combined_df['Statement'] = combined_df['Statement'].apply(preprocess_text)

# 80/20 train/test split with a fixed seed so the split is repeatable.
X = combined_df['Statement']
y = combined_df['Type of Fallacy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text.
# NOTE: the vectorizer sits outside the pipeline, so it is fit once on all of
# X_train rather than refit inside each cross-validation fold.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Pipeline: chi-squared feature selection followed by a random forest.
# The forest is seeded; without random_state every run grows different trees,
# so the selected hyperparameters and the reported metrics were not reproducible.
pipeline = Pipeline([
    ('feature_selection', SelectKBest(chi2)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid explored by the search.
param_grid = {
    'feature_selection__k': [100, 300, 500],
    'classifier__max_depth': [10, 20, 30],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated grid search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# The best parameter combination, refit on the whole training set.
best_model = grid_search.best_estimator_

# Score the held-out 20%: overall accuracy plus per-class precision/recall/F1.
predictions = best_model.predict(X_test_tfidf)
best_model_accuracy = accuracy_score(y_test, predictions)

print("Best Model Accuracy:", best_model_accuracy)
print("Classification Report:\n", classification_report(y_test, predictions))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Best Model Accuracy: 0.9090909090909091
Classification Report:
                                   precision    recall  f1-score   support

                       Anecdotal       1.00      0.87      0.93       118
     Appeal to Authority Fallacy       1.00      1.00      1.00        97
        Appeal to Nature Fallacy       1.00      0.99      0.99        99
Appeal to Worse Problems Fallacy       1.00      1.00      1.00       647
               Bandwagon Fallacy       0.97      0.95      0.96       128
              Circular Reasoning       0.61      1.00      0.76       266
                     False Cause       0.98      0.88      0.92       112
                 Genetic Fallacy       0.99      0.73      0.84        96
            Irrelevant Authority       0.95      0.77      0.85       113
                 Loaded Question       0.99      0.97      0.98       127
           Middle Ground Fallacy       1.00      0.96      0.98        23
                    Non Sequitur       0.88    

In [None]:
# Function to predict the probability of each type of fallacy in a sentence
def predict_fallacy_probabilities(sentence, vectorizer, model):
    """Return the model's class-probability vector for a single sentence.

    The sentence is cleaned with ``preprocess_text`` (the same cleaning applied
    to the training statements) before being vectorised.

    Note: this redefines the one-argument ``predict_fallacy_probabilities`` from
    earlier cells and, unlike it, returns a bare probability array, not a dict.

    Args:
        sentence: Raw text to score.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted estimator exposing ``predict_proba``.

    Returns:
        The first (only) row of ``model.predict_proba`` for the sentence.
    """
    features = vectorizer.transform([preprocess_text(sentence)])
    return model.predict_proba(features)[0]

# Demo: per-class probabilities for one example sentence.
input_sentence = "If we make an exception for Bijal’s service dog, then other people will want to bring their dogs. Then everybody will bring their dog, and before you know it, our restaurant will be overrun with dogs, their slobber, their hair, and all the noise they make, and nobody will want to eat here anymore."

# Class labels, read from best_model.classes_ and paired positionally with the probabilities below.
fallacy_types = best_model.classes_
fallacy_probabilities = predict_fallacy_probabilities(input_sentence, vectorizer, best_model)

# One line per fallacy label, probability rounded to four decimals.
print("Probabilities for each type of fallacy:")
for fallacy, probability in zip(fallacy_types, fallacy_probabilities):
    print(f"{fallacy}: {probability:.4f}")

Probabilities for each type of fallacy:
Anecdotal: 0.0292
Appeal to Authority Fallacy: 0.0310
Appeal to Nature Fallacy: 0.0125
Appeal to Worse Problems Fallacy: 0.0270
Bandwagon Fallacy: 0.0512
Circular Reasoning: 0.1995
False Cause: 0.0566
Genetic Fallacy: 0.0819
Irrelevant Authority: 0.0983
Loaded Question: 0.0497
Middle Ground Fallacy: 0.0052
Non Sequitur: 0.0631
Personal Incredulity: 0.1739
Special Pleading: 0.0946
The Gambler's Fallacy: 0.0263


In [None]:
# Function to predict the type of fallacy in a sentence
def predict_fallacy(sentence, vectorizer, model):
    """Return the single fallacy label the model predicts for a sentence.

    The sentence is cleaned with ``preprocess_text`` (the same cleaning applied
    to the training statements) before being vectorised.

    Args:
        sentence: Raw text to classify.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted estimator exposing ``predict``.

    Returns:
        The first (only) element of ``model.predict`` for the sentence.
    """
    features = vectorizer.transform([preprocess_text(sentence)])
    return model.predict(features)[0]

# Demo: predicted label for one example sentence.
input_sentence = "If we make an exception for Bijal’s service dog, then other people will want to bring their dogs. Then everybody will bring their dog, and before you know it, our restaurant will be overrun with dogs, their slobber, their hair, and all the noise they make, and nobody will want to eat here anymore."

fallacy_type = predict_fallacy(input_sentence, vectorizer, best_model)

print(f"The predicted fallacy for the input sentence is: {fallacy_type}")

The predicted fallacy for the input sentence is: Circular Reasoning
