In [99]:
import asyncio
import asyncpraw
import pandas as pd
import nest_asyncio
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Replace with your actual credentials
client_id = ''
client_secret = '='
user_agent = 'MyRedditApp/0.1 by your_username'

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent)


In [102]:
# Apply the nest_asyncio patch to allow nested event loops
nest_asyncio.apply()

async def collect_reddit_comments(subreddit_name, keyword, limit=1000):
    reddit = asyncpraw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

    subreddit = await reddit.subreddit(subreddit_name)
    comments = []
    count = 0
    after = None

    while len(comments) < limit:
        try:
            async for submission in subreddit.search(keyword, limit=None, params={'after': after}):
                await submission.load()
                submission.comment_limit = 0
                submission.comments.replace_more(limit=0)

                for comment in submission.comments.list():
                    if isinstance(comment, asyncpraw.models.Comment):
                        author_name = comment.author.name if comment.author else '[deleted]'
                        comments.append([comment.body, author_name, comment.created_utc])
                        count += 1

                        if count >= limit:
                            break

                after = submission.id  # Set the 'after' parameter for pagination

                if count >= limit:
                    break

            if count >= limit:
                break

        except asyncpraw.exceptions.APIException as e:
            print(f"API exception occurred: {e}")
            wait_time = 60  # Wait for 1 minute before retrying
            print(f"Waiting for {wait_time} seconds before retrying...")
            await asyncio.sleep(wait_time)

    return comments[:limit]  # Return up to 'limit' number of comments

async def main():
    comments = await collect_reddit_comments('sarcasm', 'sarcastic', limit=5000)  # Adjust limit as needed
    df = pd.DataFrame(comments, columns=['comment', 'author', 'created_utc'])
    df.to_csv('reddit_comments.csv', index=False)
    print(f"Total comments collected: {len(df)}")
    print(df.head())

# Run the main function
await main()


  submission.comments.replace_more(limit=0)
ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7b60858edbd0>
ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7b6085c0c640>


Total comments collected: 5000
                                             comment                author  \
0  Woops, I dropped my monster condom for my magn...               manwae1   
1                                 That's disgusting.  DelightfulHelper9204   
2  There was an episode of Always Sunny where Fra...    Either-Computer635   
3  It only works if theyre used. That's how they ...            shits4gigs   
4                            Try doing it ironically     East_Bicycle_9283   

    created_utc  
0  1.719068e+09  
1  1.719052e+09  
2  1.719074e+09  
3  1.719117e+09  
4  1.719065e+09  


In [100]:


def clean_comment(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'/u/\w+', '', text)  # Remove user mentions
    text = re.sub(r'r/\w+', '', text)  # Remove subreddit mentions
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text.lower()

# Load data
df = pd.read_csv('reddit_comments.csv')
df['cleaned_comment'] = df['comment'].apply(clean_comment)

# Manually label the data (for demonstration, we'll assume labels are provided)
# 0 for non-sarcastic, 1 for sarcastic
labels = [0, 1] * (len(df) // 2)
if len(labels) < len(df):
    labels.append(0)  # Add one more label to match the length

df['label'] = labels

# Remove rows with empty or NaN comments
df = df.dropna(subset=['cleaned_comment'])
df = df[df['cleaned_comment'].str.strip() != '']

# Save the labeled data
df.to_csv('labeled_reddit_comments.csv', index=False)


In [101]:

# Load labeled data
df = pd.read_csv('labeled_reddit_comments.csv')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_comment'], df['label'], test_size=0.2, random_state=42)

# Define a pipeline with a Random Forest classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'tfidf__max_features': [10000, 20000, None],  # Adjusted limit for max_features
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [None, 10],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2]
}


# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, error_score='raise')
grid_search.fit(X_train, y_train)  # Pass X_train directly here

# Evaluate the best model from GridSearchCV
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print evaluation metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))




Fitting 5 folds for each of 48 candidates, totalling 240 fits


ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7b6085f9da20>


Accuracy: 0.9746707193515705
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       522
           1       0.97      0.98      0.97       465

    accuracy                           0.97       987
   macro avg       0.97      0.97      0.97       987
weighted avg       0.97      0.97      0.97       987



In [77]:
import joblib

# Save the trained model
joblib.dump(model, 'trained_model.pkl')

# Save the TfidfVectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved as 'trained_model.pkl' and 'tfidf_vectorizer.pkl' respectively.")


Model and vectorizer saved as 'trained_model.pkl' and 'tfidf_vectorizer.pkl' respectively.


In [103]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the trained model and vectorizer
model = joblib.load('trained_model.pkl')  # Replace with your actual model file
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Replace with your actual vectorizer file

# Sample text to test
sample_text = "Sure, because taking advice from a microwave manual is exactly how I planned my day."

# Replace with your actual preprocessing steps
def preprocess_text(text):

    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char == ' '])  # Remove punctuation
    return text

# Preprocess and vectorize the sample text
preprocessed_text = preprocess_text(sample_text)
X_sample = vectorizer.transform([preprocessed_text])

# Predict using the model
prediction = model.predict(X_sample)[0]

# Interpret the prediction
if prediction == 1:
    print(f"The text '{sample_text}' is sarcastic.")
else:
    print(f"The text '{sample_text}' is not sarcastic.")



The text 'Sure, because taking advice from a microwave manual is exactly how I planned my day.' is sarcastic.


In [107]:
model = joblib.load('trained_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

async def fetch_reddit_post_comments(submission_url, limit=5):
    reddit = asyncpraw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

    submission = await reddit.submission(url=submission_url)
    await submission.load()

    comments = []
    async for comment in submission.comments:
        if isinstance(comment, asyncpraw.models.Comment):
            comments.append(comment.body)
            if len(comments) >= limit:
                break

    return comments[:limit]

async def main():
    submission_url = 'https://www.reddit.com/r/pics/comments/1dq9ats/after_the_presidential_debate_joe_biden_greeted/'
    comments = await fetch_reddit_post_comments(submission_url, limit=10)

    # Preprocess comments and predict with the model
    preprocessed_comments = [preprocess_text(comment) for comment in comments]
    X_comments = vectorizer.transform(preprocessed_comments)
    predictions = model.predict(X_comments)

    # Filter sarcastic comments
    sarcastic_comments = [comments[i] for i in range(len(comments)) if predictions[i] == 1]

    print("Top 10 comments:")
    for idx, comment in enumerate(comments, start=1):
        print(f"{idx}. {comment}")

    print("\nSarcastic comments detected:")
    for idx, sarcastic_comment in enumerate(sarcastic_comments, start=1):
        print(f"{idx}. {sarcastic_comment}")

# Function to preprocess text (adjust based on your preprocessing steps)
def preprocess_text(text):
    # Example: lowercase and remove punctuation
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char == ' '])  # Remove punctuation
    return text

# Run the main function
await main()

Top 10 comments:
1. It looks like this post is about Politics. Various methods of filtering out content relating to Politics can be found [here](https://www.reddit.com/r/pics/wiki/v2/resources/filter/politics).

*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/pics) if you have any questions or concerns.*
2. The debate was a national embarrassment
3. Why can't I see the other comments
4. They should be sitting on a porch playing with grandchildren not running a country. Where are the young political leaders? Or even the 60 year olds?
5. Politics aside, I think we can all agree last nights debate was an utter shambles and if this is the best we can come up with out of 333M Americans, then god help us.

Edit. The “god” part is clearly a figure of speech people, please chill.
6. I still can’t believe these are our two options
7. This is all just so incredibly sad.
8. She needed to help him down the stairs (

  comments = await fetch_reddit_post_comments(submission_url, limit=10)
