In [1]:
import pandas as pd

In [2]:
#Data file reading
try:
    pse_isr_df=pd.read_csv("D:\\DSMM\\Term3\\bhavik\\Project\\pse_isr_reddit_comments.csv")
except Exception as e:
    print("Exception while reading file",e)

In [3]:
#Data Exploration
pse_isr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652892 entries, 0 to 652891
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   comment_id    652892 non-null  object
 1   score         652892 non-null  int64 
 2   self_text     652890 non-null  object
 3   subreddit     652892 non-null  object
 4   created_time  652892 non-null  object
dtypes: int64(1), object(4)
memory usage: 24.9+ MB


In [4]:
pse_isr_df['self_text'].head(5)

0    &gt;Microplastics have contaminated every corn...
1    No I mean for the past 15 years prior to Octob...
2    The West needs to stop playing Daddy and stepp...
3                                Israel was a mistake.
4    It’s the kind of argument you expect from Euro...
Name: self_text, dtype: object

In [5]:
null_counts = pse_isr_df.isnull().sum()
print("Total number of null values in the DataFrame: ", null_counts)
pse_isr_df=pse_isr_df.dropna()
pse_isr_df.count()



Total number of null values in the DataFrame:  comment_id      0
score           0
self_text       2
subreddit       0
created_time    0
dtype: int64


comment_id      652890
score           652890
self_text       652890
subreddit       652890
created_time    652890
dtype: int64

In [6]:
#Added new column 'vote' which shows upvote and downvote for the comment based on the score.
pse_isr_df['vote']=pse_isr_df['score'].apply(lambda x: 'Upvote' if x>0 else 'Downvote')
#pse_isr_df['vote']=pse_isr_df['score'].apply(lambda x: 'Upvote' if x>0 else ('Downvote' if x<0 else 'Neutral'))
pse_isr_df.head(5)


Unnamed: 0,comment_id,score,self_text,subreddit,created_time,vote
0,k8ponkf,1,&gt;Microplastics have contaminated every corn...,worldnews,2023-11-10 23:13:48+00:00,Upvote
1,k8pokm4,1,No I mean for the past 15 years prior to Octob...,IsraelPalestine,2023-11-10 23:13:14+00:00,Upvote
2,k8pogl6,1,The West needs to stop playing Daddy and stepp...,IsraelPalestine,2023-11-10 23:12:28+00:00,Upvote
3,k8pogi2,1,Israel was a mistake.,worldnewsvideo,2023-11-10 23:12:26+00:00,Upvote
4,k8pofhz,1,It’s the kind of argument you expect from Euro...,AskMiddleEast,2023-11-10 23:12:15+00:00,Upvote


In [9]:
#Using Labelencoder to convert the labels of upvote as 1 and Downvote to 0.
from sklearn.preprocessing import LabelEncoder


vote = pse_isr_df['vote'].values

encoder = LabelEncoder()
encoded_vote = encoder.fit_transform(vote)
pse_isr_df['EncodedVote']=encoded_vote
pse_isr_df.head(5)

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,vote,EncodedVote
0,k8ponkf,1,&gt;Microplastics have contaminated every corn...,worldnews,2023-11-10 23:13:48+00:00,Upvote,1
1,k8pokm4,1,No I mean for the past 15 years prior to Octob...,IsraelPalestine,2023-11-10 23:13:14+00:00,Upvote,1
2,k8pogl6,1,The West needs to stop playing Daddy and stepp...,IsraelPalestine,2023-11-10 23:12:28+00:00,Upvote,1
3,k8pogi2,1,Israel was a mistake.,worldnewsvideo,2023-11-10 23:12:26+00:00,Upvote,1
4,k8pofhz,1,It’s the kind of argument you expect from Euro...,AskMiddleEast,2023-11-10 23:12:15+00:00,Upvote,1


In [10]:
#Check whether the data is balanced or not
pse_isr_df['EncodedVote'].value_counts()

#Data is highly imbalanced

1    569948
0     82942
Name: EncodedVote, dtype: int64

In [11]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
#Data Cleaning before model building

# Initialize the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    
    # Remove special characters and lowercase the text
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lemmatization and removing stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join the tokens back into a sentence
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text


pse_isr_df['clean_text'] = pse_isr_df['self_text'].apply(clean_text)
pse_isr_df['clean_text']



0         microplastics contaminated every corner planet...
1                       mean past 15 year prior october 7th
2         west need stop playing daddy stepping set coun...
3                                            israel mistake
4         kind argument expect european people moving re...
                                ...                        
652887                                           u bullshit
652888    united state dotted west bank gaza strip altho...
652889    country sometimes map adapt country view matte...
652890    cant give something pretended support cynical ...
652891    head islamic jihad denounced arab attempt norm...
Name: clean_text, Length: 652890, dtype: object

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [14]:
vectorizer = CountVectorizer()
X = pse_isr_df['clean_text'].values
y = pse_isr_df['EncodedVote'].values
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3)
Xtrain_CV = vectorizer.fit_transform(Xtrain)

In [15]:

from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  
from collections import Counter

print("Class distribution before SMOTE:", Counter(ytrain))

# Apply SMOTE
smote = SMOTE()
Xtrain_sm, ytrain_sm = smote.fit_resample(Xtrain_CV, ytrain)

# Display the class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(ytrain_sm))
 

Class distribution before SMOTE: Counter({1: 399104, 0: 57919})
Class distribution after SMOTE: Counter({1: 399104, 0: 399104})


In [16]:
NB = MultinomialNB()
NB.fit(Xtrain_sm,ytrain_sm)

In [17]:
Xtest_CV = vectorizer.transform(Xtest)
Ypredict = NB.predict(Xtest_CV)

In [18]:
accuracyScore = accuracy_score(ytest,Ypredict)*100
print("Accuracy Score:::",accuracyScore)
report = classification_report(ytest, Ypredict)
print("Classification Report:::\n",report)

confusion = confusion_matrix(ytest, Ypredict)
print("Confusion Matrix:::\n", confusion)

Accuracy Score::: 68.6026742636585
Classification Report:::
               precision    recall  f1-score   support

           0       0.17      0.37      0.23     25023
           1       0.89      0.73      0.80    170844

    accuracy                           0.69    195867
   macro avg       0.53      0.55      0.52    195867
weighted avg       0.80      0.69      0.73    195867

Confusion Matrix:::
 [[  9156  15867]
 [ 45630 125214]]


In [25]:
msg = input("Enter Message: ")
msgInput = vectorizer.transform([msg])
predict = NB.predict(msgInput)

if(predict==0):
    print("Downvote")
else:
    print("Upvote")

Downvote


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 

In [20]:
clf = LogisticRegression().fit(Xtrain_sm, ytrain_sm)

predictions = clf.predict(Xtest_CV)

print('Accuracy score: ', format(accuracy_score(ytest, predictions)))
print('Precision score: ', format(precision_score(ytest, predictions)))
print('Recall score: ', format(recall_score(ytest, predictions)))
print('F1 score: ', format(f1_score(ytest, predictions)))
print('\nConfusion Matrix :\n', confusion_matrix(ytest, predictions)) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy score:  0.6466684025384573
Precision score:  0.8827654254037117
Recall score:  0.6860235068249397
F1 score:  0.7720577579278817

Confusion Matrix :
 [[  9458  15565]
 [ 53641 117203]]
