In [3]:
import dagshub
import mlflow
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE 
from imblearn.pipeline import Pipeline

# Load Dataset

In [4]:
pwd

'D:\\reddit-sentiment-analysis\\experimentations'

In [5]:
# load the data
PATH = r"../data/reddit_cleaned_data.csv"
df = pd.read_csv(PATH)

df.head()

Unnamed: 0,comment,sentiment,stop_words_count,neg_stop_words_count,word_count,named_sentiment,sentence_length,response,nouns,verbs,adjectives
0,family mormon never tried explain still stare ...,1,13,0,39,positive,Long,0,16,4,2
1,buddhism much lot compatible christianity espe...,1,59,3,196,positive,Very Long,0,58,30,24
2,seriously say thing first get complex explain ...,-1,40,0,86,negative,Very Long,0,18,7,10
3,learned want teach different focus goal wrappi...,0,15,1,29,neutral,Long,0,7,4,2
4,benefit may want read living buddha living chr...,1,45,1,112,positive,Very Long,0,20,19,12


In [6]:
# Only use the comments column for now

final_df = df[['comment','named_sentiment']].copy()

final_df

Unnamed: 0,comment,named_sentiment
0,family mormon never tried explain still stare ...,positive
1,buddhism much lot compatible christianity espe...,positive
2,seriously say thing first get complex explain ...,negative
3,learned want teach different focus goal wrappi...,neutral
4,benefit may want read living buddha living chr...,positive
...,...,...
29746,hona hai vaccination education insurance end m...,neutral
29747,agree push make nation either pity pakistan in...,negative
29748,jesus,neutral
29749,downvote karna tha par upvote hogaya,neutral


In [7]:
# check for missing values

final_df.isna().sum()

comment            180
named_sentiment      0
dtype: int64

In [8]:
# missing values

final_df.loc[final_df['comment'].isna()]

Unnamed: 0,comment,named_sentiment
255,,neutral
620,,neutral
678,,neutral
685,,neutral
784,,neutral
...,...,...
29411,,neutral
29571,,neutral
29598,,neutral
29702,,neutral


In [9]:
# remove the rows having missing values

print("Rows in data before removing missing values", final_df.shape[0])

final_df = final_df.dropna()

print("Rows in data after removing missing values", final_df.shape[0])

Rows in data before removing missing values 29751
Rows in data after removing missing values 29571


In [10]:
# check for duplicates

final_df.loc[final_df['comment'].duplicated(keep=False)].sort_values('comment')

Unnamed: 0,comment,named_sentiment
11011,aadhar,neutral
15254,aadhar,neutral
20346,aap,neutral
2346,aap,neutral
3110,aap,neutral
...,...,...
28187,yy,neutral
28172,yy,neutral
28081,yy,neutral
2537,zor bolo,neutral


In [11]:
# delete duplicates from data

final_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop_duplicates(inplace=True)


In [12]:
final_df.isna().sum()

comment            0
named_sentiment    0
dtype: int64

# Experimentation

In [16]:
# make X and y

X = final_df['comment']
y = final_df['named_sentiment']
X

0        family mormon never tried explain still stare ...
1        buddhism much lot compatible christianity espe...
2        seriously say thing first get complex explain ...
3        learned want teach different focus goal wrappi...
4        benefit may want read living buddha living chr...
                               ...                        
29745                        let janta decide ulema cleric
29746    hona hai vaccination education insurance end m...
29747    agree push make nation either pity pakistan in...
29748                                                jesus
29749                 downvote karna tha par upvote hogaya
Name: comment, Length: 29152, dtype: object

In [17]:
# do train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, 
                                                    random_state=42, stratify=y)

print("The number of rows in train data are ",X_train.shape[0])
print("The number of rows in test data are ",X_test.shape[0])

The number of rows in train data are  23321
The number of rows in test data are  5831


In [18]:
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# make a function to perform experimentation

def do_experimentation(oversampler):
    n_gram = (1,3)
    max_features = 9000
    vect = TfidfVectorizer(max_features=max_features, ngram_range=n_gram)
    clf = RandomForestClassifier(random_state=42,n_jobs=-1)
    model_pipe = Pipeline(steps=[
        ("vec",vect),
        ("sample",oversampler),
        ("clf",clf)
    ])

    # fit the pipeline on training data
    model_pipe.fit(X_train,y_train)
    # calculate predictions
    y_pred = model_pipe.predict(X_test)

    with mlflow.start_run(run_name=f"{oversampler.__class__.__name__}") as run:
        # accuracy score
        accuracy = accuracy_score(y_test,y_pred)
        mlflow.log_metric("accuracy",accuracy)
    
        # classification report
        report = classification_report(y_test,y_pred,output_dict=True)
        # log classification report
        for label, metrics in report.items():
            if isinstance(metrics,dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}",value)
    
        # log the oversampling technique
        mlflow.log_param("over sampler", oversampler)

    return "OK"

In [21]:
# set the tracking uri

mlflow.set_tracking_uri("https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow")

In [22]:
# initialize dagshub

dagshub.init(repo_owner='himanshu1703', repo_name='reddit-sentiment-analysis', mlflow=True)

In [23]:
# set experiment name

mlflow.set_experiment("Ex-4: Select the OverSampling Technique")

2024/10/24 23:01:00 INFO mlflow.tracking.fluent: Experiment with name 'Ex-4: Select the OverSampling Technique' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/248b20f8802c412a8754e365cb65db8a', creation_time=1729791090636, experiment_id='6', last_update_time=1729791090636, lifecycle_stage='active', name='Ex-4: Select the OverSampling Technique', tags={}>

In [25]:
# run the experiment
params = {"random_state":42, "n_jobs":-1}
oversampling_techniques = [SMOTE(**params), 
                           BorderlineSMOTE(**params), 
                           SVMSMOTE(**params), 
                           ADASYN(**params)]

for count, oversampler in enumerate(oversampling_techniques):
    print(f"Run Number {count+1}, STATUS: {do_experimentation(oversampler)}")

2024/10/24 23:08:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run SMOTE at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6/runs/d91d519712f54bcd8edffa727b3df24d.
2024/10/24 23:08:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6.


Run Number 1, STATUS: OK


2024/10/24 23:09:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run BorderlineSMOTE at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6/runs/bce611885591471d9967af53e1722ba3.
2024/10/24 23:09:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6.


Run Number 2, STATUS: OK


2024/10/24 23:19:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVMSMOTE at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6/runs/c6bd2aff4d1e4216ba315a3ec0c459b6.
2024/10/24 23:19:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6.


Run Number 3, STATUS: OK


2024/10/24 23:23:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run ADASYN at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6/runs/e178aef288b0405598304f9a4392c5f8.
2024/10/24 23:23:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/himanshu1703/reddit-sentiment-analysis.mlflow/#/experiments/6.


Run Number 4, STATUS: OK


**From the experiments `SMOTE` Oversampling technique is giving good results**