In [13]:
import dagshub
dagshub.init(repo_owner='rohanjoshi2005', repo_name='my-first-repo', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=1837e8d2-0f5b-43c6-b290-06b1690a6a8e&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=61fadb3153b7d6addf8c7c168df36df45eb3875b5f0727901f7f5afe1294e8b9




In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("Reddit_Data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
df.dropna(inplace=True)

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
df=df[~(df["clean_comment"].str.strip()== '')]

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [7]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# preprocessing function

def cleaning(text):
    # converting to lowercase
    text=text.lower()
    
    # remove url's
    text = re.sub(r'http\S+|www\S+|https\S+','', text, flags=re.MULTILINE)

    # Remove user mentions and hashtags
    text = re.sub(r'@\w+|\#', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))

    # tokens = text.split()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer=WordNetLemmatizer()
    tokens=[lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)


In [9]:
# Apply cleaning to your dataset
df["clean_comment"]=df["clean_comment"].apply(cleaning)

In [12]:
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Vectorize the comments using bag of words
vectorizer=CountVectorizer(max_features=10000)

In [15]:
X=vectorizer.fit_transform(df["clean_comment"]).toarray()
y=df["category"]

In [16]:
X.shape

(36793, 10000)

In [17]:
y

0        1
1        1
2       -1
3        0
4        1
        ..
37244    0
37245    1
37246    0
37247    1
37248    0
Name: category, Length: 36793, dtype: int64

In [19]:
mlflow.set_experiment("Baseline")

<Experiment: artifact_location='mlflow-artifacts:/878722e03282489982d5c91901a7b631', creation_time=1751463119587, experiment_id='2', last_update_time=1751463119587, lifecycle_stage='active', name='Baseline', tags={}>

In [20]:
# split the data into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [22]:
with mlflow.start_run() as run:
  # Log a description for a run
  mlflow.set_tag("mlflow.runName","RandomForest_Baseline_TrainTestSplit")
  mlflow.set_tag("experiment_type","baseline")
  mlflow.set_tag("model_type","RandomForestClassifier")

  # Add a description
  mlflow.set_tag("description","Baseline RandomForest model for sentiment analysis using bag of words (BOW)")

  # log parameters for vectorizer
  mlflow.log_param("vectorizer_type","CountVectorizer")
  mlflow.log_param("Vectorizer_max_features",vectorizer.max_features)

  # Log Random Forest parameters
  n_estimators=200
  max_depth=15
  mlflow.log_param("n_estimators",n_estimators)
  mlflow.log_param("max_depth",max_depth)

  # Create and train
  model=RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,random_state=42)
  model.fit(X_train,y_train)

  # Make predictions on the test set
  y_pred=model.predict(X_test)

  # Log Metrics for each class and accuracy
  accuracy=accuracy_score(y_test,y_pred)
  mlflow.log_metric("accuracy",accuracy)

  classification_rep=classification_report(y_test,y_pred,output_dict=True)

  for label,metrics in classification_rep.items():
    if isinstance(metrics,dict):
      for metric,value in metrics.items():
        mlflow.log_metric(f"{label}_{metric}",value)

  # Confusion matrix plot
  conf_matrix=confusion_matrix(y_test,y_pred)
  plt.figure(figsize=(8,6))
  sns.heatmap(conf_matrix,annot=True,fmt="d",cmap="Blues")
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.title("Confusion matrix")


  # Save and log the confusion matrix plot
  plt.savefig("confusion_matrix.png")
  mlflow.log_artifact("confusion_matrix.png")

  # Log the model
  mlflow.sklearn.log_model(model,"model")

  # Log the random forest model
  mlflow.sklearn.log_model(model,"Random_Forest_Model")

  # optionally log the dataset
  df.to_csv("dataset.csv",index=False)
  mlflow.log_artifact("dataset.csv")

# display accuracy
print(f"Accuracy : {accuracy}")


🏃 View run RandomForest_Baseline_TrainTestSplit at: https://dagshub.com/rohanjoshi2005/my-first-repo.mlflow/#/experiments/2/runs/5c7f6770afff4613957f0fa9bb822571
🧪 View experiment at: https://dagshub.com/rohanjoshi2005/my-first-repo.mlflow/#/experiments/2


MemoryError: Unable to allocate 1.10 GiB for an array with shape (29434, 10000) and data type float32