In [20]:
import pandas as pd

# Load the dataset
file_path = "/content/reply_classification_dataset - reply_classification_dataset.csv.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit() # Exit if the file is not found

# Display basic information and the first few rows
print("Initial Data Info:")
df.info()
display(df.head())

# Handle missing values (if any) - in this dataset, there are no missing values
# Dropping rows with missing values is not necessary for this dataset based on df.info() output, but kept as a general practice.
initial_rows = df.shape[0]
df.dropna(inplace=True)
rows_after_dropna = df.shape[0]
if initial_rows > rows_after_dropna:
    print(f"Dropped {initial_rows - rows_after_dropna} rows with missing values.")


# Clean the text data by converting to lowercase and removing leading/trailing whitespace
df['reply'] = df['reply'].str.lower().str.strip()
df['label'] = df['label'].str.lower().str.strip()

# Display basic information and the first few rows after preprocessing
print("\nData Info After Preprocessing:")
df.info()
display(df.head())

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB


Unnamed: 0,reply,label
0,Can we discuss pricing??,NEUTRAL
1,"Im excited to explore this further, plz send c...",POSITIVE
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive



Data Info After Preprocessing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB


Unnamed: 0,reply,label
0,can we discuss pricing??,neutral
1,"im excited to explore this further, plz send c...",positive
2,we not looking for new solutions.,negative
3,could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [21]:
# Define the target variable using the preprocessed data
y = df['label']

# Display the distribution of the target variable
print("Distribution of the target variable:")
display(y.value_counts())

Distribution of the target variable:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positive,710
negative,710
neutral,709


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Text Vectorization
# Adjusted max_features for potentially better performance and reduced dimensionality
tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X = tfidf_vectorizer.fit_transform(df['reply']).toarray()

# Split data into training and testing sets
# Stratify is important for maintaining the proportion of each class in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000) # Increased max_iter for convergence
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# Use weighted average for multiclass classification
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9883
F1 Score (weighted): 0.9883

Classification Report:
              precision    recall  f1-score   support

    negative       0.99      0.99      0.99       142
     neutral       0.99      0.98      0.99       142
    positive       0.99      0.99      0.99       142

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426

