In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Fetch the NLTK English stop-word corpus that `stopwords.words('english')`
# reads in the text-cleaning cell. When the package is already installed
# locally, NLTK just reports it as up-to-date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Both CSVs share the same dialect and encoding; only the training file is
# capped (first 50,000 rows).
csv_options = dict(sep=',', quotechar='"', encoding='latin1')
train_df = pd.read_csv('foods_training.csv', nrows=50000, **csv_options)
test_df = pd.read_csv('foods_testing.csv', **csv_options)

In [3]:
# Step 1: Data Preprocessing

# Convert helpfulness column to a float
def process_helpfulness(helpfulness):
    """Convert a 'numerator/denominator' helpfulness value to a ratio.

    Parameters
    ----------
    helpfulness : str or number or None
        Normally a string such as '3/4'. Non-string values are tolerated so
        that missing cells (e.g. NaN) and already-converted numeric values do
        not raise.

    Returns
    -------
    float or int
        ``numerator / denominator`` rounded to 2 decimals for a well-formed
        string; 0 when the denominator is 0, when the string is malformed,
        or when the value is missing (None / NaN); the value itself when it
        is already a non-NaN number.
    """
    if isinstance(helpfulness, str):
        try:
            numerator, denominator = map(int, helpfulness.split('/'))
        except ValueError:
            # Not exactly two integer fields (e.g. 'abc', '1/2/3', '3/').
            return 0
        return round(numerator / denominator, 2) if denominator != 0 else 0

    # `x == x` is False only for NaN, so this keeps real numbers and drops NaN.
    if isinstance(helpfulness, (int, float)) and helpfulness == helpfulness:
        # Already numeric (e.g. the conversion was applied once before):
        # pass it through so applying the function twice is harmless.
        return helpfulness

    # None, NaN, or any other non-string value: treat as "no votes".
    return 0

In [4]:
# Replace the raw 'n/d' helpfulness strings with numeric ratios in both frames.
for frame in (train_df, test_df):
    frame['helpfulness'] = frame['helpfulness'].apply(process_helpfulness)

In [5]:
# Parse the 'time' column (interpreted as seconds since the epoch) into a
# 'datetime' column, then expose its year / month / day as separate columns.
# The same steps are applied to the training and the test frame.
for frame in (train_df, test_df):
    timestamps = pd.to_datetime(frame['time'], unit='s')
    frame['datetime'] = timestamps
    for part in ('year', 'month', 'day'):
        frame[part] = getattr(timestamps.dt, part)

In [6]:
# Concatenate 'summary' and 'text' into a new 'txt_reviews' column.
# Missing values are replaced with '' first: in pandas, NaN + str is NaN, so a
# review lacking only its summary (or only its body) would otherwise lose the
# other field too and be cleaned later from the literal string 'nan'.
for frame in (train_df, test_df):
    frame['txt_reviews'] = frame['summary'].fillna('') + ' ' + frame['text'].fillna('')

In [7]:
# Text Cleaning
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def review_cleaning(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. cast the input to ``str`` and replace every non-ASCII-letter
         character with a space;
      2. lower-case and split on whitespace;
      3. drop tokens found in ``stop_words``;
      4. stem the remaining tokens with the Porter stemmer ``ps``.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text))
    kept_stems = []
    for token in letters_only.lower().split():
        # Stop-word filtering happens on the raw token, before stemming.
        if token not in stop_words:
            kept_stems.append(ps.stem(token))
    return ' '.join(kept_stems)

In [8]:
# Run the text cleaner over the combined review text of both frames.
for frame in (train_df, test_df):
    frame['cleaned_reviews'] = frame['txt_reviews'].apply(review_cleaning)

In [9]:
# Step 2: Feature Extraction using TF-IDF
# Unigrams and bigrams, capped at 5000 features. The vectorizer is fitted on
# the training reviews only; the test reviews are transformed with that same
# fitted vectorizer.
tfidf_settings = {'max_features': 5000, 'ngram_range': (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_settings)

train_corpus = train_df['cleaned_reviews']
test_corpus = test_df['cleaned_reviews']

X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

In [10]:
# Target variable: the 'score_level' column is used as-is.
# No label encoding is applied here; the column values are handed to the
# estimators unchanged.
y_train = train_df['score_level']

In [11]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
# NOTE(review): no `stratify=` argument is passed, so the split is not
# stratified by class — consider whether that matters for the rarer labels.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Candidate 1: Decision Tree, scored by 5-fold cross-validated accuracy on the
# training split (the validation split is not touched here).
dt_model = DecisionTreeClassifier(random_state=42)
dt_cv_scores = cross_val_score(
    dt_model,
    X_train_split,
    y_train_split,
    scoring='accuracy',
    cv=5,
)

# Report mean and standard deviation across the folds.
dt_mean, dt_std = np.mean(dt_cv_scores), np.std(dt_cv_scores)
print(f"Decision Tree - Mean CV Accuracy: {dt_mean:.4f} ± {dt_std:.4f}")

Decision Tree - Mean CV Accuracy: 0.7812 ± 0.0022


In [13]:
from sklearn.linear_model import LogisticRegression

# Candidate 2: Logistic Regression (iteration cap raised to 1000), scored by
# 5-fold cross-validated accuracy on the training split.
lr_model = LogisticRegression(max_iter=1000)
lr_cv_scores = cross_val_score(
    lr_model,
    X_train_split,
    y_train_split,
    scoring='accuracy',
    cv=5,
)

# Report mean and standard deviation across the folds.
lr_mean, lr_std = np.mean(lr_cv_scores), np.std(lr_cv_scores)
print(f"Logistic Regression - Mean CV Accuracy: {lr_mean:.4f} ± {lr_std:.4f}")

Logistic Regression - Mean CV Accuracy: 0.8660 ± 0.0033


In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate 3: k-nearest neighbours with default settings, scored by 5-fold
# cross-validated accuracy on the training split.
knn_model = KNeighborsClassifier()
knn_cv_scores = cross_val_score(
    knn_model,
    X_train_split,
    y_train_split,
    scoring='accuracy',
    cv=5,
)

# Report mean and standard deviation across the folds.
knn_mean, knn_std = np.mean(knn_cv_scores), np.std(knn_cv_scores)
print(f"KNN - Mean CV Accuracy: {knn_mean:.4f} ± {knn_std:.4f}")

KNN - Mean CV Accuracy: 0.4227 ± 0.2569


In [15]:
# Choose the best model: Logistic Regression had the highest mean CV accuracy
# of the three candidates compared above (0.8660, versus 0.7812 for the
# Decision Tree and 0.4227 for KNN).
# Fit it on the training split so it can be scored on the held-out validation split.
best_model = LogisticRegression(random_state=42, max_iter=1000)
best_model.fit(X_train_split, y_train_split)

In [16]:
# Score the selected model on the held-out validation split.
y_val_pred = best_model.predict(X_val_split)

# Compute every metric first, then print them in a fixed order.
val_accuracy = accuracy_score(y_val_split, y_val_pred)
val_report = classification_report(y_val_split, y_val_pred)
val_confusion = confusion_matrix(y_val_split, y_val_pred)

print("Validation Set Metrics:")
print(f"Accuracy: {val_accuracy}")
print("Classification Report:")
print(val_report)
print("Confusion Matrix:")
print(val_confusion)

Validation Set Metrics:
Accuracy: 0.8679
Classification Report:
              precision    recall  f1-score   support

        high       0.89      0.98      0.93      7839
         low       0.75      0.66      0.70      1379
      medium       0.54      0.14      0.22       782

    accuracy                           0.87     10000
   macro avg       0.73      0.59      0.62     10000
weighted avg       0.85      0.87      0.85     10000

Confusion Matrix:
[[7659  126   54]
 [ 429  912   38]
 [ 501  173  108]]


In [17]:
# Train the final model on the full training data.
# This calls fit again on the same `best_model` object, now with X_train / y_train
# (the full set that the train/validation split was drawn from).
best_model.fit(X_train, y_train)

In [18]:
# Make predictions on the test data.
# X_test was produced by the TF-IDF vectorizer that was fitted on the training reviews.
y_test_pred = best_model.predict(X_test)

In [19]:
# Write the test-set predictions to 'output_submissions.csv' with two columns,
# 'ID' (taken from the test frame) and 'score_level' (the predicted label).
# The DataFrame index is not written to the file.
submission_columns = {
    'ID': test_df['ID'],
    'score_level': y_test_pred,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('output_submissions.csv', index=False)

In [20]:
# Display the submission frame as the cell output (ID and predicted score_level).
submission_df

Unnamed: 0,ID,score_level
0,0,high
1,20,high
2,49,low
3,50,high
4,78,high
...,...,...
4995,55839,high
4996,55848,high
4997,55851,high
4998,55894,high
