In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import opinion_lexicon
import nltk

In [2]:
# Read both competition files in one pass: train.csv holds the reviews,
# test.csv the Ids that need a predicted Score.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

train.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score
0,914403,B0009W5KHM,AV6QDP8Q0ONK4,2,2,1341014400,GOOD FUN FILM,While most straight to DVD films are not worth...,5.0
1,354887,6303079709,A2I8RXJN80A2D2,0,0,1168819200,Movie Review,"I have wanted this one for sometime, also. I ...",5.0
2,1407653,B004H0M2XC,A3FHV3RV8Z12E6,0,0,1386201600,When is it a good time to Consent?,Actually this was a pretty darn good indie fil...,4.0
3,1377458,B003ZJ9536,A12VLTA3ZHVPUY,1,1,1348704000,TRUTH,Episodes 37 to 72 of the series press on in a ...,5.0
4,475323,630574453X,A13NM1PES9OXVN,2,3,970012800,Intelligent and bittersweet -- stays with you,"I was really impressed with this movie, but wa...",3.0


In [3]:
# Show which columns test.csv provides.
print(test.columns)

Index(['Id', 'Score'], dtype='object')


In [4]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import os

# Make the VADER lexicon available to NLTK (quiet=True is passed to the download call),
# then build one analyzer at module level; positive_negative_ratio reads it as `sia`.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

def positive_negative_ratio(text, analyzer=None):
    """Return the positive share of the polar sentiment found in ``text``.

    The text is scored with ``polarity_scores`` and the result is
    ``pos / (pos + neg)``. For non-negative ``pos``/``neg`` scores this lies
    between 0.0 (only negative sentiment) and 1.0 (only positive sentiment).

    Inputs that carry no sentiment signal map to the midpoint 0.5. Returning
    0.0 for them would make them indistinguishable from entirely negative
    text. NOTE: a 'train_with_ratios.csv' cache written with the previous
    0.0 fallback must be regenerated (and any model fitted on it retrained)
    for this to take effect.

    Args:
        text: Review text. Anything that is not a ``str`` (e.g. a missing
            value) is treated as neutral.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with ``'pos'`` and ``'neg'`` keys. Defaults to
            the module-level ``sia``.

    Returns:
        float: the ratio described above, or 0.5 when the input is not a
        string or both the positive and negative scores are zero.
    """
    neutral = 0.5  # midpoint of the 0..1 ratio scale

    if not isinstance(text, str):
        return neutral

    scorer = sia if analyzer is None else analyzer
    scores = scorer.polarity_scores(text)
    positive = scores['pos']
    negative = scores['neg']

    polar_total = positive + negative
    if polar_total == 0:
        # no polar words at all: avoid 0/0 and report neutral
        return neutral

    return positive / polar_total

cached_filename = 'train_with_ratios.csv'

# Scoring every review is slow, so the derived feature columns are written to
# disk the first time and simply reloaded on later runs.
if not os.path.exists(cached_filename):
    train['PosNegRatio'] = train['Text'].apply(positive_negative_ratio)
    # the 1e-5 offset keeps the division defined when the denominator is 0
    train['HelpfulnessRatio'] = train['HelpfulnessNumerator'].div(
        train['HelpfulnessDenominator'].add(1e-5)
    )

    # only the Id, the two engineered features and the label are persisted
    columns_to_save = ['Id', 'PosNegRatio', 'HelpfulnessRatio', 'Score']
    train[columns_to_save].to_csv(cached_filename, index=False)
    print(f"Calculated and saved data to: {cached_filename}")
else:
    # note: this replaces `train` with the slimmed-down cached frame
    train = pd.read_csv(cached_filename)
    print("Loaded cached data from:", cached_filename)

Loaded cached data from: train_with_ratios.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # to save models

# Work from the cached feature table; rows whose Id is listed in `test`
# are held out of the training set.
train = pd.read_csv('train_with_ratios.csv')
train_set = train.loc[~train['Id'].isin(test['Id'])]

In [6]:
# Count how many training rows fall under each Score, then derive
# inverse-frequency weights (total rows / rows in class) as candidate class weights.
score_distribution = (
    train_set
    .groupby('Score')
    .size()
    .reset_index(name='Count')
)

print("Data Distribution of Scores:")
print(score_distribution)

counts = dict(zip(score_distribution['Score'], score_distribution['Count']))
total_samples = score_distribution['Count'].sum()
class_weights = dict(
    (label, total_samples / n_rows) for label, n_rows in counts.items()
)

print("Class Weights:", class_weights)

Data Distribution of Scores:
   Score   Count
0    1.0   91190
1    2.0   89678
2    3.0  176082
3    4.0  335228
4    5.0  793163
Class Weights: {1.0: 16.28841978287093, 2.0: 16.563047793215727, 3.0: 8.43550732045297, 4.0: 4.430838116147815, 5.0: 1.8726806469792463}


In [None]:
import json

# Model inputs: the two engineered ratios; target: Score cast to int so it is
# treated as a class label.
X = train_set.loc[:, ['PosNegRatio', 'HelpfulnessRatio']]
y = train_set['Score'].astype(int)

# 90% of the rows for fitting, the remainder for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.9, random_state=42
)

# Candidate hyper-parameter values sampled by the randomized search.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=[None, 'balanced'],
)

def print_progress_search(cv_results):
    """Print one summary line per candidate in a search's ``cv_results``.

    Args:
        cv_results: Mapping with a ``'params'`` sequence and a parallel
            ``'mean_test_score'`` sequence (e.g. ``cv_results_`` of a fitted
            search object).

    Each line shows the 1-based candidate number, its parameter dict and its
    mean test score formatted to four decimals.
    """
    for number, params in enumerate(cv_results['params'], start=1):
        mean_score = cv_results['mean_test_score'][number - 1]
        print(f"Model {number}: {params} => Accuracy: {mean_score:.4f}")

# Reuse a previously fitted search if one was saved; otherwise run the
# randomized search and persist both the fitted object and its best parameters.
model_filename = 'best_rf_model.joblib'
best_params_filename = 'best_params.json'

if os.path.exists(model_filename):
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files you created yourself.
    print("Loading cached model...")
    rf_random = joblib.load(model_filename)
else:
    # A plain branch instead of raising/catching FileNotFoundError: a genuine
    # FileNotFoundError from joblib.load now surfaces instead of silently
    # triggering a retrain.
    print("No cached model found. Training a new model...")
    rf = RandomForestClassifier(random_state=42)
    rf_random = RandomizedSearchCV(rf, param_grid, n_iter=10, cv=3, random_state=42, n_jobs=2, verbose=2)
    rf_random.fit(X_train, y_train)

    print_progress_search(rf_random.cv_results_)
    joblib.dump(rf_random, model_filename)  # save the fitted search object
    print(f"Model saved to {model_filename}.")

    best_params = rf_random.best_params_
    with open(best_params_filename, 'w') as f:
        json.dump(best_params, f, indent=4)
    # report only after the file has been closed
    print(f"Best parameters saved to {best_params_filename}.")

# Evaluate on the held-out validation split. A cached model is only a fair
# comparison if it was fitted on the same X_train split -- TODO confirm when
# reusing an old model file.
y_pred = rf_random.predict(X_val)

# model evaluation
print("validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nclassification Report:\n", classification_report(y_val, y_pred))
print("\nconfusion Matrix:\n", confusion_matrix(y_val, y_pred))

In [None]:
# Rows of the feature table whose Id is listed in `test` are the ones to predict.
test_set = train.loc[train['Id'].isin(test['Id'])]
X_test = test_set[['PosNegRatio', 'HelpfulnessRatio']]

predicted_scores = rf_random.predict(X_test)
# round, keep within the 1..5 score range, and store as float
rounded_scores = np.round(predicted_scores).clip(1, 5).astype(float)

# two columns, Id and Score, in that order
submission_df = test_set[['Id']].assign(Score=rounded_scores)

submission_df.to_csv('submission.csv', index=False)
print("saved submission!")