In [1]:
%conda install joblib nltk xgboost textblob lightgbm

Channels:
 - conda-forge
 - nvidia
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 24.7.1
    latest version: 24.9.2

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs:
    - joblib
    - lightgbm
    - nltk
    - textblob
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libboost-1.84.0            |       hb8260a3_6         2.7 MB  conda-forge
    liblightgbm-4.5.0          |   cpu_h155599f_3         2.8 MB  conda-forge
    lightgbm-4.5.0             |         cpu_py_3          81 KB  conda-forge
    ocl-icd-2.3.2              |       hd590300_1         133 KB  conda-forge
    ------------------------------------------------------------
                   

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
import warnings
import multiprocessing

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from joblib import Parallel, delayed

import lightgbm as lgb  # LightGBM for model training

warnings.filterwarnings('ignore')

In [3]:

# Download the NLTK resources used later: the English stop-word list and WordNet
nltk.download('stopwords')
nltk.download('wordnet')

# Worker count for the parallel steps: leave 4 cores free, but never go below 1.
# On a machine with 4 or fewer cores the plain subtraction gives zero or a negative
# number, which multiprocessing.Pool and np.array_split both reject.
num_cores = max(1, multiprocessing.cpu_count() - 4)
print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 188


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# 1. Load Data
print("Loading data...")
# train.csv is split by Id further down into labelled (training) rows and
# rows to predict, so it is read in full here.
train_df = pd.read_csv('train.csv')
# Only the 'Id' column of test.csv is kept; it names the rows that need a prediction.
test_ids = pd.read_csv('test.csv')['Id']

print("-")

Loading data...
-


In [5]:
# 2. Data Preprocessing
print("Preprocessing data...")

# Test rows: Id listed in test.csv AND no Score.
# Missing 'Summary' / 'Text' become empty strings, then both are joined
# (separated by a single space) into 'FullText'.
test_data = (
    train_df[train_df['Id'].isin(test_ids)]
    .loc[lambda frame: frame['Score'].isnull()]
    .assign(
        Summary=lambda frame: frame['Summary'].fillna(''),
        Text=lambda frame: frame['Text'].fillna(''),
        FullText=lambda frame: frame['Summary'] + ' ' + frame['Text'],
    )
)

# Training rows: Id not listed in test.csv AND a Score is present.
# Same text handling as for the test rows.
train_data = (
    train_df[~train_df['Id'].isin(test_ids)]
    .loc[lambda frame: frame['Score'].notnull()]
    .assign(
        Summary=lambda frame: frame['Summary'].fillna(''),
        Text=lambda frame: frame['Text'].fillna(''),
        FullText=lambda frame: frame['Summary'] + ' ' + frame['Text'],
    )
)

print("-")

Preprocessing data...
-


In [6]:
# 3. Feature Engineering

# 3.1 Text Cleaning Function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one review string for vectorization.

    Steps, in order: drop every character that is not an ASCII letter or
    whitespace, lowercase, split on whitespace, discard tokens found in
    ``stop_words``, lemmatize the remaining tokens, and join them with
    single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

# Apply text cleaning in parallel
print("Cleaning text...")

def parallel_apply(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into contiguous row chunks.
    func : callable
        Takes a DataFrame chunk and returns a DataFrame. It is sent to the
        worker processes via ``multiprocessing.Pool.map``.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original chunk order.
    """
    # At least one worker, and never more workers than rows (avoids empty chunks).
    n_workers = max(1, min(num_cores, len(df)))

    # Split row positions instead of handing the DataFrame itself to
    # np.array_split, which goes through the deprecated DataFrame.swapaxes.
    position_groups = np.array_split(np.arange(len(df)), n_workers)
    chunks = [
        df.iloc[group[0]:group[-1] + 1] if len(group) else df.iloc[:0]
        for group in position_groups
    ]

    # The context manager shuts the pool down even when func raises in a worker,
    # so worker processes are not leaked.
    with multiprocessing.Pool(n_workers) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

def clean_text_df(df):
    """Add a 'CleanText' column to ``df`` and return ``df``.

    Each 'CleanText' value is ``clean_text`` applied to the row's 'FullText'.
    The frame passed in is modified in place.
    """
    cleaned = df['FullText'].apply(clean_text)
    df['CleanText'] = cleaned
    return df

train_data = parallel_apply(train_data, clean_text_df)
test_data = parallel_apply(test_data, clean_text_df)

# Replace empty 'CleanText' entries with the placeholder token 'empty'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
train_data['CleanText'] = train_data['CleanText'].replace('', 'empty')
test_data['CleanText'] = test_data['CleanText'].replace('', 'empty')

print("-")

Cleaning text...
-


In [7]:
# 4. Feature Engineering

# 4.1 TF-IDF Vectorization with Bigrams and Trigrams
from scipy.sparse import vstack

# Unigrams through trigrams, capped at 100,000 features; terms appearing in fewer
# than 5 documents or in more than 90% of documents are ignored.
# `verbose` is not one of TfidfVectorizer's constructor parameters, so it is not passed.
tfidf = TfidfVectorizer(max_features=100000, ngram_range=(1, 3), min_df=5, max_df=0.9)

# The vocabulary and IDF weights are learned from the training text only.
print("Fitting TF-IDF vectorizer on training data...")
tfidf.fit(train_data['CleanText'])

print("-")

Fitting TF-IDF vectorizer on training data...
-


In [8]:
# Function to transform a chunk of data
def transform_chunk(chunk):
    """Vectorize ``chunk`` with the module-level, already-fitted ``tfidf`` and return the result."""
    chunk_matrix = tfidf.transform(chunk)
    return chunk_matrix

# Transform training data in parallel
print("Transforming training data with TF-IDF vectorizer in parallel...")
# Cut the cleaned training text into num_cores pieces; each piece is
# vectorized by its own joblib job via transform_chunk.
train_chunks = np.array_split(train_data['CleanText'], num_cores)
X_train_tfidf_chunks = Parallel(n_jobs=num_cores)(
    delayed(transform_chunk)(chunk) for chunk in train_chunks
)
# Stack the per-chunk matrices row-wise into a single training matrix.
X_train_tfidf = vstack(X_train_tfidf_chunks)

# Transform test data in parallel
print("Transforming test data with TF-IDF vectorizer in parallel...")
# Same procedure as for the training text, using the same fitted vectorizer.
test_chunks = np.array_split(test_data['CleanText'], num_cores)
X_test_tfidf_chunks = Parallel(n_jobs=num_cores)(
    delayed(transform_chunk)(chunk) for chunk in test_chunks
)
X_test_tfidf = vstack(X_test_tfidf_chunks)

print("-")

Transforming training data with TF-IDF vectorizer in parallel...
Transforming test data with TF-IDF vectorizer in parallel...
-


In [9]:
# 4.2 Additional Features (HelpfulnessRatio, Date Features, etc.)
print("Computing additional features...")

def compute_features(df):
    """Add helpfulness, date and length columns to ``df`` and return ``df``.

    Columns added (the frame is modified in place):
      * HelpfulnessRatio - HelpfulnessNumerator / HelpfulnessDenominator,
        or 0 where the denominator is 0.
      * ReviewTime       - 'Time' parsed as seconds since the Unix epoch.
      * ReviewYear, ReviewMonth, ReviewDayOfWeek - parts of ReviewTime.
      * ReviewLength     - number of whitespace-separated tokens in 'CleanText'.
    """
    def _token_count(text):
        return len(text.split())

    votes_total = df['HelpfulnessDenominator']
    votes_helpful = df['HelpfulnessNumerator']
    # np.where picks the literal 0 wherever the denominator is zero.
    df['HelpfulnessRatio'] = np.where(votes_total == 0, 0, votes_helpful / votes_total)

    review_time = pd.to_datetime(df['Time'], unit='s')
    df['ReviewTime'] = review_time
    df['ReviewYear'] = review_time.dt.year
    df['ReviewMonth'] = review_time.dt.month
    df['ReviewDayOfWeek'] = review_time.dt.dayofweek

    df['ReviewLength'] = df['CleanText'].apply(_token_count)
    return df

# Run compute_features over both frames chunk-by-chunk in worker processes;
# the returned frames carry the new feature columns.
train_data = parallel_apply(train_data, compute_features)
test_data = parallel_apply(test_data, compute_features)

print("-")

Computing additional features...
-


In [10]:
# Encode 'UserId' and 'ProductId' using frequency encoding
print("Encoding 'UserId' and 'ProductId'...")

def frequency_encoding(column, df_train, df_test):
    """Add a '<column>_FreqEnc' column to both frames.

    The encoding is the number of times each value of ``column`` occurs in
    ``df_train``. Values in ``df_test`` that never occur in ``df_train`` get
    the smallest training frequency, or 0 when the training column has no
    non-null values at all.

    Parameters
    ----------
    column : str
        Name of the categorical column to encode.
    df_train, df_test : pandas.DataFrame
        Frames containing ``column``; both are modified in place.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)`` with the new column added.
    """
    encoded_col = column + '_FreqEnc'
    counts = df_train[column].value_counts().to_dict()
    # default=0 keeps an empty training column from raising ValueError in min().
    fallback = min(counts.values(), default=0)

    df_train[encoded_col] = df_train[column].map(counts)
    # Fill NaNs in test data (values unseen in training) with the fallback frequency
    df_test[encoded_col] = df_test[column].map(counts).fillna(fallback)
    return df_train, df_test

# Frequencies are counted on the training frame and then applied to both frames,
# adding 'UserId_FreqEnc' and 'ProductId_FreqEnc'.
train_data, test_data = frequency_encoding('UserId', train_data, test_data)
train_data, test_data = frequency_encoding('ProductId', train_data, test_data)

print("-")

Encoding 'UserId' and 'ProductId'...
-


In [11]:
# Sentiment Analysis using TextBlob
print("Performing sentiment analysis...")
from textblob import TextBlob

def sentiment_analysis(df):
    """Add a 'Sentiment' column to ``df`` and return ``df``.

    Each value is the TextBlob sentiment polarity of the row's 'CleanText'.
    The frame passed in is modified in place.
    """
    def _polarity(text):
        return TextBlob(text).sentiment.polarity

    df['Sentiment'] = df['CleanText'].apply(_polarity)
    return df

# Score both frames chunk-by-chunk in worker processes; the returned frames
# carry the new 'Sentiment' column.
train_data = parallel_apply(train_data, sentiment_analysis)
test_data = parallel_apply(test_data, sentiment_analysis)

print("-")

Performing sentiment analysis...
-


In [12]:
# 4.3 Finalize Feature Sets
print("Finalizing feature sets...")
# Non-text feature columns produced by compute_features, frequency_encoding and
# sentiment_analysis. This list selects (and orders) the columns that get scaled.
numerical_features = ['HelpfulnessRatio', 'ReviewYear', 'ReviewMonth', 'ReviewDayOfWeek',
                      'ReviewLength', 'UserId_FreqEnc', 'ProductId_FreqEnc', 'Sentiment']

print("-")

Finalizing feature sets...
-


In [13]:
# Scale numerical features
# The scaler is fitted on the training rows only and then reused for the test
# rows, so both sets share the same per-column scaling.
scaler = MinMaxScaler()
train_num = scaler.fit_transform(train_data[numerical_features])
test_num = scaler.transform(test_data[numerical_features])

print("-")

-


In [14]:
# Reduce dimensionality with TruncatedSVD
print("Reducing dimensionality...")
# Project the TF-IDF matrices onto 200 components. The decomposition is fitted
# on the training matrix and then applied unchanged to the test matrix.
svd = TruncatedSVD(n_components=200, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

print("-")

Reducing dimensionality...
-


In [15]:
# Combine all features
print("Combining all features...")
# Column layout: the scaled numerical features first, then the SVD components.
X_train = np.hstack([train_num, X_train_svd])
X_test = np.hstack([test_num, X_test_svd])

# Target labels, cast to integers for classification.
y_train = train_data['Score'].astype(int)

print("-")

Combining all features...
-


In [16]:
# 5. Model Training

# 5.1 Prepare LightGBM Dataset
print("Preparing LightGBM dataset...")
# NOTE(review): lgb_train is not referenced by any later cell visible in this
# notebook (training goes through LGBMClassifier.fit on X_train / y_train) --
# confirm whether this Dataset is still needed.
lgb_train = lgb.Dataset(X_train, y_train)

print("-")

Preparing LightGBM dataset...
-


In [19]:
print("-")

-


In [28]:
from sklearn.model_selection import train_test_split

# Sample a subset of the data for hyperparameter tuning
print("Sampling data for hyperparameter tuning...")
# test_size=0.9 discards 90% of the rows, so the search runs on a 10% sample
# stratified by the target.
X_sample, _, y_sample, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=42, stratify=y_train)

# Simplify parameter grid
# 2 x 2 x 1 = 4 possible combinations; the search below draws 2 of them.
param_grid = {
    'num_leaves': [31, 63],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [500],
}

# NOTE(review): n_jobs=-1 is set both here and on RandomizedSearchCV below, so
# parallel fits may each request all cores -- confirm this is intended.
lgb_estimator = lgb.LGBMClassifier(objective='multiclass', n_jobs=-1, random_state=42)

# Use RandomizedSearchCV for hyperparameter tuning
print("Initializing RandomizedSearchCV...")
random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_grid,
    n_iter=2,  # Reduced due to simplified grid
    scoring='accuracy',
    cv=2,  # Reduced folds
    verbose=3,
    random_state=42,
    n_jobs=-1
)
print("Fitting RandomizedSearchCV...")
random_search.fit(X_sample, y_sample)

print("Best parameters found:")
print(random_search.best_params_)

# Train the final model on the full dataset
print("Training final model on full dataset...")
best_params = random_search.best_params_

# A fresh classifier is built with the winning parameters and fitted on all
# training rows (not just the 10% tuning sample).
final_model = lgb.LGBMClassifier(
    objective='multiclass',
    n_jobs=-1,
    random_state=42,
    **best_params
)

# NOTE(review): no eval_set is passed, so confirm whether eval_metric has any
# effect in this call.
final_model.fit(
    X_train, y_train,
    eval_metric='multi_logloss',
)


Sampling data for hyperparameter tuning...
Initializing RandomizedSearchCV...
Fitting RandomizedSearchCV...
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52296
[LightGBM] [Info] Number of data points in the train set: 148534, number of used features: 208
[LightGBM] [Info] Start training from score -2.790454
[LightGBM] [Info] Start training from score -2.807151
[LightGBM] [Info] Start training from score -2.132461
[LightGBM] [Info] Start training from score -1.488582
[LightGBM] [Info] Start training from score -0.627374
Best parameters found:
{'num_leaves': 63, 'n_estimators': 500, 'learning_rate': 0.05}
Training final model on full dataset...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.130716 seconds.
You can set `force_col_wise=true` to remo

In [29]:
# 6. Prediction
print("Predicting on test data...")
# One predicted class label per row of X_test.
y_pred = final_model.predict(X_test)

print("-")

Predicting on test data...
-


In [30]:
# 7. Prepare Submission
print("Preparing submission file...")
# Pair each prediction with its Id. This relies on y_pred being in the same row
# order as test_data, from which X_test was built. Scores are written as floats.
submission = pd.DataFrame({'Id': test_data['Id'], 'Score': y_pred.astype(float)})
submission.to_csv('submission.csv', index=False)

print("Done!")

Preparing submission file...
Done!
