In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-preprocessed/preprocessed_data.csv
/kaggle/input/ratemeter/sample_submission.csv
/kaggle/input/ratemeter/train.csv
/kaggle/input/ratemeter/test.csv


In [2]:
!pip install nltk



In [3]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

#Download NLTK resources
import nltk
import subprocess
nltk.download('punkt')
nltk.download('stopwords')

try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Load the dataset
# Assuming you have a CSV file with 'text' and 'rating' columns
# dataset = pd.read_csv(r"/kaggle/input/ratemeter/train.csv")
# dataset = dataset.iloc[100000:400000].copy()

# Function for text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply preprocessing to the 'text' column
# dataset['preprocessed_review_text'] = dataset['review_text'].apply(preprocess_text)

# Save DataFrame to CSV
# dataset.to_csv('preprocessed_data.csv', index=False)

# # Display the preprocessed data
dataset = pd.read_csv(r"/kaggle/input/dataset-preprocessed/preprocessed_data.csv")
# dataset = dataset.iloc[:400000].copy()
# dataset = dataset.sample(frac=1).reset_index(drop=True)
# print(dataset[['review_text', 'preprocessed_review_text', 'rating']].head())
dataset.info()



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   user_id                   400000 non-null  object
 1   book_id                   400000 non-null  int64 
 2   review_id                 400000 non-null  object
 3   review_text               400000 non-null  object
 4   date_added                400000 non-null  object
 5   date_updated              400000 non-null  object
 6   read_at                   359053 non-null  object
 7   started_at                277877 non-null  object
 8   n_votes                   400000 non-null  int64 
 9   n_comments                400000 non-null  int64 
 10  rating                    400000 non-null  int64 
 11  preprocessed_review_text  399883 non-null  object
dtypes: int64(4), object(8)
memory usage: 36.6+ MB


In [4]:
# # Feature Extraction using TF-IDF
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# vectorizer_tfidf = TfidfVectorizer()
columns_to_exclude = ['rating', 'review_id','date_added','date_updated','read_at','started_at']
dataset['combined_text'] = dataset.drop(columns_to_exclude, axis=1).astype('str').apply(lambda x: ' '.join(x), axis=1)

# dataset['combined_text'] = dataset.drop('rating', axis=1).astype('str').apply(lambda x: ' '.join(x), axis=1)
# # dataset['combined_text'] = dataset.astype('str').apply(lambda x: ' '.join(x), axis=1)
# X_tfidf = vectorizer_tfidf.fit_transform(dataset['combined_text'].values.astype('str'))
# print(X_tfidf.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# Feature Extraction using TF-IDF
vectorizer_tfidf = TfidfVectorizer(max_features=5000)  # Limiting vocabulary size for memory efficiency

# Fit the vectorizer on the entire dataset
vectorizer_tfidf.fit(dataset['combined_text'].astype('str'))

# Process data in smaller batches to reduce memory consumption
batch_size = 1000
num_batches = len(dataset) // batch_size + (1 if len(dataset) % batch_size != 0 else 0)

# Initialize an empty list to store sparse matrices
X_tfidf_batches = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(dataset))
    
    # Use sparse matrix representation
    X_tfidf_batch = vectorizer_tfidf.transform(dataset['combined_text'].iloc[start_idx:end_idx].astype('str'))
    
    # Append the batch to the list
    if X_tfidf_batch.shape[0] > 0:
        X_tfidf_batches.append(X_tfidf_batch)

# Concatenate the batches vertically if there are any
if X_tfidf_batches:
    X_tfidf_sparse = scipy.sparse.vstack(X_tfidf_batches)
else:
    # Handle the case when the dataset is empty
    X_tfidf_sparse = scipy.sparse.csr_matrix((0, len(vectorizer_tfidf.get_feature_names_out())), dtype=float)

# Optionally, convert to a dense array for further processing or display
X_tfidf_dense = X_tfidf_sparse.toarray()

# Your code continues here...
# (e.g., model training, evaluation, etc.)




In [6]:
X_tfidf.shape

(400000, 638343)

In [7]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_sparse, dataset['rating'], test_size=0.2, random_state=42)

from sklearn.feature_selection import SelectFromModel

# Fit a logistic regression model for feature selection
model = LogisticRegression()
selector = SelectFromModel(estimator=model)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Random Forest Model
# rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
# rf_model.fit(X_train, y_train)
# rf_pred = rf_model.predict(X_test)

# XGBoost Model
# xgb_model = XGBClassifier(n_estimators=100, random_state=42)
# xgb_model.fit(X_train, y_train)
# xgb_pred = xgb_model.predict(X_test)

# Logistic Regression Model
log_model = LogisticRegression()

# param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100], 'penalty': ['l1', 'l2']}
# grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy')
# param_grid = {'C': [0.4], 'penalty': ['l2']}
# grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)
# best_model = grid_search.best_estimator_
# test_accuracy = best_model.score(X_test, y_test)
# print("Test Accuracy:", test_accuracy)
# log_pred = best_model.predict(X_test)
log_model.fit(X_train_selected, y_train)
log_pred = log_model.predict(X_test_selected)

# Stochastic Gradient Descent Model
# sgd_model = SGDClassifier()
# sgd_model.fit(X_train, y_train)
# sgd_predictions = sgd_model.predict(X_test)

# Linear SVC
# svm_model = LinearSVC()
# svm_model.fit(X_train, y_train)
# svm_predictions = svm_model.predict(X_test)

# Multinomial Naive Bayes Model
# nb_model = MultinomialNB()
# nb_model.fit(X_train, y_train)

# # Make predictions
# nb_predictions = nb_model.predict(X_test)


# Non Linear SVC
# Standardize the features (important for SVM)
# scaler = StandardScaler(with_mean=False)
# X_train_std = scaler.fit_transform(X_train)
# X_test_std = scaler.transform(X_test)
# # Train the SVM model
# svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
# svm_model.fit(X_train_std, y_train)

# # Make predictions
# svm_predictions = svm_model.predict(X_test_std)


# Ensemble Predictions
# rf_pred = rf_model.predict(X_test)
# xgb_pred = xgb_model.predict(X_test)
# log_pred = log_model.predict(X_test)

# # Combine predictions using majority voting
# ensemble_pred = (rf_pred + xgb_pred + log_pred) // 3  # You can also experiment with different combination strategies

print("F1 Score:", f1_score(y_test, log_pred, average='weighted'))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, log_pred))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


F1 Score: 0.5245196495416039

Confusion Matrix:
 [[  804   155   212   303   754   539]
 [  148   672   859   393   301   177]
 [  128   305  1935  2593  1257   282]
 [  140    95  1063  7092  7402  1075]
 [  121    59   229  3692 17409  6287]
 [  118    39    84   610  8080 14588]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_

In [10]:
#Now checking the model on test data
dataset_test=pd.read_csv(r"/kaggle/input/ratemeter/test.csv")
# data_test = dataset_test.iloc[:10000].copy()
dataset_test.head()

Unnamed: 0,user_id,book_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,de3a6a28c83cda006b415d45d64674c9,1618,d76ce5becf493e5c653610edb806ffc4,"I'm going to keep this review short, because I...",Tue Jun 09 10:37:48 -0700 2015,Wed Jun 10 13:43:38 -0700 2015,Wed Jun 10 14:08:39 -0700 2015,Tue Jun 09 00:00:00 -0700 2015,8,0
1,d9cfab35a87e32084b1817dfb0e98748,15776309,9c5c9aed79255a1a610dfc153ee90ad6,"You know, I was really stoked to see this come...",Mon Apr 22 09:19:40 -0700 2013,Wed Jul 10 14:34:27 -0700 2013,Wed Jul 10 14:34:27 -0700 2013,Tue Jul 09 00:00:00 -0700 2013,0,0
2,24935a870a46525a37be92775ab18f76,478927,ad26332459cda8f40581fef7a29b800c,This is one of those books where you know you ...,Sat Mar 14 12:16:55 -0700 2015,Sun Apr 05 11:25:00 -0700 2015,Wed Mar 18 00:00:00 -0700 2015,Sat Mar 14 00:00:00 -0700 2015,14,0
3,6ccb40eabdd0db7895aac00963256469,22628,5bb75768be7f2ddcd632336778b67f5a,The perks of being a wallflower \n What does a...,Sat Sep 12 08:43:30 -0700 2015,Sat Sep 12 09:19:29 -0700 2015,,,0,0
4,9f9d0f6e9a6a5797a252ef81abc9421c,13596809,3d7f76ea566b9ce0700772236094d936,"So, I wrote a review for this when I read it b...",Sun Jul 14 19:36:13 -0700 2013,Thu Jun 02 16:45:45 -0700 2016,Sat Mar 23 00:00:00 -0700 2013,,0,0


In [10]:
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270000 entries, 0 to 269999
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       270000 non-null  object
 1   book_id       270000 non-null  int64 
 2   review_id     270000 non-null  object
 3   review_text   270000 non-null  object
 4   date_added    270000 non-null  object
 5   date_updated  270000 non-null  object
 6   read_at       242459 non-null  object
 7   started_at    187740 non-null  object
 8   n_votes       270000 non-null  int64 
 9   n_comments    270000 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 20.6+ MB


In [None]:
# Apply preprocessing to the 'text' column
dataset_test['preprocessed_review_text'] = dataset_test['review_text'].apply(preprocess_text)

# Display the preprocessed data
print(dataset_test[['review_text', 'preprocessed_review_text']].head())

In [14]:
# columns_to_exclude = ['review_id','date_added','date_updated','read_at','started_at']
# dataset_test['combined_text'] = dataset_test.drop(columns_to_exclude, axis=1).astype('str').apply(lambda x: ' '.join(x), axis=1)
# dataset_test['combined_text'] = dataset_test.astype('str').apply(lambda x: ' '.join(x), axis=1)
# X_tfidf_test = vectorizer_tfidf.transform(dataset_test['combined_text'].values.astype('str'))
# print(X_tfidf_test.toarray())

columns_to_exclude = ['rating', 'review_id','date_added','date_updated','read_at','started_at']
dataset_test['combined_text'] = dataset.drop(columns_to_exclude, axis=1).astype('str').apply(lambda x: ' '.join(x), axis=1)

from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# Feature Extraction using TF-IDF
vectorizer_tfidf = TfidfVectorizer(max_features=5000)  # Limiting vocabulary size for memory efficiency

# Fit the vectorizer on the entire dataset
vectorizer_tfidf.fit(dataset_test['combined_text'].astype('str'))

# Process data in smaller batches to reduce memory consumption
batch_size = 1000
num_batches = len(dataset_test) // batch_size + (1 if len(dataset_test) % batch_size != 0 else 0)

# Initialize an empty list to store sparse matrices
X_tfidf_batches = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(dataset_test))
    
    # Use sparse matrix representation
    X_tfidf_batch = vectorizer_tfidf.transform(dataset_test['combined_text'].iloc[start_idx:end_idx].astype('str'))
    
    # Append the batch to the list
    if X_tfidf_batch.shape[0] > 0:
        X_tfidf_batches.append(X_tfidf_batch)

# Concatenate the batches vertically if there are any
if X_tfidf_batches:
    X_tfidf_sparse_test = scipy.sparse.vstack(X_tfidf_batches)
else:
    # Handle the case when the dataset is empty
    X_tfidf_sparse_test = scipy.sparse.csr_matrix((0, len(vectorizer_tfidf.get_feature_names_out())), dtype=float)

# Optionally, convert to a dense array for further processing or display
# X_tfidf_dense_test = X_tfidf_sparse.toarray()

[[0.02752735 0.         0.         ... 0.         0.         0.        ]
 [0.03281758 0.         0.         ... 0.         0.         0.        ]
 [0.0784773  0.         0.         ... 0.         0.         0.        ]
 ...
 [0.06800149 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.18446649 0.         0.         ... 0.         0.         0.        ]]


In [15]:
# Ensemble Predictions
# rf_pred_test = rf_model.predict(X_tfidf_test)
# xgb_pred_test = xgb_model.predict(X_tfidf_test)
log_pred_test = best_model.predict(X_tfidf_sparse_test)

# log_pred_test = rf_model.predict(X_tfidf_test)

# # Combine predictions using majority voting
# ensemble_pred = (rf_pred + xgb_pred + log_pred) // 3

In [16]:
output = pd.DataFrame({'review_id': dataset_test['review_id'].values,
                      'rating': log_pred_test})
output.to_csv('submission11.csv',index=False, header=True)