In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from datetime import datetime
import nltk
from nltk.corpus import stopwords


In [None]:
# Load the raw train/test splits; the encoding and category-mapping cells below build on these frames

# Both CSV files are read with pandas' default parsing options
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('sample_data/train.csv', 'sample_data/test.csv')
)

In [None]:
# One-hot encode the 'category' column of the training dataset and append
# one indicator column per category to the training frame
train_category_dummies = pd.get_dummies(train_df['category'], prefix='category')
train_df = train_df.join(train_category_dummies)

# One-hot encode the test dataset, then align its indicator columns to the training ones.
# get_dummies only creates columns for values it actually sees, so encoding the two
# frames independently can yield different column sets. Reindexing adds any indicator
# missing from the test split (filled with 0), drops indicators for categories the
# training split never contained, and keeps the column order identical.
test_category_dummies = (
    pd.get_dummies(test_df['category'], prefix='category')
    .reindex(columns=train_category_dummies.columns, fill_value=0)
    .astype(train_category_dummies.dtypes.to_dict())  # keep the dtype of newly added columns consistent
)
test_df = test_df.join(test_category_dummies)

In [None]:
# Cast the 'overall' star rating to an integer column
train_df['overall'] = train_df['overall'].astype(int)

# Binary target 'rating': 1 when 'overall' is strictly above 3, otherwise 0
is_above_three = train_df['overall'] > 3
train_df['rating'] = is_above_three.astype(int)

In [None]:
# Derive calendar features from 'unixReviewTime' (interpreted as seconds since the epoch)
# for the training and the test dataset in the same way
for frame in (train_df, test_df):
    # Parsed timestamp column
    frame['datetime'] = pd.to_datetime(frame['unixReviewTime'], unit='s')

    # Hour, weekday (pandas' dayofweek numbering), month and year of each review
    parts = frame['datetime'].dt
    frame['hour'] = parts.hour
    frame['weekday'] = parts.dayofweek
    frame['month'] = parts.month
    frame['year'] = parts.year

In [None]:
# Numeric code for every category label, numbered from 1 in the order listed
category_mapping = {
    label: code
    for code, label in enumerate(
        ['automotive', 'CDs', 'grocery', 'cell_phones', 'sports', 'toys'], start=1
    )
}

# Add 'category_no' to both datasets by substituting each label with its code;
# replace() leaves any value that is not a key of the mapping unchanged
for frame in (train_df, test_df):
    frame['category_no'] = frame['category'].replace(category_mapping)

In [None]:
# Build 'combined_text' = reviewText + ' ' + summary for the training and test datasets.
# Missing values are replaced with '' for the concatenation only: string addition with
# NaN yields NaN, so without this a review lacking either field would lose its
# combined text entirely. The source columns themselves are left untouched here.
for frame in (train_df, test_df):
    frame['combined_text'] = frame['reviewText'].fillna('') + ' ' + frame['summary'].fillna('')

In [None]:
# Encode the boolean 'verified' flag as 0/1 in both datasets
verified_codes = {False: 0, True: 1}

train_df['verified'] = train_df['verified'].map(verified_codes)
test_df['verified'] = test_df['verified'].map(verified_codes)

In [None]:
# Stop-word removal for the text columns

# Replace missing 'reviewText' / 'summary' values in the training dataset with empty strings
for text_col in ('reviewText', 'summary'):
    train_df[text_col] = train_df[text_col].fillna('')

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Define a function to remove stopwords from text
def remove_stopwords(text, stop_words=None):
    """Return `text` with stop words removed.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in the stop-word set. Remaining tokens are re-joined with single
    spaces. Punctuation is not stripped, so a token such as "the," is kept.

    Parameters
    ----------
    text : str
        Text to clean. Any non-string value (e.g. None or NaN from a missing
        cell) yields an empty string instead of raising.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and cached on the function for later calls.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        # Loading the corpus on every call is wasteful when this function is
        # applied row by row, so the set is built once and reused.
        stop_words = getattr(remove_stopwords, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = stop_words

    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Apply remove_stopwords to 'reviewText' and 'summary' in BOTH datasets.
# The test text must be cleaned here, alongside the training text, so that every later
# step (sentiment scoring, TF-IDF) sees train and test text that went through the same
# preprocessing. Missing values are filled with '' first so remove_stopwords only
# receives strings.
for frame in (train_df, test_df):
    for text_col in ('reviewText', 'summary'):
        frame[text_col] = frame[text_col].fillna('').apply(remove_stopwords)

# Display the updated training dataset
train_df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,category_sports,category_toys,rating,datetime,hour,weekday,month,year,category_no,combined_text
0,1,0,"11 12, 2016",C413C78E10E54C5DB41837889F36C1E8,565D194F38B1CC3F806EE677C61F639C,465E154EC79AFFAB5EB2607198B21433,reviews product fake.,"fake reviews, beware.",1478908800,2.0,...,0,0,0,2016-11-12,0,5,11,2016,1,all of the reviews for this product are fake. ...
1,1,1,"12 6, 2016",490AE37808EFEE3AF4FE6DEBDEB5A4C8,0D66512A0A7F580523AB996378DF0F14,760C63E8E5E8DC3FAA01878D37BA5678,wrong part. fault.,One Star,1480982400,,...,0,0,0,2016-12-06,0,1,12,2016,1,wrong part. our fault. One Star
2,1,1,"09 17, 2014",74A9FA5A64449BEE2A2E8E3F62872F0F,A0E45600FF2C5A779CB4314F379C253A,C6E4DD5C1C4EC09E90182644ED6CA9EF,wire set really sucks!!!,One Star,1410912000,,...,0,0,0,2014-09-17,0,2,9,2014,1,this wire set it really sucks!!! One Star
3,1,1,"06 11, 2016",EB561158A2829D98B467FE03CC1E45F1,37AB9A82470595E0ACB88BAC48C150EE,F4892A77EA45C52F40AB17ED537EF9FF,"first use, leaked instantly. even 5 bucks wort...",One Star,1465603200,,...,0,0,0,2016-06-11,0,5,6,2016,1,"first use, it leaked instantly. even at 5 buck..."
4,1,1,"12 23, 2017",5045D801332850D21618DD13A697CD9B,5772FF30428EEB8E0258C1A53CA2EC50,522F0BBFF2B47F1D63FF781A0AB1D079,fit,One Star,1513987200,,...,0,0,0,2017-12-23,0,5,12,2017,1,didn't fit One Star


*Using ColumnTransformer and TfidfVectorizer to convert the words to vectors that are further condensed so that they can be used for binary classification*

*Logistic Regression with 5 fold Cross Validation*

In [None]:
# Import necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, StratifiedKFold
import numpy as np

# Fill NaN values in 'reviewText' and 'summary' columns with empty strings in the training dataset
# (TfidfVectorizer needs string input)
train_df['reviewText'] = train_df['reviewText'].fillna('')
train_df['summary'] = train_df['summary'].fillna('')

# Select features and target variable
X = train_df[['reviewText', 'summary', 'verified', 'weekday']]
y = train_df['rating']

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# One TfidfVectorizer per text column so each column gets its own vocabulary
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()

# Apply TF-IDF to the text columns; the remaining columns ('verified', 'weekday')
# are passed through unchanged
column_transformer = ColumnTransformer([
    ('tfidf1', vectorizer1, 'reviewText'),
    ('tfidf2', vectorizer2, 'summary')],
    remainder='passthrough')

# Create a pipeline with the column transformer and Logistic Regression
pipe = Pipeline([
    ('tfidf', column_transformer),
    ('classify', model)
])

# 5-fold stratified cross-validation with a fixed seed so the folds are reproducible.
# The pipeline is refit from scratch on every fold, so the TF-IDF vocabulary is learned
# from that fold's training rows only. (A separate cross_val_predict pass used to run
# before this loop, but its result was overwritten before being read, so it only
# added five extra fits on different folds.)
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Calculate and print evaluation metrics for each fold
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Calculate evaluation metrics; AUC uses the predicted probability of class 1
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])
    conf_matrix = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    # Print evaluation metrics for each fold
    print(f"Fold {fold}: F1 Score - {f1}, AUC - {auc}, Accuracy - {acc}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("---------------")


Fold 1: F1 Score - 0.850130740774544, AUC - 0.9317798903533203, Accuracy - 0.8516615279205207
Confusion Matrix:
[[3233  323]
 [ 543 1739]]
---------------
Fold 2: F1 Score - 0.850754275825502, AUC - 0.9251632081265915, Accuracy - 0.8521754025351148
Confusion Matrix:
[[3228  328]
 [ 535 1747]]
---------------
Fold 3: F1 Score - 0.8396596523093816, AUC - 0.9177469983210906, Accuracy - 0.841897910243234
Confusion Matrix:
[[3238  318]
 [ 605 1677]]
---------------
Fold 4: F1 Score - 0.8438103121089414, AUC - 0.9247964376825468, Accuracy - 0.8461801986981843
Confusion Matrix:
[[3260  295]
 [ 603 1680]]
---------------
Fold 5: F1 Score - 0.8432155451783846, AUC - 0.926003912475917, Accuracy - 0.8451259208497516
Confusion Matrix:
[[3231  324]
 [ 580 1702]]
---------------


*5 folds for 3 other models - Decision Tree, Random Forest and XG Boost*

In [None]:
# Import necessary libraries for the new models
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Every pipeline gets its own unfitted copy of the column transformer. Pipeline does not
# copy its steps, so passing the same instance to several pipelines would make them all
# share (and refit) one transformer object.
# The classifiers are seeded so repeated runs of this cell give the same scores.

# Decision Tree model and pipeline
model_tree = DecisionTreeClassifier(random_state=42)
pipe_tree = Pipeline([
    ('tfidf', clone(column_transformer)),
    ('classify', model_tree)
])

# Random Forest model and pipeline
model_forest = RandomForestClassifier(random_state=42)
pipe_forest = Pipeline([
    ('tfidf', clone(column_transformer)),
    ('classify', model_forest)
])

# XGBoost model and pipeline
model_xgb = XGBClassifier(random_state=42)
pipe_xgb = Pipeline([
    ('tfidf', clone(column_transformer)),
    ('classify', model_xgb)
])

# Models and Pipelines in a list for iteration
models = [(pipe_tree, "Decision Tree"), (pipe_forest, "Random Forest"), (pipe_xgb, "XGBoost")]

# Same seeded, stratified 5-fold split for every model so their scores are comparable
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Iterate over each model.
# The loop variable is deliberately not called `model`, so the logistic-regression
# estimator bound to that name is not overwritten. No separate cross_val_predict pass is
# run: its predictions were never read and it cost five extra fits per model.
for candidate, name in models:
    print(f"Model: {name}")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        candidate.fit(X_train, y_train)
        y_pred = candidate.predict(X_test)
        # Calculate evaluation metrics; AUC uses the predicted probability of class 1
        f1 = f1_score(y_test, y_pred, average='weighted')
        auc = roc_auc_score(y_test, candidate.predict_proba(X_test)[:, 1])
        conf_matrix = confusion_matrix(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        # Print evaluation metrics for each fold
        print(f"Fold {fold}: F1 Score - {f1}, AUC - {auc}, Accuracy - {acc}")
        print("  Confusion Matrix:")
        print(conf_matrix)
        print("  ---------------")
    print("===================================")


Model: Decision Tree
Fold 1: F1 Score - 0.7340857956228601, AUC - 0.7237081369430047, Accuracy - 0.7331277834874957
  Confusion Matrix:
[[2727  829]
 [ 729 1553]]
  ---------------
Fold 2: F1 Score - 0.7444458650675987, AUC - 0.7332728922688345, Accuracy - 0.7439191503939705
  Confusion Matrix:
[[2781  775]
 [ 720 1562]]
  ---------------
Fold 3: F1 Score - 0.7284623370707004, AUC - 0.7178121139765504, Accuracy - 0.7274751627269613
  Confusion Matrix:
[[2710  846]
 [ 745 1537]]
  ---------------
Fold 4: F1 Score - 0.7395343624961754, AUC - 0.7272413417093136, Accuracy - 0.7392942788626242
  Confusion Matrix:
[[2782  773]
 [ 749 1534]]
  ---------------
Fold 5: F1 Score - 0.7329635314623155, AUC - 0.720742717112213, Accuracy - 0.7325681000513963
  Confusion Matrix:
[[2755  800]
 [ 761 1521]]
  ---------------
Model: Random Forest
Fold 1: F1 Score - 0.8116049160193665, AUC - 0.9161929843673134, Accuracy - 0.8191161356628982
  Confusion Matrix:
[[3355  201]
 [ 855 1427]]
  ---------------

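As the fold results above show, the tree-based models do not beat the Logistic Regression baseline (≈0.85 accuracy): the Decision Tree reaches only ≈0.73–0.74 accuracy, and the one Random Forest fold shown reaches ≈0.82. No XGBoost output is shown for this cell.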

*Sentiment Analysis*

In [None]:
# Import necessary libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# SentimentIntensityAnalyzer loads the 'vader_lexicon' resource when it is constructed;
# download it explicitly so this cell also works on a fresh kernel / new machine
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# NOTE(review): train_df's text columns have already had stop words removed at this
# point, which also strips negations (e.g. "didn't fit" became "fit" in the preview
# above). Consider scoring the original text instead - confirm intent.

# Calculate sentiment scores for 'reviewText' in the training and test datasets.
# Each cell of the new column holds the dict returned by polarity_scores
# (the cells below read its 'neg', 'neu' and 'pos' entries).
train_df['reviewText_sentiment'] = train_df['reviewText'].apply(sia.polarity_scores)
test_df['reviewText_sentiment'] = test_df['reviewText'].apply(sia.polarity_scores)

# Calculate sentiment scores for 'summary' in the training and test datasets
# Missing 'summary' values in the test dataset are kept as NaN instead of being scored
train_df['summary_sentiment'] = train_df['summary'].apply(sia.polarity_scores)
test_df['summary_sentiment'] = test_df['summary'].apply(lambda x: sia.polarity_scores(x) if not pd.isnull(x) else np.nan)

In [None]:
# (target column, key in the sentiment dict) pairs for each text field
review_score_columns = (
    ('reviewText_negative', 'neg'),
    ('reviewText_neutral', 'neu'),
    ('reviewText_positive', 'pos'),
)
summary_score_columns = (
    ('summary_negative', 'neg'),
    ('summary_neutral', 'neu'),
    ('summary_positive', 'pos'),
)

# Unpack 'reviewText_sentiment' into negative / neutral / positive columns, train first, then test
for frame in (train_df, test_df):
    for column, key in review_score_columns:
        frame[column] = frame['reviewText_sentiment'].apply(lambda scores, key=key: scores[key])

# Unpack 'summary_sentiment' for the training dataset
for column, key in summary_score_columns:
    train_df[column] = train_df['summary_sentiment'].apply(lambda scores, key=key: scores[key])

# Unpack 'summary_sentiment' for the test dataset; entries that are NaN become 0
for column, key in summary_score_columns:
    test_df[column] = test_df['summary_sentiment'].apply(
        lambda scores, key=key: 0 if pd.isnull(scores) else scores[key]
    )

In [None]:
# List every column now present in the training dataset (original fields plus the engineered features)
train_df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image', 'style', 'category', 'id', 'category_CDs',
       'category_automotive', 'category_cell_phones', 'category_grocery',
       'category_sports', 'category_toys', 'rating', 'datetime', 'hour',
       'weekday', 'month', 'year', 'category_no', 'combined_text',
       'reviewText_sentiment', 'summary_sentiment', 'reviewText_negative',
       'reviewText_neutral', 'reviewText_positive', 'summary_negative',
       'summary_neutral', 'summary_positive'],
      dtype='object')

*Classification with only sentiment analysis scores*

In [None]:
# Import necessary libraries and modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Features: the negative and positive sentiment scores of the review text and of the summary
numeric_features = ['reviewText_negative', 'reviewText_positive', 'summary_negative', 'summary_positive']
X = train_df[numeric_features]
y = train_df['rating']

# Single stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the numeric features; any other column would be passed through untouched
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
    remainder='passthrough',
)

# Logistic Regression on top of the scaled features, fitted on the training split
model = LogisticRegression(max_iter=1000)
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classify', model)])
pipe.fit(X_train, y_train)

# Hard predictions for the held-out split
y_pred = pipe.predict(X_test)

# Evaluation metrics for the single split; AUC uses the predicted probability of class 1
f1 = f1_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])
acc = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print(
    f"F1 Score - {f1}, AUC - {auc}, Accuracy - {acc}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

F1 Score - 0.7107456199447986, AUC - 0.7762961761681632, Accuracy - 0.7158273381294964
Confusion Matrix:
[[2900  656]
 [1003 1279]]


In [None]:
# Clean the test text columns the same way the training text was cleaned.
# NaN is replaced with '' BEFORE remove_stopwords runs, so the function only ever
# receives strings, and 'summary' is cleaned as well as 'reviewText' (the training
# dataset had both columns cleaned).
for text_col in ('reviewText', 'summary'):
    test_df[text_col] = test_df[text_col].fillna('').apply(remove_stopwords)

# Replace NaN with 0 in numeric columns
numeric_columns = ['reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']
test_df[numeric_columns] = test_df[numeric_columns].fillna(0)

# Perform prediction using the previously defined pipeline 'pipe'
# NOTE(review): `pipe` is whichever pipeline was fitted most recently. When the notebook
# is run top to bottom that is the sentiment-only logistic regression, even though the
# output file below is named "withoutsenti" - confirm which model this file is meant to hold.
predicted_classes = pipe.predict(test_df[['summary', 'reviewText', 'reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']])

# Combine IDs with predicted classifications
results = pd.DataFrame({'id': test_df['id'], 'preds': predicted_classes})

# Display the results
print(results)

# Save results to a CSV file
results.to_csv("sample_data/results_withoutsenti.csv", index=False)

         id  preds
0        a0      0
1        a1      0
2        a2      0
3        a3      0
4        a4      0
...     ...    ...
4495  a4495      0
4496  a4496      0
4497  a4497      1
4498  a4498      0
4499  a4499      0

[4500 rows x 2 columns]


In [None]:
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Classifiers to compare on the sentiment-score features, all evaluated on the same
# hold-out split (X_train / X_test). They are seeded so the reported numbers are
# reproducible from run to run.
classifiers = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
]

# Fitted pipelines keyed by display name, kept for later inspection
fitted_pipelines = {}

for name, classifier in classifiers:
    # clone() gives each pipeline its own unfitted copy of the preprocessor, so fitting
    # here does not refit the instance held by any other pipeline
    candidate = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('classify', classifier)])
    candidate.fit(X_train, y_train)
    fitted_pipelines[name] = candidate

    # Hard predictions plus the predicted probability of class 1 (used for AUC)
    candidate_pred = candidate.predict(X_test)
    candidate_score = candidate.predict_proba(X_test)[:, 1]

    candidate_f1 = f1_score(y_test, candidate_pred, average='weighted')
    candidate_auc = roc_auc_score(y_test, candidate_score)
    candidate_conf = confusion_matrix(y_test, candidate_pred)
    candidate_acc = accuracy_score(y_test, candidate_pred)

    print(f"{name}:")
    print(f"F1 Score - {candidate_f1}, AUC - {candidate_auc}, Accuracy - {candidate_acc}")
    print("Confusion Matrix:")
    print(candidate_conf)

Decision Tree:
F1 Score - 0.6654332605016403, AUC - 0.6504702769953932, Accuracy - 0.6670092497430626
Confusion Matrix:
[[2639  917]
 [1027 1255]]
Random Forest:
F1 Score - 0.6961737953675177, AUC - 0.7443556778781268, Accuracy - 0.6992120589242892
Confusion Matrix:
[[2786  770]
 [ 986 1296]]
XGBoost:
F1 Score - 0.7205812930926203, AUC - 0.778835243096804, Accuracy - 0.722850291195615
Confusion Matrix:
[[2837  719]
 [ 899 1383]]



*Classification with both Vectorization and sentiment analysis*

In [None]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Set X and y: both text columns plus all six sentiment-score columns
X = train_df[['summary','reviewText','reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']]
y = train_df['rating']

# Split the data into training and testing sets (one fold)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Classifiers to compare, seeded so the reported numbers are reproducible
classifiers = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
]

# Fitted pipelines keyed by display name, kept for later inspection
fitted_pipelines = {}

for name, classifier in classifiers:
    # `column_transformer` was defined (and fitted) in an earlier cell. clone() returns
    # an unfitted copy with the same configuration, so each pipeline learns its TF-IDF
    # vocabulary from this cell's X_train and no transformer instance is shared.
    candidate = Pipeline(steps=[('tfidf', clone(column_transformer)), ('classify', classifier)])
    candidate.fit(X_train, y_train)
    fitted_pipelines[name] = candidate

    # Hard predictions plus the predicted probability of class 1 (used for AUC)
    candidate_pred = candidate.predict(X_test)
    candidate_score = candidate.predict_proba(X_test)[:, 1]

    candidate_f1 = f1_score(y_test, candidate_pred, average='weighted')
    candidate_auc = roc_auc_score(y_test, candidate_score)
    candidate_conf = confusion_matrix(y_test, candidate_pred)
    candidate_acc = accuracy_score(y_test, candidate_pred)

    print(f"{name}:")
    print(f"F1 Score - {candidate_f1}, AUC - {candidate_auc}, Accuracy - {candidate_acc}")
    print("Confusion Matrix:")
    print(candidate_conf)

Decision Tree:
F1 Score - 0.7480976634803119, AUC - 0.7369311499296594, Accuracy - 0.7476875642343268
Confusion Matrix:
[[2798  758]
 [ 715 1567]]
Random Forest:
F1 Score - 0.8077822411470889, AUC - 0.9066680328959756, Accuracy - 0.8146625556697499
Confusion Matrix:
[[3317  239]
 [ 843 1439]]
XGBoost:
F1 Score - 0.8265074188593577, AUC - 0.9116947791144863, Accuracy - 0.828023295649195
Confusion Matrix:
[[3150  406]
 [ 598 1684]]


In [None]:
from sklearn.model_selection import train_test_split

# Fresh TF-IDF vectorizers, one per text column
vectorizer1, vectorizer2 = TfidfVectorizer(), TfidfVectorizer()

# TF-IDF on 'reviewText' and 'summary'; every other column is passed through unchanged
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf1', vectorizer1, 'reviewText'),
        ('tfidf2', vectorizer2, 'summary'),
    ],
    remainder='passthrough',
)

# Logistic Regression classifier on top of the transformed features
model = LogisticRegression(max_iter=1000)
pipe = Pipeline(steps=[('tfidf', column_transformer), ('classify', model)])

# Train on the existing X_train / y_train split, then predict the held-out split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Metrics for the single split; AUC uses the predicted probability of class 1
f1 = f1_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(
    f"F1 Score - {f1}, AUC - {auc}, Accuracy - {acc}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)


F1 Score - 0.8432280493557215, AUC - 0.9254914358864651, Accuracy - 0.8446385748544022
Confusion Matrix:
[[3201  355]
 [ 552 1730]]


Predicting values from the test data frame

*Preprocess test data*

In [None]:
# Clean the test text columns the same way the training text was cleaned.
# NaN is replaced with '' BEFORE remove_stopwords runs, so the function only ever
# receives strings, and 'summary' is cleaned as well as 'reviewText' (the training
# dataset had both columns cleaned).
for text_col in ('reviewText', 'summary'):
    test_df[text_col] = test_df[text_col].fillna('').apply(remove_stopwords)

# Replace NaN with 0 in numeric columns
numeric_columns = ['reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']
test_df[numeric_columns] = test_df[numeric_columns].fillna(0)

# Perform prediction with the most recently fitted pipeline `pipe`, passing the two text
# columns and the six sentiment-score columns
predicted_classes = pipe.predict(test_df[['summary', 'reviewText', 'reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']])

# Combine IDs with predicted classifications
results = pd.DataFrame({'id': test_df['id'], 'preds': predicted_classes})

# Display the results
print(results)

         id  preds
0        a0      0
1        a1      0
2        a2      0
3        a3      0
4        a4      0
...     ...    ...
4495  a4495      1
4496  a4496      1
4497  a4497      1
4498  a4498      1
4499  a4499      1

[4500 rows x 2 columns]


In [None]:
# Save the results (columns 'id' and 'preds') to a CSV file; index=False leaves out the row index
results.to_csv("sample_data/results.csv", index=False)


In [None]:
# Preview the first 120 training rows: id, the cleaned text columns and their six sentiment-score columns
train_df[['id','summary', 'reviewText', 'reviewText_negative', 'reviewText_neutral', 'reviewText_positive', 'summary_negative', 'summary_neutral', 'summary_positive']].head(120)

Unnamed: 0,id,summary,reviewText,reviewText_negative,reviewText_neutral,reviewText_positive,summary_negative,summary_neutral,summary_positive
0,ab0,"fake reviews, beware.",reviews product fake.,0.608,0.392,0.000,0.608,0.392,0.000
1,ab1,One Star,wrong part. fault.,0.853,0.147,0.000,0.000,1.000,0.000
2,ab2,One Star,wire set really sucks!!!,0.550,0.450,0.000,0.000,1.000,0.000
3,ab3,One Star,"first use, leaked instantly. even 5 bucks wort...",0.225,0.588,0.186,0.000,1.000,0.000
4,ab4,One Star,fit,0.000,0.000,1.000,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...,...
115,ab115,Fit 1157 Applications,"bulb holder/insert car would accept bulb, flar...",0.000,0.852,0.148,0.000,0.444,0.556
116,ab116,ultra durable advertised,seam separated sleeve minor fall ( less 5 mph ...,0.161,0.678,0.161,0.000,1.000,0.000
117,ab117,Quantity 12pc Advertised,"unhappy received. ordered ""bulk 12"" expected r...",0.196,0.804,0.000,0.000,1.000,0.000
118,ab118,One Star,cheap chinese junk.,0.000,1.000,0.000,0.000,1.000,0.000
