In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from collections import Counter

# Download files, set up folder, put files into folder

In [4]:
# training data: ./train.tsv
# test data:     ./test.tsv

In [5]:
# For files with 3 cols. Label, title, and review. Filtering out just the label and review.
# Function to process a CSV file and convert it to TSV using review column
def convert_csv_to_tsv(input_csv, output_tsv):
    # Read the CSV file
    df = pd.read_csv('/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - Assignment 2/csv, Code files/train_converted.tsv')

    # Print column names and first few rows to confirm structure
    print(f"\nProcessing {input_csv}")
    print("Column names:", df.columns.tolist())
    print("First few rows:")
    print(df.head())

    # Keep only the label (column 0) and review (column 2), drop title (column 1)
    df = df.iloc[:, [0, 2]]

    # Map the labels: 2 -> 1 (positive), 1 -> 0 (negative)
    df.iloc[:, 0] = df.iloc[:, 0].map({2: 1, 1: 0})

    # Rename the columns to match train.tsv format
    df.columns = ['label', 'review']

    # Save as TSV file with tab separator, without index
    df.to_csv(output_tsv, sep='\t', index=False, header=True)

    print(f"Conversion complete! File saved as {output_tsv}")
    print("First few rows of the converted data:")
    print(df.head())

# Paths to your files (adjust if needed)
train_csv = '/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - 2/csv, Code files/gittrain.csv'
test_csv = '/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - 2/csv, Code files/gittest.csv'
train_tsv = 'train_converted.tsv'
test_tsv = 'test_converted.tsv'

# Convert both files
convert_csv_to_tsv(train_csv, train_tsv)
convert_csv_to_tsv(test_csv, test_tsv)

ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 8


# Load training and test data

In [4]:
dataframe = pd.read_csv('/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - Assignment 2/csv, Code files/train_converted.tsv', sep = '\t')
print(dataframe)

         label                                             review
0            1  I'm reading a lot of reviews saying that this ...
1            1  This soundtrack is my favorite music of all ti...
2            1  I truly like this soundtrack and I enjoy video...
3            1  If you've played the game, you know how divine...
4            1  I am quite sure any of you actually taking the...
...        ...                                                ...
3599994      0  The high chair looks great when it first comes...
3599995      0  I have used this highchair for 2 kids now and ...
3599996      0  We have a small house, and really wanted two o...
3599997      0  not sure what this book is supposed to be. It ...
3599998      1  I agree that every American should read this b...

[3599999 rows x 2 columns]


In [None]:
''' Ignore, Trial for a .csv file downloaded fro github.....
# Read the CSV file
df = pd.read_csv('/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - 2/csv, Code files/gittrain.csv')

# Print the first few rows and column names to inspect the data
print("Column names:", df.columns.tolist())
print("First few rows:")
print(df.head())

# Assuming the first column is the label and second is the review text
# Adjust these based on what you see in the column names printed above
label_col = df.columns[0]  # First column
review_col = df.columns[1] # Second column

# Map the labels: 2 -> 1 (positive) and 1 -> 0 (negative)
# This assumes 2 is positive and 1 is negative - verify this from the sample
df[label_col] = df[label_col].map({2: 1, 1: 0})

# Rename columns to match train.tsv expected format
df = df.rename(columns={label_col: 'label', review_col: 'review'})

# Ensure the columns are in the right order (label first, then review)
df = df[['label', 'review']]

# Save as TSV file with tab separator, without index
df.to_csv('train_converted.tsv', sep='\t', index=False, header=True)

print("Conversion complete! File saved as 'train_converted.tsv'")
print("First few rows of the converted data:")
print(df.head())
'''

In [6]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 2879999
validation set size: 720000


In [10]:
test_dataframe = pd.read_csv('/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - Assignment 2/csv, Code files/gittest_converted.tsv', sep = '\t')
print (test_dataframe)

        label                                             review
0           1  Despite the fact that I have only played a sma...
1           0  I bought this charger in Jul 2003 and it worke...
2           1  Check out Maha Energy's website. Their Powerex...
3           1  Reviewed quite a bit of the combo players and ...
4           0  I also began having the incorrect disc problem...
...       ...                                                ...
399994      0  We bought this Thomas for our son who is a hug...
399995      0  My son recieved this as a birthday gift 2 mont...
399996      0  I bought this toy for my son who loves the "Th...
399997      1  This is a compilation of a wide range of Mitfo...
399998      0  This DVD will be a disappointment if you get i...

[399999 rows x 2 columns]


# Try the trivial baseline: predict the majority label of the training set

In [13]:
Counter(train_dataframe['label'])                          ## Counts no. of 1's and 0's

Counter({1: 1440465, 0: 1439534})

In [15]:
# Looks like label 1 has slightly more counts than label 0 in training data
# So the 'majority guess' prediction is an array filled with 1s
majority_guess_pred = [1 for i in range(len(valid_dataframe))]
accuracy = accuracy_score(valid_dataframe['label'], majority_guess_pred)
print ('Majority guess accuracy:', accuracy)

Majority guess accuracy: 0.49935277777777776


In [17]:
# helper function: write out prediction values into a csv format file             '''Important'''
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [19]:
# majority_guess_pred_test = [1 for i in range(len(test_dataframe))]
# write_test_prediction(test_dataframe, majority_guess_pred_test, './majority_guess.csv')

# Build feature extractor

## use all unigrams from training data as features

In [23]:
# Count vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2, binary=True)
vectorizer.fit(train_dataframe['review'])


In [None]:
# Ignore, I was just testing
#train_dataframe

# Extract feature vectors for training, validation, and test data 

In [24]:
# Count vectorizer
train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])
print (train_X.shape)
print (valid_X.shape)
print (test_X.shape)


(2879999, 6062265)
(720000, 6062265)
(399999, 6062265)


In [None]:
'''
# Using TFIDF vectorizer instead of count vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2)
vectorizer.fit(train_dataframe['review'])

train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])
'''

In [25]:
train_y = train_dataframe['label'].to_numpy()
valid_y = valid_dataframe['label'].to_numpy()
print (train_y.shape)
print (valid_y.shape)

(2879999,)
(720000,)


## Use chi-square statistic to select a subset of features

In [27]:

num_features_to_select = 5000
feature_selector = SelectKBest(score_func = chi2, k = num_features_to_select)
feature_selector.fit(train_X, train_y)

# feature names
all_features = [feature for feature, index in sorted(vectorizer.vocabulary_.items(), key = lambda x: x[1])]
selected_features = feature_selector.get_feature_names_out(input_features = all_features)


In [None]:
'''
# Chatgpt's PMI:
import numpy as np
from collections import defaultdict

# Compute word frequencies
def get_word_frequencies(dataframe):
    word_freq = defaultdict(int)
    total_words = 0
    for review in dataframe['review']:
        words = review.split()
        total_words += len(words)
        for word in words:
            word_freq[word] += 1
    return word_freq, total_words

# Compute class-specific word frequencies
def get_class_word_frequencies(dataframe, label):
    word_freq = defaultdict(int)
    total_words = 0
    for _, row in dataframe.iterrows():
        if row['label'] == label:
            words = row['review'].split()
            total_words += len(words)
            for word in words:
                word_freq[word] += 1
    return word_freq, total_words

# Compute PMI for each word
def compute_pmi(train_dataframe, num_features_to_select):
    word_freq, total_words = get_word_frequencies(train_dataframe)
    pos_word_freq, total_pos_words = get_class_word_frequencies(train_dataframe, 1)
    neg_word_freq, total_neg_words = get_class_word_frequencies(train_dataframe, 0)

    p_word = {word: freq / total_words for word, freq in word_freq.items()}

    pmi_scores = {}

    for word in word_freq:
        if word in pos_word_freq and word in neg_word_freq:  # Ensure word exists in both classes
            p_w_given_pos = pos_word_freq[word] / total_pos_words if word in pos_word_freq else 1e-10
            p_w_given_neg = neg_word_freq[word] / total_neg_words if word in neg_word_freq else 1e-10

            pmi_pos = np.log2(p_w_given_pos / p_word[word]) if p_word[word] > 0 else 0
            pmi_neg = np.log2(p_w_given_neg / p_word[word]) if p_word[word] > 0 else 0

            pmi_scores[word] = abs(pmi_pos - pmi_neg)

    # Select top N words based on PMI score
    selected_features = sorted(pmi_scores, key=pmi_scores.get, reverse=True)[:num_features_to_select]
    return set(selected_features)
'''

In [None]:
'''
num_features_to_select = 5000
feature_set = compute_pmi(train_dataframe, num_features_to_select)

vectorizer = CountVectorizer(vocabulary=feature_set)
vectorizer.fit(train_dataframe['review'])
'''

In [28]:
train_X_selected = feature_selector.transform(train_X)
valid_X_selected = feature_selector.transform(valid_X)
test_X_selected = feature_selector.transform(test_X)
print (train_X_selected.shape)
print (valid_X_selected.shape)
print (test_X_selected.shape)
'''print(train_X_selected.head())
print (valid_X_selected.head)
print (test_X_selected.head())'''

(2879999, 5000)
(720000, 5000)
(399999, 5000)


'print(train_X_selected.head())\nprint (valid_X_selected.head)\nprint (test_X_selected.head())'

In [None]:
'''
train_X_selected = vectorizer.transform(train_dataframe['review'])
valid_X_selected = vectorizer.transform(valid_dataframe['review'])
test_X_selected = vectorizer.transform(test_dataframe['review'])
'''

In [None]:
# Ignore, I was just testing
# print(train_X)

In [29]:
print(valid_X_selected[10])
# print (test_X_selected.shape)

  (0, 136)	1
  (0, 137)	1
  (0, 198)	1
  (0, 202)	1
  (0, 612)	1
  (0, 613)	1
  (0, 631)	1
  (0, 890)	1
  (0, 891)	1
  (0, 954)	1
  (0, 1613)	1
  (0, 1657)	1
  (0, 1691)	1
  (0, 1707)	1
  (0, 1969)	1
  (0, 2020)	1
  (0, 2051)	1
  (0, 2126)	1
  (0, 2422)	1
  (0, 2463)	1
  (0, 2660)	1
  (0, 3062)	1
  (0, 3111)	1
  (0, 3791)	1
  (0, 3916)	1
  (0, 3931)	1
  (0, 4010)	1
  (0, 4100)	1
  (0, 4239)	1
  (0, 4344)	1
  (0, 4770)	1
  (0, 4774)	1


# Train model on training set

In [None]:
'''
# Logistic Regression - 90%
model = LogisticRegression(C = 13, solver='liblinear')
model.fit(train_X_selected, train_y)
'''

In [None]:
''' SVM - 88%
# My own code, for 
# Asked chatgpt for help in implementing the code for SVM
from sklearn.svm import SVC

# Define and train the SVM model
model_svm = SVC(C=1.0, kernel='linear')  # You can change the kernel to 'rbf', 'poly', or 'sigmoid' if needed
model_svm.fit(train_X_selected, train_y)
train_y_hat = model_svm.predict(train_X_selected)
accuracy = accuracy_score(train_y, train_y_hat)
print('SVMs accuracy on training set:', accuracy)
'''

In [None]:
''' Decision tree with 1000 features - 75%
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier(max_depth=28, random_state=42)  # You can tune max_depth
model_dt.fit(train_X_selected, train_y)

valid_y_hat = model_dt.predict(valid_X_selected)
accuracy = accuracy_score(valid_y, valid_y_hat)
print(f"Decision Tree accuracy: {accuracy}")
'''

In [None]:
''''
from xgboost import XGBClassifier

# Define XGBoost model
''
model_xgb = XGBClassifier(
    n_estimators=600,        # Increase to compensate for lower learning rate
    learning_rate=0.03,      # Reduce even further for better generalization
    max_depth=6,             # Slightly deeper trees
    min_child_weight=5,      # Require more samples per split (reduces overfitting)
    subsample=0.75,          # Use 75% of data per tree (adds randomness)
    colsample_bytree=0.75,   # Use 75% of features per tree (reduces overfitting)
    gamma=0.2,               # Higher gamma to prevent unnecessary splits
    reg_lambda=2,  # Correct parameter for L2 regularization,               # Stronger L2 regularization
    alpha=1,                 # Adds L1 regularization
    random_state=42
)

model_xgb = XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=4, 
    min_child_weight=5, subsample=0.8, colsample_bytree=0.8, gamma=0.2, 
    random_state=42
)''

model_xgb = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=6, random_state=42)
# XGBoost accuracy on training set: 0.9373254990207109
# XGBoost accuracy on validation set: 0.8731455242540423


# Train the model
model_xgb.fit(train_X_selected, train_y)

# Evaluate on training set
train_y_hat = model_xgb.predict(train_X_selected)
train_accuracy = accuracy_score(train_y, train_y_hat)
print(f"XGBoost accuracy on training set: {train_accuracy}")

# Evaluate on validation set
valid_y_hat = model_xgb.predict(valid_X_selected)
valid_accuracy = accuracy_score(valid_y, valid_y_hat)
print(f"XGBoost accuracy on validation set: {valid_accuracy}")
'''


### Random Forests, Multinomial Naive_bayes, Lightgbm, 

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

print("--- Starting Benchmark ---")

models = {
    "Logistic Regression (Baseline)": LogisticRegression(C=12, solver='liblinear', random_state=42),
    
    "Multinomial Naive Bayes": MultinomialNB(),
    
    # Random Forest: An ensemble of decision trees. n_jobs=-1 is crucial for speed.
    "Random Forest": RandomForestClassifier(
        n_estimators=100, 
        max_depth=50,       # Controls complexity to prevent overfitting
        random_state=42, 
        n_jobs=-1           # Uses all available CPU cores for training
    ),
    
    # LightGBM: A state-of-the-art Gradient Boosting Machine, known for top performance.
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

# --- Run the benchmark ---
results = {}

# Re-using the feature matrices and labels from the cells above
# (train_X_selected, valid_X_selected, train_y, valid_y)
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train the model
    model.fit(train_X_selected, train_y)
    
    # Make predictions on the unseen validation set
    predictions = model.predict(valid_X_selected)
    
    # Calculate and store the accuracy
    accuracy = accuracy_score(valid_y, predictions)
    results[name] = accuracy
    print(f"{name} Validation Accuracy: {accuracy * 100:.4f}%")

# --- Final Results Summary ---
print("\n--- FINAL BENCHMARK RESULTS ---")
# Sort the results to find the best performing model
for name, acc in sorted(results.items(), key=lambda item: item[1], reverse=True):
    print(f"{name:<35} | Accuracy: {acc*100:.4f}%")

--- Starting Benchmark ---

Training Logistic Regression (Baseline)...
Logistic Regression (Baseline) Validation Accuracy: 90.3640%

Training Multinomial Naive Bayes...
Multinomial Naive Bayes Validation Accuracy: 87.4149%

Training Random Forest...
Random Forest Validation Accuracy: 83.5056%

Training LightGBM...


TypeError: Expected np.float32 or np.float64, met type(int64)

Training Logistic Regression (Baseline)...
Logistic Regression (Baseline) Validation Accuracy: 90.3640%

Training Multinomial Naive Bayes...
Multinomial Naive Bayes Validation Accuracy: 87.4149%

Training Random Forest...
Random Forest Validation Accuracy: 83.5056%

LightGBM (Gradient Boosting)        | Accuracy: 85.1663%

In [40]:
# Using float32 data type for lightgbm
# LightGBM requires the data to be in float format (e.g., float32 or float64).
# The CountVectorizer creates an integer matrix, so we convert it here.
print(">>> Converting data to float32 for LightGBM compatibility...")
train_X_selected = train_X_selected.astype('float32')
valid_X_selected = valid_X_selected.astype('float32')

models = {"LightGBM (Gradient Boosting)": lgb.LGBMClassifier(random_state=42)}

results = {}

print("\n--- Starting Model Training and Evaluation ---")
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(train_X_selected, train_y)
    predictions = model.predict(valid_X_selected)
    accuracy = accuracy_score(valid_y, predictions)
    results[name] = accuracy
    print(f"✅ {name} Validation Accuracy: {accuracy * 100:.4f}%")

# --- 3. Final Results Summary ---
print("\n--- FINAL BENCHMARK RESULTS ---")
for name, acc in sorted(results.items(), key=lambda item: item[1], reverse=True):
    print(f"{name:<35} | Accuracy: {acc*100:.4f}%")

>>> Converting data to float32 for LightGBM compatibility...

--- Starting Model Training and Evaluation ---

Training LightGBM (Gradient Boosting)...
[LightGBM] [Info] Number of positive: 1440465, number of negative: 1439534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.439659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10000
[LightGBM] [Info] Number of data points in the train set: 2879999, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500162 -> initscore=0.000647
[LightGBM] [Info] Start training from score 0.000647
✅ LightGBM (Gradient Boosting) Validation Accuracy: 85.1663%

--- FINAL BENCHMARK RESULTS ---
LightGBM (Gradient Boosting)        | Accuracy: 85.1663%


In [None]:
''' LSTM
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np

# Tokenization and Padding
tokenizer = Tokenizer(num_words=3500, oov_token="<OOV>")
tokenizer.fit_on_texts(train_dataframe['review'])

train_sequences = tokenizer.texts_to_sequences(train_dataframe['review'])
valid_sequences = tokenizer.texts_to_sequences(valid_dataframe['review'])
test_sequences = tokenizer.texts_to_sequences(test_dataframe['review'])

max_length = 150  # Set max length for padding
train_X = pad_sequences(train_sequences, maxlen=max_length, padding="post")
valid_X = pad_sequences(valid_sequences, maxlen=max_length, padding="post")
test_X = pad_sequences(test_sequences, maxlen=max_length, padding="post")

train_y = np.array(train_dataframe['label'])
valid_y = np.array(valid_dataframe['label'])

# Define LSTM model
model_lstm = Sequential([
    Embedding(input_dim=3500, output_dim=128, input_length=max_length),
    LSTM(64, return_sequences=True, recurrent_dropout=0.2),  # Adds recurrent dropout
    Dropout(0.3),  # Increase dropout to prevent overfitting
    LSTM(32),
    Dropout(0.3),  # Added dropout here too
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])


# Compile the model
model_lstm.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
history = model_lstm.fit(
    train_X, train_y,
    validation_data=(valid_X, valid_y),
    epochs=6,
    batch_size=128
)

# Evaluate on validation set
valid_loss, valid_accuracy = model_lstm.evaluate(valid_X, valid_y)
print(f"LSTM accuracy on validation set: {valid_accuracy}")
'''

In [None]:

''' LSTM output
import os
import numpy as np

# 🚀 Predict on Test Data (Ensure Correct Shape)
test_y_hat = (model_lstm.predict(test_X) > 0.5).astype("int32").flatten()

# ✅ Debug: Check Sizes Before Saving
print(f"🔍 Predictions Generated: {len(test_y_hat)}")
print(f"🔍 Test Data Size: {len(test_dataframe)}")

# ✅ Fix Length Mismatch
if len(test_y_hat) > len(test_dataframe):
    test_y_hat = test_y_hat[:len(test_dataframe)]  # Trim excess
elif len(test_y_hat) < len(test_dataframe):
    raise ValueError(f"❌ ERROR: Predictions ({len(test_y_hat)}) are LESS than test data ({len(test_dataframe)})!")

# 🚀 Assign Predictions to DataFrame
test_dataframe['label'] = test_y_hat

# ✅ Set Save Path
save_path = "/Users/pacbook/Desktop/UNC/642 Data Mining/Homework - 2/csv_Code_Files/6000_lstm_predictions.csv"

# ✅ Ensure Save Directory Exists
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# 🚀 Save Predictions to CSV
test_dataframe[['label']].to_csv(save_path, index=False)

print(f"✅ Predictions saved successfully to: {save_path}")
'''

# Evaluate model on training set

In [None]:
'''
train_y_hat = model.predict(train_X_selected)
accuracy = accuracy_score(train_y, train_y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)
'''

In [None]:
# My own code, ignore
text = np.array(["Amazing sucks"])
transform1 = vectorizer.transform(text)
transform2 = feature_selector.transform(transform1)

In [None]:
array1 = transform1.toarray()
array2 = transform2.toarray()

In [None]:
transform1.toarray().size

In [None]:
non_zero_indices = np.nonzero(array2)
non_zero_indices

In [None]:
non_zero_values = array2[non_zero_indices]
non_zero_values

In [None]:
model.predict(transform2.reshape(1, -1))

# Evaluate model on validation set

In [None]:
'''
valid_y_hat = model.predict(valid_X_selected)
accuracy = accuracy_score(valid_y, valid_y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)
'''

In [None]:
'''
valid_y_hat = model_svm.predict(valid_X_selected)
accuracy = accuracy_score(valid_y, valid_y_hat)
print ('SVM accuracy on validation set:', accuracy)
'''

# After experimentation on the validation set: retrain the final model on all training data, and predict labels for test data

In [None]:
'''
all_train_X = vectorizer.transform(dataframe['review'])
all_train_X_selected = feature_selector.transform(all_train_X)
all_train_y = dataframe['label'].to_numpy()

model.fit(all_train_X_selected, all_train_y)
test_y_hat = model.predict(test_X_selected)
write_test_prediction(test_dataframe, test_y_hat, './logistic_regression.csv')
'''

In [None]:
'''
model_svm.fit(all_train_X_selected, all_train_y)
test_y_hat = model_svm.predict(test_X_selected)
write_test_prediction(test_dataframe, test_y_hat, './svm_predictions.csv')
'''

In [None]:
'''
test_y_hat = model_xgb.predict(test_X_selected)
write_test_prediction(test_dataframe, test_y_hat, './xgboost_predictions.csv')
'''

In [None]:
'''
test_y_hat = (model_lstm.predict(test_X) > 0.5).astype("int32")
write_test_prediction(test_dataframe, test_y_hat, './lstm_predictions.csv')
'''

In [None]:
# 🚀 Ensure Predictions are in Correct Format
test_y_hat = (model_lstm.predict(test_X) > 0.5).astype("int32").flatten()  # Flatten removes brackets

# 🚀 Save Predictions in Correct Format
write_test_prediction(test_dataframe, test_y_hat, './40000lstm_predictions.csv')

print("✅ Predictions saved in correct format!")


In [None]:
'''
import os
print(os.getcwd())  # Get current working directory
'''

# Investigate what the model has learned and where it failed (A.K.A. error analysis)

## Look at learned parameters (for linear model: weight of each dimension)

In [None]:
# construct a mapping: word -> learned weight of this word
feature_weight = {}
for idx, feature in enumerate(selected_features):
    feature_weight[feature] = model.coef_[0][idx]

In [None]:
# words correlated with positive sentiment (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = True)[:20]:
     print ('"{}"'.format(k), v)

In [None]:
# words correlated with negative sentiments (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = False)[:20]:
     print ('"{}"'.format(k), v)

## Look at how the model makes predictions on individual examples

In [None]:
# We pick a set of examples from the validation set (we predicted scores for those).
# We usually we don't pick from training data (since the good performance may be unrealistic).
# We cannot do error analysis on test data （because no true target value is provided）.

In [None]:
def explain_linear_prediction(df, model, idx2feature, X, y, y_hat, idx_list):
    print('indices:', idx_list)
    for idx in idx_list:
        print ('==============', idx, '================')
        print ('document:', df.iloc[idx]['review'])
        print ('TRUE label:', df.iloc[idx]['label'])
        print ('PRED label:', y_hat[idx])
        
        print ('\nPRED breakdown:')
        print ('\tINTERCEPT', model.intercept_)
        if X[idx, :].nnz == 0:
            print ('\tFEATURE', '[EMPTY]')
        else:
            sp_row = X[idx, :]
            for i in range(sp_row.getnnz()): # looping over a row in sparse matrix 
                feature_value = sp_row.data[i]
                feature_dim = sp_row.indices[i]
                print ('\tFEATURE', idx2feature[feature_dim], ':', feature_value, '*', model.coef_[0][feature_dim])

In [None]:
# construct a dictionary mapping: feature index -> word
idx2feature = dict([(idx, feature) for idx, feature in enumerate(selected_features)])

# look at data with prediction error
error_indices  = [i for i in range(len(valid_y_hat)) if valid_y_hat[i] != valid_y[i]]
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X_selected, valid_y, valid_y_hat, np.random.choice(error_indices, size = 1))