# Task for Today  

***

## Car Review Rating Prediction  
  
Given *reviews of various cars*, let's try to predict the **rating** attached to each review.  
  
We will use a TensorFlow/Keras neural network to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from sklearn.metrics import r2_score

In [None]:
# Load the car review table from CSV into a DataFrame.
data = pd.read_csv('../input/edmunds-car-review/Review.csv')

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Keep a random 30% of the rows (fixed seed so the subset is reproducible)
# and renumber the index from 0.
data = data.sample(frac=0.3, random_state=1).reset_index(drop=True)

In [None]:
# Show the down-sampled DataFrame as the cell output.
data

In [None]:
# Print a per-column summary (dtypes and non-null counts).
data.info()

# Preprocessing

In [None]:
# Count missing values in each column.
data.isna().sum()

In [None]:
def process_texts(texts):
    """Clean a Series of raw text ahead of tokenization.

    For every entry: backslash-n sequences are replaced by a space, runs of
    digits are removed, and English stop words (NLTK list, compared
    case-insensitively) are dropped. The surviving words keep their original
    casing and are re-joined with single spaces.

    Args:
        texts: pandas Series whose entries are strings. ``re.sub`` is applied
            to every entry, so missing values must be filled or dropped by
            the caller first.

    Returns:
        A new Series of cleaned strings; the input Series is not modified.
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every word of every text.
    stop_words = set(stopwords.words('english'))

    def clean(text):
        # r'\\n' matches a backslash followed by the letter n, i.e. an
        # escaped newline left in the text. Real newline characters are
        # plain whitespace and are absorbed by split() below.
        text = re.sub(r'\\n', ' ', text)
        text = re.sub(r'\d+', '', text)
        return " ".join(word for word in text.split() if word.lower() not in stop_words)

    # One pass over the Series instead of three; apply() returns a new
    # Series, so no explicit copy is needed.
    return texts.apply(clean)


def get_sequences(texts, num_words=5000):
    """Convert texts into a padded matrix of integer token ids.

    A Keras ``Tokenizer`` is fitted on ``texts`` and the same texts are then
    converted to id sequences, so the vocabulary comes entirely from what is
    passed in. Every sequence is padded at the end to the length of the
    longest one; that length is printed.

    Args:
        texts: Iterable of strings (for example a pandas Series).
        num_words: Vocabulary cap handed to ``Tokenizer``. Defaults to 5000,
            the value that used to be hard-coded. Any ``Embedding`` layer fed
            with these ids needs an ``input_dim`` of at least this value.

    Returns:
        The array produced by ``pad_sequences``, one row per input text.

    Raises:
        ValueError: If ``texts`` is empty (there is no longest sequence).
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    max_seq_length = max(map(len, sequences))
    print("Max sequence length:", max_seq_length)

    return pad_sequences(sequences, maxlen=max_seq_length, padding='post')


def onehot_encode(df, column):
    """Swap a categorical column for its one-hot indicator columns.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame without ``column`` and with one indicator column per
        distinct value, named ``<column>_<value>``, appended on the right.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(columns=column)
    return pd.concat([remaining, indicators], axis=1)


def encode_dates(df, column):
    """Replace a date column with numeric year, month and day columns.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Name of the column holding the dates.

    Returns:
        A new DataFrame without ``column`` and with ``ReviewYear``,
        ``ReviewMonth`` and ``ReviewDay`` appended. Values that cannot be
        parsed as dates become NaN in all three columns.
    """
    df = df.copy()

    # errors='coerce' turns unparseable entries into NaT instead of raising.
    dates = pd.to_datetime(df[column], errors='coerce')

    # The .dt accessor extracts each component for the whole column at once
    # rather than calling a Python lambda per row.
    df['ReviewYear'] = dates.dt.year
    df['ReviewMonth'] = dates.dt.month
    df['ReviewDay'] = dates.dt.day

    return df.drop(column, axis=1)

In [None]:
def preprocess_inputs(df):
    """Build train/test inputs for the rating model from the raw review table.

    Steps, in order: drop ``Reviewer``; drop rows with no ``Review``; replace
    missing titles with an empty string; clean and tokenize ``Title`` and
    ``Review``; expand ``Date`` into year/month/day features, filling gaps
    with the column mean; one-hot encode ``Company`` and ``Model``; split
    70/30 with a fixed seed; standardize the tabular features with a scaler
    fitted on the training rows only.

    Args:
        df: DataFrame containing the columns ``Reviewer``, ``Review``,
            ``Title``, ``Date``, ``Company``, ``Model`` and ``Rating``. Any
            other column is passed to ``StandardScaler`` as a feature. The
            caller's frame is not modified.

    Returns:
        Tuple ``(titles_train, titles_test, reviews_train, reviews_test,
        X_train, X_test, y_train, y_test)``. The title/review items are the
        padded sequence arrays, ``X_*`` are scaled DataFrames and ``y_*`` are
        the ``Rating`` values.
    """
    # NOTE(review): the tokenizers (inside get_sequences) and the date means
    # below are computed on all rows before the train/test split, so the test
    # rows influence those steps. Confirm whether that is acceptable.

    # drop() returns a new frame, so the caller's df stays untouched.
    df = df.drop('Reviewer', axis=1)

    # Remove rows without reviews. dropna filters row by row, so it stays
    # correct even if the incoming index contains duplicate labels (dropping
    # by index label would remove every row sharing a label).
    df = df.dropna(subset=['Review']).reset_index(drop=True)

    # Fill missing titles with an empty string
    df['Title'] = df['Title'].fillna("")

    # Process Title and Review columns
    df['Title'] = process_texts(df['Title'])
    df['Review'] = process_texts(df['Review'])

    # Get title and review sequences and drop original columns
    titles = get_sequences(df['Title'])
    reviews = get_sequences(df['Review'])
    df = df.drop(['Title', 'Review'], axis=1)

    # Extract date features
    df = encode_dates(df, column='Date')

    # Fill missing date values with column means
    for column in ['ReviewYear', 'ReviewMonth', 'ReviewDay']:
        df[column] = df[column].fillna(df[column].mean())

    # One-hot encode Company and Model columns
    df = onehot_encode(df, column='Company')
    df = onehot_encode(df, column='Model')

    # Split df into X and y
    y = df['Rating']
    X = df.drop('Rating', axis=1)

    # Train-test split: all four arrays are split together so that rows stay
    # aligned across titles, reviews, tabular features and targets.
    titles_train, titles_test,\
        reviews_train, reviews_test,\
        X_train, X_test,\
        y_train, y_test = train_test_split(titles, reviews, X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X using statistics from the training rows only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return titles_train, titles_test, reviews_train, reviews_test, X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing pipeline on the sampled data and unpack the train/test
# splits of title sequences, review sequences, tabular features and ratings.
titles_train, titles_test, reviews_train, reviews_test, X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Shape of the training title sequences.
titles_train.shape

In [None]:
# Shape of the training review sequences.
reviews_train.shape

In [None]:
# Shape of the scaled tabular training features.
X_train.shape

In [None]:
# Shape of the training targets.
y_train.shape

# Modeling

In [None]:
# Branch 1: scaled tabular features from preprocess_inputs, passed through
# two 64-unit ReLU layers.
X_inputs = tf.keras.Input(shape=(X_train.shape[1],), name='X_inputs')
dense1 = tf.keras.layers.Dense(64, activation='relu', name='dense1')(X_inputs)
dense2 = tf.keras.layers.Dense(64, activation='relu', name='dense2')(dense1)


# Branch 2: title token ids -> 64-dimensional embeddings, flattened.
# input_dim=5000 matches the num_words cap used by the Tokenizer in
# get_sequences. input_length is not passed: the Input layer already fixes
# the sequence length, and Keras 3 deprecates that argument.
title_inputs = tf.keras.Input(shape=(titles_train.shape[1],), name='title_inputs')
title_embedding = tf.keras.layers.Embedding(
    input_dim=5000,
    output_dim=64,
    name='title_embedding'
)(title_inputs)
title_flatten = tf.keras.layers.Flatten(name='title_flatten')(title_embedding)


# Branch 3: review token ids, handled the same way as the titles.
review_inputs = tf.keras.Input(shape=(reviews_train.shape[1],), name='review_inputs')
review_embedding = tf.keras.layers.Embedding(
    input_dim=5000,
    output_dim=64,
    name='review_embedding'
)(review_inputs)
review_flatten = tf.keras.layers.Flatten(name='review_flatten')(review_embedding)


# Merge the three branches and regress a single value (the rating).
concat = tf.keras.layers.concatenate([dense2, title_flatten, review_flatten], name='concatenate')

outputs = tf.keras.layers.Dense(1, activation='linear')(concat)


model = tf.keras.Model(inputs=[X_inputs, title_inputs, review_inputs], outputs=outputs)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()
tf.keras.utils.plot_model(model)

# Training

In [None]:
# Regression setup: Adam optimizer minimizing mean squared error.
model.compile(
    optimizer='adam',
    loss='mse'
)

# Inputs are passed in the same order as the model's `inputs` list
# (tabular features, titles, reviews). A fifth of the training samples is
# held out for validation. epochs=100 is only an upper bound: training stops
# once val_loss has not improved for 5 consecutive epochs, and the weights
# from the best epoch are restored.
history = model.fit(
    [X_train, titles_train, reviews_train],
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
    ]
)

# Results

In [None]:
# Score the trained model on the held-out test split: `loss` is the compiled
# loss (MSE) and `r2` is the coefficient of determination of the predictions
# against the true ratings.
loss = model.evaluate([X_test, titles_test, reviews_test], y_test, verbose=0)
r2 = r2_score(y_test, model.predict([X_test, titles_test, reviews_test], verbose=0))

# Report both metrics to five decimal places.
print("     Test Loss: {:.5f}".format(loss))
print("Test R^2 Score: {:.5f}".format(r2))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/3LfPuT1-FL4