In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# NLP with Trip Advisor Hotel Reviews
## How to classify Text

Check out my website: [My NLP Blog](http://www.itman.solutions/en/blogs/nlp-with-trip-advisor-reviews)

First of all, load the data from the Kaggle TripAdvisor hotel reviews dataset.
Load the data as a pandas DataFrame.

In [None]:
# First load data: read the TripAdvisor hotel reviews CSV from the Kaggle input directory.

df = pd.read_csv("/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv")

Explore the data with `df.head()`, which shows the first 5 rows.

In [None]:
# Preview the first five rows of the raw data.
print(df.head())

We can see that each row has a review and a rating.

Now we will start developing our "classification robot". To score our "robot" correctly, we need to split our data into a training set and a test set.
The training set will contain 15 000 rows and the test set about 5 000.


In [None]:
# Positional split: the first 15,000 rows form the training set, the rest the test set.
train_df = df.iloc[:15000]
test_df = df.iloc[15000:]

With `train_df.describe()` we can get more information about the numerical columns in the DataFrame.

In [None]:
# Print the summary statistics that describe() computes for the training split.
print(train_df.describe())

The ratings range from 1 to 5. The average (or mean) is 3.95 and the median is 4.
The interquartile range is 3 - 5 and the standard deviation is 1.23.
We can also plot the distribution with a histogram.

In [None]:
# Histogram of the training ratings, saved to histogram.png.
ratings = train_df["Rating"]
bins = [1, 2, 3, 4, 5, 6]  # bin edges: each integer rating 1-5 falls into its own bin
fig, ax = plt.subplots()
ax.hist(ratings, bins, rwidth=0.8)
# Label the figure so it can be read on its own.
ax.set(title="Distribution of ratings (training set)", xlabel="Rating", ylabel="Number of reviews")
fig.savefig("histogram.png")

For this I used matplotlib. The histogram shows that 5-star ratings are by far the most common: circa 6 000 of the 15k ratings are 5 stars.
Now we can build a naive algorithm to classify the reviews. Our "robot" will always predict 5 stars.

In [None]:
# Baseline "robot": predict 5 stars for every row of the test split.
amount_ratings = len(test_df)
robots_predictions = [5] * amount_ratings

In [None]:
def accuracy(predictions, real):
    """Return the fraction of predictions that equal the corresponding true value.

    Pairs are formed with zip, so surplus items of the longer sequence are
    ignored; the divisor is always len(predictions).
    """
    hits = sum(1 for guess, truth in zip(predictions, real) if guess == truth)
    return hits / len(predictions)

In [None]:
# Score the always-5 baseline against the true ratings of the test split.
accuracy(robots_predictions, list(test_df["Rating"]))

We achieved an accuracy of 48%, which is pretty good: our robot always predicted 5 stars without knowing anything about the text written.
Now let's try improving our robot so it can predict better.

Therefore our robot will look at the text.

First of all we have to split the text into words. We are going to use nltk for that; it is a very useful library for NLP.

In [None]:
from nltk.tokenize import word_tokenize # we need to import nltk

# Tokenize every review of the full DataFrame (not only train_df): the features
# derived from `words` are stored as columns of `df` further down, so the
# number of reviews must equal the number of rows in `df`.
reviews = list(df["Review"])
words = [word_tokenize(review) for review in reviews]

In [None]:
# Number of commas in the first review.
reviews[0].count(",")

In [None]:
# Per-review counts: tokens, sentences and commas.
# `sentences` has to be computed here; it was referenced without ever being defined.
sentences = [sent_tokenize(review) for review in reviews]
amount_words = [len(tokens) for tokens in words]
amount_sentences = [len(review_sentences) for review_sentences in sentences]
amount_commas = [review.count(",") for review in reviews]

In [None]:
# Comma counts of the first ten reviews.
amount_commas[:10]

In [None]:
# Store the per-review counts as new columns of df.
for column_name, column_values in (
    ("Amount Words", amount_words),
    ("Amount Sentences", amount_sentences),
    ("Commas", amount_commas),
):
    df[column_name] = column_values

In [None]:
#df.drop("Exclamation Marks", axis=1, inplace=True)

In [None]:
# First ten rows, now including the count columns.
df.head(10)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (the review text) are excluded explicitly, because
# DataFrame.corr() raises on them in recent pandas versions.
corrMatrix = df.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(15, 10))
sn.heatmap(corrMatrix, annot=True, ax=ax)  # draw on the axes created above
plt.show()

In [None]:
# Tokens of the first review.
words[0]

In [None]:
# Snapshot of df at this point; later cells copy it to rebuild feature sets from this state.
start_df = df.copy()

In [None]:
# Flatten the per-review token lists into one long list of tokens.
all_words = [token for review_tokens in words for token in review_tokens]

In [None]:
# Distinct tokens across all reviews (order is whatever set iteration yields).
all_unique_words = list(set(all_words))

In [None]:
# Work on the first 50,000 tokens only and show how many distinct tokens they contain.
word_sample = all_words[:50_000]
unique_words_sample = list(set(word_sample))
len(unique_words_sample)

In [None]:
# Count how often each sampled word occurs.
# One Counter pass replaces calling list.count() once per unique word, which
# rescanned the whole sample every time.
sample_counts = Counter(word_sample)
count_word = {word: sample_counts[word] for word in unique_words_sample}

In [None]:
# Words ordered from most to least frequent in the sample.
ranked_items = sorted(count_word.items(), key=lambda item: item[1], reverse=True)
most_used_words = [word for word, _count in ranked_items]

In [None]:
# One set of distinct tokens per review, for fast membership tests.
unique_words = [set(review_tokens) for review_tokens in words]

In [None]:
# Rebuild the vocabulary statistics from the first 50,000 tokens and keep the
# 1,000 most frequent words as features.
word_sample = all_words[:50_000]
unique_words_sample = list(set(word_sample))
# One Counter pass instead of list.count() per unique word (which rescanned
# the whole sample for every word).
sample_counts = Counter(word_sample)
count_word = {word: sample_counts[word] for word in unique_words_sample}
most_used_words = [
    word
    for word, _count in sorted(count_word.items(), key=lambda item: item[1], reverse=True)
]
unique_words = [set(review_tokens) for review_tokens in words]
important_words = most_used_words[:1_000]

In [None]:
# Binary bag-of-words features: 1 if the review contains the word, else 0.
# The whole block of columns is filled in one matrix and attached with a single
# concat; inserting them one at a time fragments the DataFrame.
word_flags = np.zeros((len(unique_words), len(important_words)), dtype=np.int64)
for column_index, unique_word in enumerate(important_words):
    word_flags[:, column_index] = [unique_word in review_words for review_words in unique_words]
word_flags_df = pd.DataFrame(
    word_flags,
    index=df.index,
    columns=["amount " + unique_word for unique_word in important_words],
)
# Drop same-named columns first so re-running the cell replaces them instead of duplicating them.
df = pd.concat([df.drop(columns=word_flags_df.columns, errors="ignore"), word_flags_df], axis=1)

# Could count words instead of 0 1

In [None]:
# Preview df with the word-indicator columns.
df.head()

In [None]:
# Subtract 1 from every rating so the labels start at 0.
# Note: this overwrites the column, so running the cell again shifts it again.
df["Rating"] = df["Rating"] - 1

In [None]:
# Summary statistics of the shifted rating column.
df["Rating"].describe()

In [None]:
# Target values: the rating column as a plain Python list.
y = list(df["Rating"])

In [None]:
# Copy df so that the column drops applied to train_df afterwards leave df untouched.
train_df = df.copy()

In [None]:
# Remove the target and the raw text so only feature columns remain.
train_df.drop(columns=["Rating", "Review"], inplace=True)

In [None]:
# Preview the feature-only frame.
train_df.head()

In [None]:
# Feature matrix: the values of all remaining columns of train_df.
X = train_df.values

In [None]:
from sklearn import model_selection

# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the scores computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out split.
model.score(X_test, y_test)

In [None]:
# Predictions of the current model for the held-out rows.
y_predict = model.predict(X_test)

In [None]:
# Compare the first ten predictions with the first ten true labels.
print(y_predict[:10])
print(y_test[:10])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=10, fitted on the training split;
# the last line shows its score on the held-out rows.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Inspect the training feature matrix.
X_train

In [None]:
# Re-wrap the splits as NumPy arrays (feature matrices row by row, labels directly).
X_train = np.array([np.array(row) for row in X_train])
y_train = np.array(y_train)
X_test = np.array([np.array(row) for row in X_test])
y_test = np.array(y_test)

In [None]:
# Inspect the training labels.
y_train

In [None]:
from tensorflow import keras
import tensorflow as tf

# Minimal network: one layer as wide as the feature vector (no activation),
# followed by one output logit per rating class. The labels are 0-4 after the
# shift, so the output layer has 5 units, matching the later models.
model = keras.Sequential()
model.add(keras.layers.Dense(len(train_df.columns)))
model.add(keras.layers.Dense(5))


In [None]:
# Integer class labels + raw logits -> sparse categorical cross-entropy with from_logits=True.
model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Single dummy sample: one row with the value 100 in every feature position.
test_x = np.array([np.full(len(train_df.columns), 100)])

In [None]:
# Single dummy label (class 1) with shape (1, 1).
test_y = np.array([[1]])

In [None]:
# Fit the compiled network on the training split using fit()'s default arguments.
model.fit(X_train, y_train)

In [None]:
from tensorflow import keras
import tensorflow as tf

# Deeper network: input-sized layer without activation, tanh(255), relu(255), 5 logits.
# tanh and then relu has the best result
model = keras.Sequential([
    keras.layers.Dense(len(train_df.columns)),
    keras.layers.Dense(255, activation="tanh"),
    keras.layers.Dense(255, activation="relu"),
    keras.layers.Dense(5),
])
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
# Train for 5 epochs and report validation metrics on the held-out split.
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

## same result with tanh and sigmoid

In [None]:
# Small scratch dict used to try out dict.get with a default.
test = dict(hi=2)

In [None]:
# dict.get returns the supplied default (0) when the key is missing.
test.get("haha", 0)

In [None]:
def create_word_count_dict(words):
    """Return a dict mapping each distinct item of `words` to how often it occurs.

    Keys appear in order of first occurrence.
    """
    word_count = {}
    for token in words:
        word_count[token] = word_count.get(token, 0) + 1
    return word_count

In [None]:
# Word counts of the first review.
create_word_count_dict(words[0])

In [None]:
# One word-count dict per review.
word_counts = [create_word_count_dict(review_tokens) for review_tokens in words]

In [None]:
# Rebuild the vocabulary statistics, this time from the first 100,000 tokens.
word_sample = all_words[:100_000]
unique_words_sample = list(set(word_sample))
# One Counter pass instead of list.count() per unique word (which rescanned
# the whole sample for every word).
sample_counts = Counter(word_sample)
count_word = {word: sample_counts[word] for word in unique_words_sample}
most_used_words = [
    word
    for word, _count in sorted(count_word.items(), key=lambda item: item[1], reverse=True)
]
unique_words = [set(review_tokens) for review_tokens in words]


In [None]:
# Keep the 2,000 most frequent words as features.
important_words = most_used_words[:2_000]

In [None]:
# Start again from the start_df snapshot rather than from the current df.
df_copy = start_df.copy()

In [None]:
# Count features: how often each important word occurs in each review.
# The whole block of columns is filled in one matrix and attached with a single
# concat; inserting them one at a time fragments the DataFrame.
word_count_matrix = np.zeros((len(word_counts), len(important_words)), dtype=np.int64)
for column_index, unique_word in enumerate(important_words):
    word_count_matrix[:, column_index] = [
        review_counts.get(unique_word, 0) for review_counts in word_counts
    ]
word_count_df = pd.DataFrame(
    word_count_matrix,
    index=df_copy.index,
    columns=["amount " + unique_word for unique_word in important_words],
)
# Drop same-named columns first so re-running the cell replaces them instead of duplicating them.
df_copy = pd.concat(
    [df_copy.drop(columns=word_count_df.columns, errors="ignore"), word_count_df], axis=1
)

In [None]:
# Preview df_copy with the word-count columns.
df_copy.head()

In [None]:
# Build the target (rating minus 1) and the feature matrix from the word-count frame.
train_df = df_copy.copy()
train_df["Rating"] = train_df["Rating"] - 1
print(train_df["Rating"].describe())
y = train_df["Rating"]
# Remove the target and the raw text so only feature columns remain.
train_df.drop(columns=["Rating", "Review"], inplace=True)
X = train_df.values

In [None]:
# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the scores computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Re-wrap the splits as NumPy arrays for Keras.
X_train = np.array([np.array(row) for row in X_train])
y_train = np.array(y_train)
X_test = np.array([np.array(row) for row in X_test])
y_test = np.array(y_test)

In [None]:
from tensorflow import keras
import tensorflow as tf

# Same architecture as before, now trained on the word-count features:
# input-sized layer without activation, tanh(255), relu(255), 5 logits.
# tanh and then relu has the best result
model = keras.Sequential([
    keras.layers.Dense(len(train_df.columns)),
    keras.layers.Dense(255, activation="tanh"),
    keras.layers.Dense(255, activation="relu"),
    keras.layers.Dense(5),
])
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
# Train for 10 epochs and report validation metrics on the held-out split.
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Next, we use pairs of two words next to each other (bigrams) as additional features.

In [None]:
# Token lists of the first ten reviews.
words[:10]

In [None]:
# NOTE(review): this empty list is not read by any visible cell; the name is
# later reused as a loop variable.
two_words = []

In [None]:
def get_all_two_words(words):
    """Return the list of adjacent pairs (words[i-1], words[i]) in order.

    Sequences with fewer than two items yield an empty list.
    """
    return [(words[idx - 1], words[idx]) for idx in range(1, len(words))]
        

In [None]:
# Join the first review's tokens with single spaces.
" ".join(words[0])

In [None]:
# Adjacent word pairs of the first review.
get_all_two_words(words[0])

In [None]:
# Adjacent word pairs for every review.
two_words_ls = [get_all_two_words(review_tokens) for review_tokens in words]

In [None]:
# Class distribution. `y` holds the shifted labels 0-4 at this point, so
# iterate over those and print the matching star rating (label + 1).
# Counting 1..5 directly would skip class 0 and always report zero for 5.
labels = list(y)  # convert once instead of once per class
for label in range(5):
    print(str(label + 1) + ":" + str(labels.count(label)))

In [None]:
# Flatten the per-review pair lists into one long list of word pairs.
all_two_words = [pair for review_pairs in two_words_ls for pair in review_pairs]

In [None]:
# NOTE(review): the following cell reassigns new_df_copy from df_copy, so this value is replaced.
new_df_copy = start_df.copy()

In [None]:
# Start from a copy of df_copy; the word-pair columns are added to this frame.
new_df_copy = df_copy.copy()

In [None]:
# Frequency ranking of word pairs, based on the first 100,000 pairs.
two_words_sample = all_two_words[:100_000]
unique_two_words_sample = list(set(two_words_sample))
print(len(unique_two_words_sample))
# One Counter pass instead of list.count() per unique pair (which rescanned
# the whole sample for every pair).
pair_counts = Counter(two_words_sample)
count_word = {pair: pair_counts[pair] for pair in unique_two_words_sample}
most_used_two_words = [
    pair
    for pair, _count in sorted(count_word.items(), key=lambda item: item[1], reverse=True)
]

In [None]:
# Keep the 5,000 most frequent word pairs as features.
important_two_words= most_used_two_words[:5_000]

In [None]:
# The twenty most frequent word pairs.
important_two_words[:20]

In [None]:
def create_two_word_count_dict(words):
    """Return a dict mapping each distinct item of `words` to how often it occurs.

    The items only need to be hashable; the notebook passes (word, word) tuples.
    Keys appear in order of first occurrence.
    """
    word_count = {}
    for pair in words:
        word_count[pair] = word_count.get(pair, 0) + 1
    return word_count

In [None]:
# One pair-count dict per review.
two_words_count_ls = [create_two_word_count_dict(review_pairs) for review_pairs in two_words_ls]

In [None]:
# Count features: how often each important word pair occurs in each review.
# The whole block of columns is filled in one matrix and attached with a single
# concat; inserting them one at a time fragments the DataFrame.
pair_count_matrix = np.zeros((len(two_words_count_ls), len(important_two_words)), dtype=np.int64)
for column_index, unique_word in enumerate(important_two_words):
    pair_count_matrix[:, column_index] = [
        review_counts.get(unique_word, 0) for review_counts in two_words_count_ls
    ]
pair_count_df = pd.DataFrame(
    pair_count_matrix,
    index=new_df_copy.index,
    columns=["amount " + str(unique_word) for unique_word in important_two_words],
)
# Drop same-named columns first so re-running the cell replaces them instead of duplicating them.
new_df_copy = pd.concat(
    [new_df_copy.drop(columns=pair_count_df.columns, errors="ignore"), pair_count_df], axis=1
)

In [None]:
# Preview new_df_copy with the word-pair count columns.
new_df_copy.head()

In [None]:
# Build the target (rating minus 1) and the feature matrix from the frame with word-pair counts.
train_df = new_df_copy.copy()
train_df["Rating"] = train_df["Rating"] - 1
print(train_df["Rating"].describe())
y = train_df["Rating"]
# Remove the target and the raw text so only feature columns remain.
train_df.drop(columns=["Rating", "Review"], inplace=True)
X = train_df.values

In [None]:
# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the scores computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Re-wrap the splits as NumPy arrays for Keras.
X_train = np.array([np.array(row) for row in X_train])
y_train = np.array(y_train)
X_test = np.array([np.array(row) for row in X_test])
y_test = np.array(y_test)

In [None]:
# Disabled experiment: the model code below is wrapped in a string literal, so
# none of it is executed (two 1,000-unit hidden layers, 100 epochs).
"""from tensorflow import keras
import tensorflow as tf
model = keras.Sequential() 
model.add(keras.layers.Dense(len(train_df.columns)))
model.add(keras.layers.Dense(1_000, activation = "tanh"))
model.add(keras.layers.Dense(1_000, activation="relu"))
# tanh and then relu has the best result
model.add(keras.layers.Dense(5))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))
"""