In [1]:
import pandas as pd
import numpy as np
import os
import csv

# for EDA
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline   
from collections import Counter

# for visualizations
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# for NLP
from textatistic import Textatistic
import spacy
from spacy import displacy

# for Statistics
from scipy import stats

In [2]:
df = pd.read_csv("train_reviews_data", index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 194420 entries, 135890 to 141680
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   comments         194420 non-null  object
 1   rating_category  194420 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.4+ MB


In [6]:
nlp = spacy.load('en_core_web_lg')

In [7]:
# lemmatization function
def lemmatize(text):
    # create nlp object
    nobj = nlp(text)
    
    # stopword list from spacy
    stopwords = spacy.lang.en.stop_words.STOP_WORDS
    
    # simple lemmas
    lemmas = [token.lemma_ for token in nobj]
    
    # lowercase; remove non-alphabetic characters & overly used words, i.e., stop words
    a_lemmas = [lemma.lower() for lemma in lemmas
                if lemma.isalpha() 
                and lemma not in stopwords]
    
    # strings
    strings = " ".join(a_lemmas)
    
    return strings

In [8]:
lemmatize(" ! i don't, won't,   can't not use    NLP 27x maaaah?")

'i use nlp maaaah'

In [9]:
#df["lemmas"] = df.comments.apply(lemmatize)

In [10]:
#df.to_csv("lemmatized_reviews.csv")

In [11]:
df_lemmatized = pd.read_csv("lemmatized_reviews.csv")
df_lemmatized.head()

Unnamed: 0.1,Unnamed: 0,comments,rating_category,lemmas
0,135890,"La estancia ha sido de 10, la limpieza impecab...",3,la estancia ha sido de la limpieza impecable u...
1,239898,"Robert is an excelent host, who was always ava...",5,robert excelent host available kind need apart...
2,105455,My fiance and I booked Elena's place for ourse...,4,fiance i book elena place fiance parent saint ...
3,19032,I made the reservation for my uncles and they ...,2,i reservation uncle enjoy service service maev...
4,153857,We spent three nights in Dublin and really had...,5,spend night dublin great time carrie ann lovel...


In [12]:
from langdetect import detect
print(detect(df_lemmatized['lemmas'].iloc[0]))
print(detect(df_lemmatized['lemmas'].iloc[1]) == 'en')

es
True


In [13]:
print(len(df_lemmatized))

194420


In [15]:
for index, row in df_lemmatized.iterrows():
    txt =  row["lemmas"]
    is_eng = False
    if(type(txt) == str):
        is_eng = (detect(txt) == 'en')
          
    if is_eng == False:
        df_lemmatized.drop(index, inplace=True)

In [16]:
df_lemmatized['lemmas'].head()

1    robert excelent host available kind need apart...
2    fiance i book elena place fiance parent saint ...
3    i reservation uncle enjoy service service maev...
4    spend night dublin great time carrie ann lovel...
5    john nice available room clean cosy miss cook ...
Name: lemmas, dtype: object

In [17]:
df_lemmatized.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159539 entries, 1 to 194419
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       159539 non-null  int64 
 1   comments         159539 non-null  object
 2   rating_category  159539 non-null  int64 
 3   lemmas           159539 non-null  object
dtypes: int64(2), object(2)
memory usage: 6.1+ MB


In [18]:
df_lemmatized.to_csv("lemmatized_reviews.csv")

## Word clouds for each category

In [19]:
# wordcloud function for readability
def word_cloud(column):
    
    # Combine documents
    text = " ".join( _ for _ in column)
    
    # Sanity check
    print ("There are", len(text) ,"words")

    # Generate image
    plt.figure(figsize=(12, 10))
    wordcloud = WordCloud(min_font_size=5, 
                          max_font_size=60, 
                          max_words=20, 
                          background_color="white").generate(text)

    # Display the generated image
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [20]:
df_lemmatized['lemmas'] = df_lemmatized['lemmas'].map(str)

In [None]:
word_cloud(df_lemmatized[df_lemmatized.rating_category == 1].lemmas)

There are 763459 words


# Here is where the analysis really starts

In [4]:
df = pd.read_csv("lemmatized_reviews.csv", index_col=0)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159539 entries, 1 to 194419
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       159539 non-null  int64 
 1   comments         159539 non-null  object
 2   rating_category  159539 non-null  int64 
 3   lemmas           159539 non-null  object
dtypes: int64(2), object(2)
memory usage: 6.1+ MB


In [6]:
df = df.iloc[: , 2:]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159539 entries, 1 to 194419
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   rating_category  159539 non-null  int64 
 1   lemmas           159539 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [8]:
df["length"] =df["lemmas"].str.len()
df["length"].mean()

141.81959270147112

In [9]:
df["len_cat"] = pd.cut(df["length"],
                      bins=[0., 10, 20, 50, 100, 141, 200, np.inf],
                      labels=[1, 2, 3, 4, 5, 6, 7])

In [10]:
df["len_cat"].value_counts()

4    44959
7    33072
5    28299
6    26949
3    22582
2     3128
1      550
Name: len_cat, dtype: int64

In [11]:
insufficient_information = [1, 2, 3]

df_cleaned = df[~df['len_cat'].isin(insufficient_information)]

In [13]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133279 entries, 1 to 194419
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   rating_category  133279 non-null  int64   
 1   lemmas           133279 non-null  object  
 2   length           133279 non-null  int64   
 3   len_cat          133279 non-null  category
dtypes: category(1), int64(2), object(1)
memory usage: 4.2+ MB


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

X_train = df_cleaned["lemmas"]
y_train = df_cleaned["rating_category"]

In [16]:
vectorizer = CountVectorizer(lowercase = False)
X_train = vectorizer.fit_transform(X_train)

In [17]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
grid = {'n_estimators' : [1, 30], 
        'max_features' : ['auto', 'log2'],  
        'max_depth' : [ 3, 5, 7, 10, 12, 15], 
        'criterion' : ['gini', 'entropy']}


In [29]:
from sklearn.model_selection import GridSearchCV
def best_model(model, grid, X_train, y_train):
    gs = GridSearchCV(model, grid)
    gs.fit(X_train, y_train)
    print("CV PARAMS: ", gs.best_params_)
    print("BEST MODEL: ", gs.best_estimator_)
    best_model = gs.best_estimator_
    return best_model


In [30]:
best_mod = best_model(model, grid, X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(


CV PARAMS:  {'criterion': 'gini', 'max_depth': 15, 'max_features': 'auto', 'n_estimators': 30}
BEST MODEL:  RandomForestClassifier(max_depth=15, max_features='auto', n_estimators=30,
                       random_state=0)


In [31]:
from sklearn.metrics import accuracy_score
y_pred_train = best_mod.predict(X_train)
acc = accuracy_score(y_pred_train, y_train)
print(round(acc*100, 2), "%", sep="")

40.29%


In [32]:
from sklearn.dummy import DummyClassifier


clf = DummyClassifier(strategy='stratified')
clf.fit(X_train, y_train)
y_pred_dummy = clf.predict(X_train)
acc = accuracy_score(y_pred_dummy, y_train)
print(round(acc*100, 2), "%", sep="")

24.65%
