In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input folder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

In [2]:
# Locate the folder that actually contains the competition files instead of
# relying on `dirname` left over from the file-listing loop (that variable is
# simply the last directory os.walk happened to visit).
data_dir = next(
    (root for root, _, files in os.walk('/kaggle/input') if 'goodreads_train.csv' in files),
    None,
)
if data_dir is None:
    raise FileNotFoundError("goodreads_train.csv was not found under /kaggle/input")

train_csv = pd.read_csv(os.path.join(data_dir, 'goodreads_train.csv'))
test_csv = pd.read_csv(os.path.join(data_dir, 'goodreads_test.csv'))

train_csv.head()

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 and float32, falling back to float64. Columns
    whose dtype is not one of the listed numeric dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory usage after the conversion and the
        percentage saved. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The range check is inclusive, so a column whose maximum is exactly 127
    still fits in int8. Downcasting floats to float16 keeps the range but
    reduces precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, limits_of, fallback = (np.int8, np.int16, np.int32, np.int64), np.iinfo, None
        else:
            candidates, limits_of, fallback = (np.float16, np.float32), np.finfo, np.float64

        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # so such a column falls through to the fallback.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            if fallback is not None:
                df[col] = df[col].astype(fallback)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero starting size.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [4]:
# reduce_mem_usage downcasts the frame it receives in place and returns it.
# Keep one name per frame: the second call used to overwrite `train` with the
# test frame.
train = reduce_mem_usage(train_csv)
test = reduce_mem_usage(test_csv)

# First impressions
The training dataframe holds about 900k reviews of many different books. From these, we may be able to predict the rating (from 1 to 5) of each review in the test dataframe.

In [5]:
# Keep only the first 30,000 rows, then show the frame's size, schema and summary stats.
train_csv = train_csv.iloc[:30000]
n_rows, n_columns = train_csv.shape
print(f'n of rows : {n_rows} n of columns: {n_columns}')
train_csv.info()
train_csv.describe()

# Users behavior
We need to check whether there are users who post multiple reviews, and how many reviews they post. For these users, we can look for patterns in their reviews - some users may mostly post negative reviews, while others seem to have a good impression of books in general.

The conclusion is that each of the top 10 users by number of reviews has 990+ reviews.

In [6]:
# Count reviews per user once and reuse the result for every summary line.
reviews_per_user = train_csv['user_id'].value_counts()

print(reviews_per_user.head(10))
print(len(reviews_per_user))
for limit in (10, 50, 100, 500):
    users_below_limit = sum(reviews_per_user < limit)
    print(f'total users with less than {limit} reviews: {users_below_limit}')

In [7]:
import matplotlib.pyplot as plt

# Average rating given by each user, followed by summary statistics of those averages.
mean_rating_by_user = train_csv['rating'].groupby(train_csv['user_id']).mean()

mean_rating_by_user.describe()

# What do the best reviews have in common?

In [8]:
# Overall average rating, then the number of reviews per rating value as a bar chart.
overall_mean_rating = train_csv['rating'].mean()
print('The mean of all reviews is a rating of: '+str(overall_mean_rating))

rating_counts = train_csv['rating'].value_counts()
rating_values = rating_counts.keys()
print(rating_values)

fig = plt.figure(figsize=(10, 5))
plt.bar(rating_values, rating_counts, width=0.4, color='maroon')
plt.show()


In [9]:
# Total number of votes received by reviews of each rating value, as a table and a bar chart.
votes_by_rating = train_csv['n_votes'].groupby(train_csv['rating']).sum()
print(votes_by_rating)

fig = plt.figure(figsize=(10, 5))
rating_values = votes_by_rating.keys()
plt.bar(rating_values, votes_by_rating, width=0.4, color='maroon')
plt.show()

# How do users treat the same book?

In [10]:
# Sum of the ratings each book received (one entry per book_id).
numbers_by_book = train_csv['rating'].groupby(train_csv['book_id']).sum()


# Treating the reviews text

In [11]:
# Split every review on whitespace; each dict maps the row position to its list of tokens.
review_text_by_word = dict(enumerate(text.split() for text in train_csv['review_text']))
review_text_by_word_test = dict(enumerate(text.split() for text in test_csv['review_text']))


In [12]:
def get_bow(el):
    """Build a word-frequency table for a tokenised text.

    The tokens are first passed through ``clean_bow`` (lower-casing and
    garbage-word removal), then counted.

    Parameters
    ----------
    el : iterable of str
        Tokens of one text.

    Returns
    -------
    pandas.DataFrame
        Columns ``Word`` and ``Frequency``, sorted by ``Frequency`` descending.
    """
    # Local import: the notebook only imports `collections` in a later cell.
    from collections import Counter

    # Counter tallies in a single pass and keeps first-occurrence order,
    # replacing a list.count() call per word (quadratic in the text length).
    word_counts = Counter(clean_bow(el))

    df_bow = pd.DataFrame({'Word': list(word_counts.keys()),
                           'Frequency': list(word_counts.values())})
    return df_bow.sort_values(by='Frequency', ascending=False)


def clean_bow(el):
    """Return the tokens of ``el`` lower-cased, without the garbage words.

    Parameters
    ----------
    el : iterable of str
        Tokens of one text.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, excluding every token
        that appears in the garbage-word list below.
    """
    bag_of_garbage = ['-', 'a', 'about', 'and', 'as', 'at', 'book', 'by', 'for', 'i', 'in',
                      'is', 'it', 'of', 'the', 'that', 'this', 'to', 'was']

    lowered_tokens = (token.lower() for token in el)
    return [token for token in lowered_tokens if token not in bag_of_garbage]
    

#### Cleaning data from garbage words

In [13]:
# Strip garbage words from BOTH token dictionaries. The sentiment heuristic
# looks at adjacent words (e.g. "not" followed by "good"), so cleaning only the
# train tokens would make the feature behave differently on train and test.
for tokens_by_review in (review_text_by_word, review_text_by_word_test):
    # Only values of existing keys are replaced, so the dict size never
    # changes while it is being iterated.
    for review_key, tokens in tokens_by_review.items():
        tokens_by_review[review_key] = clean_bow(tokens)


In [14]:
#check if there is good words in it

def get_positive_bow(el):
    """Classify a tokenised review as positive (1) or negative (0) with a keyword heuristic.

    Every token is compared lower-cased and scored as follows:

    * a negator ("didn't", "don't", "no", "not") directly followed by a
      positive word adds 10 to the negative score; a negator followed by
      anything else (or ending the review) adds nothing;
    * a positive word adds 1 to the positive score;
    * a negative word adds 3 to the negative score.

    Parameters
    ----------
    el : sequence of str
        Tokens of one review, in order.

    Returns
    -------
    int
        0 when the negative score is strictly greater than the positive
        score, otherwise 1. Positive is the default (also for an empty
        review) because most books are well rated.
    """
    positive_bow = {'awesome', 'beautiful', 'enjoy', 'enjoyed', 'enjoying', 'fascinating',
                    'fantastic', 'favorite', 'favorites',
                    'gold', 'good', 'great', 'happy', 'interesting',
                    'like', 'love', 'loved', 'nice', 'recommended'}
    # 'horible' is kept to still catch that misspelling in reviews; the
    # correctly spelled 'horrible' was missing and is added.
    negative_bow = {'1', '2', 'annoying', 'bad', 'bored', 'boring', "couldn't", 'disappoint',
                    'disappointed', 'disappointment', 'horible', 'horrible', 'inaccurate',
                    'ruin', 'ruined', 'shit', 'stopped', 'sucks', 'terrible', 'tried', "wasn't"}
    negative_bow_to_connect = {"didn't", "don't", 'no', 'not'}
    # NOTE(review): a further list ('care', 'connect', 'finish', 'interest',
    # 'much', 'need', 'sure', 'work', 'rating') was defined here but never
    # consulted; it is kept only as this note.

    positive_counter = 0
    negative_counter = 0
    last_index = len(el) - 1

    for i, word in enumerate(el):
        word = word.lower()
        if word in negative_bow_to_connect:
            # Compare positions, not word values: a review that also ENDS with
            # the same negator must not disable the look-ahead for earlier ones.
            if i < last_index and el[i + 1].lower() in positive_bow:
                negative_counter += 10
        elif word in positive_bow:
            positive_counter += 1
        elif word in negative_bow:
            negative_counter += 3

    return 0 if negative_counter > positive_counter else 1

# Score every tokenised review and store the result as a whole column.
# Whole-column assignment replaces the element-wise `df[col][i] = value`
# pattern, which is chained indexing: slow, warns with SettingWithCopy, and
# does not write to the frame when pandas copy-on-write is active.
train_csv['text_category'] = [get_positive_bow(tokens) for tokens in review_text_by_word.values()]
test_csv['text_category'] = [get_positive_bow(tokens) for tokens in review_text_by_word_test.values()]

train_csv.head()

In [15]:
import collections
#train_csv[['rating', 'text_category']][700:710]
#train_csv['review_text'][702]

#print(train_csv.loc[train_csv['rating']==1][['review_text', 'text_category']])
#train_csv['review_text'][801]

bag_of_garbage = ['-', 'a', 'at', 'about', 'and', 'as', 'at', 'book', 'but', 'by', 'for',
                  'have', 'her', 'i', 'if', 'in', 'is', 'it', 'just', 'my', 'of', 'she', 
                  'so', 'the', 'that', 'this', 'to', 'was', 'with', 'you']

# All review texts whose rating is 0, joined into one lower-cased string.
# A single .loc[rows, column] lookup replaces the chained .loc[...][...] form.
zero_string = ' '.join(list(train_csv.loc[train_csv['rating'] == 0, 'review_text'])).lower()

# Drop the garbage words in one pass with a set lookup instead of re-filtering
# the whole word list once per garbage word.
garbage_words = set(bag_of_garbage)
zero_bow = [word for word in zero_string.split() if word not in garbage_words]


#print(zero_bow)
#print(collections.Counter(zero_bow))

In [16]:
# How often "didn't" occurs in the zero-rating words, and which word follows it.
count_not = zero_bow.count("didn't")

# Pairing each word with its successor never reads past the end of the list,
# so a trailing "didn't" is counted above but contributes no follower here
# (indexing zero_bow[i+1] raised IndexError in that case).
new_bow = [follower for word, follower in zip(zero_bow, zero_bow[1:]) if word == "didn't"]

print(count_not)
print(collections.Counter(new_bow))

#### Are the categories really effective?
Check whether the bad (0) and good (1) review categories are working effectively.

In [17]:
# Distribution of ratings inside each text_category: total row count first, then the breakdown.
rating_by_category = train_csv.groupby('text_category')
rating_distribution = rating_by_category['rating'].value_counts()
print(sum(rating_distribution))
print(rating_distribution)


# Initial tests to the model

In [18]:
#Adjust categorical string values to integer values.
## user_id
# Give every distinct user a dense integer code (0, 1, 2, ...) in order of first
# appearance. Enumerating the raw column instead mapped each user to the row
# position of their LAST review, so the "inverted" dictionary was not actually
# the inverse of the forward one.
user_id_dict = {user: code for code, user in enumerate(train_csv['user_id'].unique())}
user_id_dict_invert = {code: user for user, code in user_id_dict.items()}
#train_csv.replace({'user_id': user_id_dict}, inplace=True)
#test_csv.replace({'user_id': user_id_dict}, inplace=True)
train_csv.head()

In [19]:
months_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 
               'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def _add_review_date_parts(frame):
    """Add integer ``year_of_review`` and ``month_of_review`` columns to ``frame`` in place.

    The year is taken from the last four characters of ``date_added`` and the
    month from the three-letter abbreviation at characters 4-6, looked up in
    ``months_dict``. (Presumably the strings look like
    'Sun Jul 30 07:44:10 -0700 2017' - confirm against the data.)

    An abbreviation missing from ``months_dict`` becomes NaN in the lookup and
    makes the final integer cast raise, so bad input does not pass silently.
    """
    date_text = frame['date_added']
    # Whole-column string slicing replaces the per-row `df[col][i] = ...` loop
    # (chained assignment) and stores the year as an integer, not a string.
    frame['year_of_review'] = date_text.str[-4:].astype(int)
    frame['month_of_review'] = date_text.str[4:7].map(months_dict).astype(int)


_add_review_date_parts(train_csv)
_add_review_date_parts(test_csv)

train_csv['month_of_review'].head()

In [20]:

#train_csv.replace({'month_of_review': user_id_dict}, inplace=True)
#train_csv['month_of_review'].head(20)

# Quick look-up check of the month table for one abbreviation.
month = 'Sep'
month_number = months_dict[month]
print(month_number)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder


# Baseline: linear SVM on the numeric features.
# - the first, immediately overwritten definition of X is removed;
# - 'book_id' is listed once (it appeared twice, duplicating the column);
# - random_state is fixed so the split and the fit are reproducible.
feature_columns = ['book_id', 'n_votes', 'n_comments', 'text_category',
                   'year_of_review', 'month_of_review']

y = train_csv['rating']
X = train_csv[feature_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1)

model = LinearSVC(random_state=1)
model.fit(X_train, y_train)
prediction = model.predict(X_test)

accuracy = accuracy_score(y_test, prediction) * 100
print(f'{accuracy}% accuracy')

# Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the full (subsampled) training frame, then used to
# predict the test frame and write the submission file.
y = train_csv['rating']

#features = ['n_votes', 'book_id', 'n_comments', 'text_category', 'year_of_review', 'month_of_review']
# The list literal had a stray leading comma (`[,'text_category']`), which is a
# SyntaxError and stopped the whole cell from running.
features = ['text_category']

X = pd.get_dummies(train_csv[features])
X_test = pd.get_dummies(test_csv[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)

predictions = model.predict(X_test)
output = pd.DataFrame({'review_id': test_csv.review_id, 'rating': predictions})
output.to_csv('submission.csv', index=False)
print('Your submission was successfully saved')