In [1]:
import pandas as pd

In [2]:
sw = pd.read_csv('SentiWordNet.txt', encoding='windows-1251', sep='\t')
sw.head()

Unnamed: 0,Word,Positive,Negative,PartOfSpeech,Definition
0,able,0.125,0.0,a,(usually followed by `to') having the necessar...
1,unable,0.0,0.75,a,(usually followed by `to') not having the nece...
2,dorsal,0.0,0.0,a,facing away from the axis of an organ or organ...
3,abaxial,0.0,0.0,a,facing away from the axis of an organ or organ...
4,ventral,0.0,0.0,a,nearest to or facing toward the axis of an org...


In [3]:
pw = pd.read_csv('positive-words.txt', encoding='windows-1251', sep='\t').values.flatten()
pw

array(['abound', 'abounds', 'abundance', ..., 'zenith', 'zest', 'zippy'],
      dtype=object)

In [4]:
nw = pd.read_csv('negative-words.txt', encoding='windows-1251', sep='\t', header=None).values.flatten()
nw

array(['2-faced', '2-faces', 'abnormal', ..., 'zealous', 'zealously',
       'zombie'], dtype=object)

In [5]:
train = pd.read_csv('train.csv', header=0, encoding='windows-1251')
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [6]:
test = pd.read_csv('test.csv', header=0, encoding='windows-1251')
test.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text
0,2351,ITC Grand Chola,Mr Neeraj,On the night of my arrival from NY I had a min...
1,2352,Hotel Pandian,,Not so great. But it is still acceptable. Bit...
2,2353,Oyo Rooms Guindy Olympia Tech Park,Nice stay for corporate people,Been a good place to stay for people who visit...
3,2354,OYO Apartments Saidapet,Average hotel,Not worth of the money we paid.worst ac.no wat...
4,2355,Ramada Chennai Egmore,A good mid range corporate hotel,"A well located hotel, with decent sized rooms ..."


In [7]:
X_train = train.iloc[:,2:-1].fillna('')
y_train = train.Rating

In [8]:
X_train.head()

Unnamed: 0,Review_Title,Review_Text
0,Refuge in Chennai,Excellent room and exercise facility. All arou...
1,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...
2,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...
3,Good stay,"First of all nice & courteous staff, only one ..."
4,Needs improvement,Overall ambience of the hotel is very good. In...


In [9]:
import re

In [10]:
import nltk
from nltk.corpus import stopwords
sw_eng = set(stopwords.words('english'))

In [11]:
from nltk import WordNetLemmatizer
from nltk import wordnet, pos_tag

def get_wordnet_pos(treebank_tag):
    
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [12]:
def preproc(x):
    lemmatizer = my_lemmatizer(' '.join([re.sub('[(){}<>]', '', i).strip('-') for i in re.split(r'[\n\r\s\d.,&?!*;:]', \
                                            x.lower()) if re.sub('[(){}<>]', '', i).strip('-') not in ('', '-')]))
    
    return ' '.join([i for i in lemmatizer.split() if i not in sw_eng])

In [13]:
rev_title_words = X_train.Review_Title.apply(preproc)
rev_text_words = X_train.Review_Text.apply(preproc)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tdidf = TfidfVectorizer(ngram_range=(1,3), stop_words='english')

In [19]:
data_words = rev_title_words + ' ' + rev_text_words

In [20]:
train_words = tdidf.fit_transform(data_words.values)

In [21]:
title_len = X_train.Review_Title.apply(len)
text_len = X_train.Review_Text.apply(len)
total_len = title_len + text_len

In [22]:
title_count = X_train.Review_Title.apply(lambda x: len(x.split()))
text_count = X_train.Review_Text.apply(lambda x: len(x.split()))
total_count = title_count + text_count 

In [23]:
pos_numb = data_words.apply(lambda x: sum([1 for i in x.split() if i in pw]))
neg_numb = data_words.apply(lambda x: sum([1 for i in x.split() if i in nw]))

In [24]:
pos = {i:j for i, j in zip(sw.Word, sw.Positive)}
neg = {i:j for i, j in zip(sw.Word, sw.Negative)}

In [25]:
pos_count = data_words.apply(lambda x: sum([pos.get(i,0) for i in x.split() if i in pw]))
neg_count = data_words.apply(lambda x: sum([neg.get(i,0) for i in x.split() if i in pw]))

In [26]:
import numpy as np

In [27]:
train_feat = pd.DataFrame((title_len, text_len, total_len, title_count, text_count, total_count, 
           pos_numb, neg_numb, pos_count, neg_count)).T

In [28]:
train_feat.shape, train_words.shape

((2351, 10), (2351, 57132))

In [29]:
train_all = pd.DataFrame(train_words.todense()).join(train_feat).values

In [34]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB

In [36]:
from sklearn.metrics import mean_squared_error as MSE

def sqrt_mse(y_true, y_pred):
    return np.sqrt(MSE(y_true, y_pred))

In [53]:
Gauss = GaussianNB(var_smoothing=6.315826315789473e-11).fit(train_all, y_train)

In [54]:
y_pred_train = Gauss.predict(train_all)

sqrt_mse(y_train, y_pred_train)

5.403851990337551

In [231]:
np.sqrt(np.square(y_pred_train - 73).mean())

13.611013547335249

Теперь то же самое для тестовой выборки

In [55]:
X_test = test.iloc[:,2:].fillna('')
X_test.head()

Unnamed: 0,Review_Title,Review_Text
0,Mr Neeraj,On the night of my arrival from NY I had a min...
1,,Not so great. But it is still acceptable. Bit...
2,Nice stay for corporate people,Been a good place to stay for people who visit...
3,Average hotel,Not worth of the money we paid.worst ac.no wat...
4,A good mid range corporate hotel,"A well located hotel, with decent sized rooms ..."


In [57]:
rev_title_words = X_test.Review_Title.apply(preproc)
rev_text_words = X_test.Review_Text.apply(preproc)

data_words = rev_title_words + ' ' + rev_text_words
test_words = tdidf.transform(data_words.values)

title_len = X_test.Review_Title.apply(len)
text_len = X_test.Review_Text.apply(len)
total_len = title_len + text_len

title_count = X_test.Review_Title.apply(lambda x: len(x.split()))
text_count = X_test.Review_Text.apply(lambda x: len(x.split()))
total_count = title_count + text_count

pos_numb = data_words.apply(lambda x: sum([1 for i in x.split() if i in pw]))
neg_numb = data_words.apply(lambda x: sum([1 for i in x.split() if i in nw]))

pos_count = data_words.apply(lambda x: sum([pos.get(i,0) for i in x.split() if i in pw]))
neg_count = data_words.apply(lambda x: sum([neg.get(i,0) for i in x.split() if i in pw]))

test_feat = pd.DataFrame((title_len, text_len, total_len, title_count, text_count, total_count, 
           pos_numb, neg_numb, pos_count, neg_count)).T

test_all = pd.DataFrame(test_words.todense()).join(test_feat).values

In [58]:
y_pred = Gauss.predict(test_all)

In [59]:
sub = pd.DataFrame(test.Id.astype(str) + ',' + y_pred.astype(str))
sub.columns = ['Id, Rating']
sub.head()

Unnamed: 0,"Id,Rating"
0,2351100.0
1,235271.0
2,235357.0
3,235486.0
4,235586.0


In [60]:
sub.to_csv('sub_Kopyl_Gauss.csv', index=False)