In [87]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random
import itertools

import sys
import os
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from collections import defaultdict
from keras.layers.convolutional import Convolution1D
from keras import backend as K

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
plt.style.use('ggplot')

In [88]:
data_file = './/Amazon_Unlocked_Mobile.csv'
n = 413000  
s = 20000 
skip = sorted(random.sample(range(1,n),n-s))


df = pd.read_csv( data_file, delimiter = ",", skiprows = skip)

In [89]:
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,muy buen producto,0.0
1,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,solid phone,0.0
2,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,Excellent Product.,0.0
3,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,4,"pros.. it works fine, its easy to use, not too...",1.0
4,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,excelente,0.0
...,...,...,...,...,...,...
20835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,another great deal great price,0.0
20836,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,Ok,0.0
20837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,Passes every drop test onto porcelain tile!,0.0
20838,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,I returned it because it did not meet my needs...,0.0


In [90]:
df.isnull().sum()

Product Name       0
Brand Name      3154
Price            270
Rating             0
Reviews            4
Review Votes     619
dtype: int64

In [91]:
df['Reviews']

0                                        muy buen producto
1                                              solid phone
2                                       Excellent Product.
3        pros.. it works fine, its easy to use, not too...
4                                                excelente
                               ...                        
20835                       another great deal great price
20836                                                   Ok
20837          Passes every drop test onto porcelain tile!
20838    I returned it because it did not meet my needs...
20839    Only downside is that apparently Verizon no lo...
Name: Reviews, Length: 20840, dtype: object

In [92]:
df = df.dropna(subset= ['Reviews'])

In [93]:
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,muy buen producto,0.0
1,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,solid phone,0.0
2,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,Excellent Product.,0.0
3,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,4,"pros.. it works fine, its easy to use, not too...",1.0
4,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,excelente,0.0
...,...,...,...,...,...,...
20835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,another great deal great price,0.0
20836,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,Ok,0.0
20837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,Passes every drop test onto porcelain tile!,0.0
20838,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,3,I returned it because it did not meet my needs...,0.0


### Rating Mapping

In [94]:
Review_mapping =  {0: '0',1 : '0', 2 :'0',3 : '0', 4 : '1', 5 : '1'}
Review_mapping

{0: '0', 1: '0', 2: '0', 3: '0', 4: '1', 5: '1'}

In [95]:
Rating = lambda x: Review_mapping.get(x,x)
df['Rating']=df.Rating.map(Rating)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [96]:
df['Rating'].unique()

array(['1', '0'], dtype=object)

### Text Analysis

In [98]:
df.columns

Index(['Product Name', 'Brand Name', 'Price', 'Rating', 'Reviews',
       'Review Votes'],
      dtype='object')

In [99]:
a = df.iloc[:,3]
print(a)

0        1
1        1
2        1
3        1
4        1
        ..
20835    1
20836    0
20837    1
20838    0
20839    1
Name: Rating, Length: 20836, dtype: object


In [100]:
b = df.iloc[:,4]

In [101]:
doc = pd.concat([a,b], axis = 1)

In [102]:
train_docs, test_docs = train_test_split(doc, test_size = 0.3)

In [103]:
def review_to_wordlist(review, remove_stopwords = True):

    
    review_text = re.sub('[^a-zA-Z]'," ", review)
    #review_text에 영어만 넣기 
    
    words = review_text.lower().split()
    #소문자로 바꿔주고 그것들을 분리해준다 
    
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if not w in stops]
        #stops에 영어의 불용어를 넣어줌 
        #words는 소문자로 변환되고 띄어져있는 것이며 
        #stops에 있는 불용어를 제외하고 넣어줌 
        
    b = []
    stemmer = english_stemmer
    for word in words:
        b.append(stemmer.stem(word))
        #words에 전처리된것들의 어간들만 추출 
        
    return(b)


In [104]:
clean_train_reviews = []
for review in train_docs['Reviews']:
    clean_train_reviews.append( " ".join(review_to_wordlist(review)))
    
clean_test_reviews = []
for review in test_docs['Reviews']:
    clean_test_reviews.append( " ".join(review_to_wordlist(review)))

In [19]:
c = train_docs['Rating']
d = test_docs['Rating']

In [20]:
c.isnull().sum()

0

In [21]:
len(c)

14583

In [22]:
len(d)

6251

In [23]:
train_docs.isnull().sum()

Rating     0
Reviews    0
dtype: int64

In [24]:
test_docs.isnull().sum()

Rating     0
Reviews    0
dtype: int64

# ================ 5/1

### Tf-Idf

In [118]:
X_train = clean_train_reviews
X_test = clean_test_reviews 

y_train = train_docs['Rating']
y_test = test_docs['Rating']

In [122]:
X_train

['even keep back piec attach pocket otterbox defend sever cell phone sturdi even back attach still fit shirt pocket dress shirt one much prettier other phone thus distinct understand peopl black case black cell phone look like everybodi els cell phone stupid travel black suitcas mine maroon',
 'realli like phone everyth need time replac iphon still would use backup need like flip phone also text friend',
 'great',
 'excel',
 'great phone',
 'iphon came origin appl packag two day standard ship work verizon like ad say new condit scuff anyth thank',
 'thisphon unlock friend use',
 'best phone everi own fast clean sleek ton memori could ask anyth better',
 'excelent',
 'phone stop work week',
 'happi',
 'kept drop call',
 'still get know phone well seem work ok get hot use',
 'appear phone look put sim card function motorola',
 'gave phone two star last long bought phone earli decemb start use januari stop work six month later batteri start overh whether charg phone alway hot recent scree

In [119]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import words

In [120]:
vectorizer = CountVectorizer(analyzer = 'word', 
                             lowercase = True,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             min_df = 2, # 토큰이 나타날 최소 문서 개수로 오타나 자주 나오지 않는 특수한 전문용어 제거에 좋다. 
                             ngram_range=(1, 3),
                             vocabulary = set(words.words()), # nltk의 words를 사용하거나 문서 자체의 사전을 만들거나 선택한다. 
                             max_features = 20000
                            )

In [121]:
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer(smooth_idf = False))
])

In [123]:
X_train_tf_idf_vector = pipeline.fit_transform(X_train)

  idf = np.log(n_samples / df) + 1


In [124]:
X_test_tf_idf_vector = pipeline.fit_transform(X_test)

In [125]:
from sklearn.ensemble import RandomForestClassifier

In [126]:
forest = RandomForestClassifier(
    n_estimators = 100, n_jobs = -1, random_state=2018)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2018, verbose=0,
                       warm_start=False)

In [127]:
%time forest = forest.fit(X_train_tf_idf_vector, y_train)

CPU times: user 5min 9s, sys: 883 ms, total: 5min 10s
Wall time: 1min 27s


In [128]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [129]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 2018)

score = np.mean(cross_val_score(\
                               forest, X_train_tf_idf_vector, \
                               y_train, cv = k_fold, scoring = 'roc_auc', n_jobs = -1))

In [130]:
format(score)

'0.9109146273252845'

In [131]:
result = forest.predict(X_test_tf_idf_vector)

In [132]:
y_test

5215     1
9067     0
18853    1
8899     1
13640    1
        ..
5119     0
17938    1
15344    1
8916     0
13295    1
Name: Rating, Length: 6251, dtype: object

In [133]:
a = sum(y_test == result)
print("테스트 셋 정확도 :",a / len(y_test))

테스트 셋 정확도 : 0.8673812190049592


### 전체 데이터셋 활용

In [134]:
df = pd.read_csv('./Amazon_Unlocked_Mobile.csv')

In [135]:
df = df.dropna(subset= ['Reviews'])

In [136]:
df.isnull().sum()

Product Name        0
Brand Name      65155
Price            5930
Rating              0
Reviews             0
Review Votes    12296
dtype: int64

In [137]:
Review_mapping =  {0: '0',1 : '0', 2 :'0',3 : '0', 4 : '1', 5 : '1'}
Review_mapping

{0: '0', 1: '0', 2: '0', 3: '0', 4: '1', 5: '1'}

In [138]:
Rating = lambda x: Review_mapping.get(x,x)
df['Rating']=df.Rating.map(Rating)

In [139]:
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,Great phone to replace my lost phone. The only...,0.0
...,...,...,...,...,...,...
413835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,1,another great deal great price,0.0
413836,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,0,Ok,0.0
413837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,1,Passes every drop test onto porcelain tile!,0.0
413838,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,0,I returned it because it did not meet my needs...,0.0


In [140]:
a = df.iloc[:,3]
b = df.iloc[:,4]

In [141]:
doc = pd.concat([a,b], axis = 1)
train_docs, test_docs = train_test_split(doc, test_size = 0.3)

In [142]:
def review_to_wordlist(review, remove_stopwords = True):

    
    review_text = re.sub('[^a-zA-Z]'," ", review)
    #review_text에 영어만 넣기 
    
    words = review_text.lower().split()
    #소문자로 바꿔주고 그것들을 분리해준다 
    
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if not w in stops]
        #stops에 영어의 불용어를 넣어줌 
        #words는 소문자로 변환되고 띄어져있는 것이며 
        #stops에 있는 불용어를 제외하고 넣어줌 
        
    b = []
    stemmer = english_stemmer
    for word in words:
        b.append(stemmer.stem(word))
        #words에 전처리된것들의 어간들만 추출 
        
    return(b)



In [143]:
clean_train_reviews = []
for review in train_docs['Reviews']:
    clean_train_reviews.append( " ".join(review_to_wordlist(review)))
    
clean_test_reviews = []
for review in test_docs['Reviews']:
    clean_test_reviews.append( " ".join(review_to_wordlist(review)))

In [144]:
X_train = clean_train_reviews
X_test = clean_test_reviews

y_train = train_docs['Rating']
y_test = test_docs['Rating']

In [145]:
len(clean_train_reviews)

289644

In [146]:
vectorizer = CountVectorizer(analyzer = 'word', 
                             lowercase = True,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = 'english',
                             min_df = 2, # 토큰이 나타날 최소 문서 개수로 오타나 자주 나오지 않는 특수한 전문용어 제거에 좋다. 
                             ngram_range=(1, 5),
                             vocabulary = set(words.words()), # nltk의 words를 사용하거나 문서 자체의 사전을 만들거나 선택한다. 
                             max_features = 300000
                            )

In [147]:
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer(smooth_idf = False))
])

In [148]:
X_train_tf_idf_vector = pipeline.fit_transform(X_train)
X_test_tf_idf_vector = pipeline.fit_transform(X_test)

  idf = np.log(n_samples / df) + 1


In [149]:
forest = RandomForestClassifier(
    n_estimators = 100, n_jobs = -1, random_state=2018)
forest


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2018, verbose=0,
                       warm_start=False)

In [150]:
%time forest = forest.fit(X_train_tf_idf_vector, y_train)

CPU times: user 1h 37min 15s, sys: 11.2 s, total: 1h 37min 26s
Wall time: 26min 28s


In [151]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 2018)

score = np.mean(cross_val_score(\
                               forest, X_train_tf_idf_vector, \
                               y_train, cv = k_fold, scoring = 'roc_auc', n_jobs = -1))
print(format(score))

0.9716094368926219


In [152]:
result = forest.predict(X_test_tf_idf_vector)
a = sum(y_test == result)
print("테스트 셋 정확도 :",a / len(y_test))

테스트 셋 정확도 : 0.9392752992733658
