# 139 Project - Yelp

In [103]:
import pandas as pd

df = pd.read_csv('review.csv') 
del df['funny']
del df['user_id']
del df['review_id']
del df['date']
del df['business_id']
del df['useful']
del df['cool']

df_train = df.iloc[:100000]
df_test = df.iloc[100000:200000]
a = []
for i in df_train['stars']:
    if i >= 4:
        a.append(1)
    elif i == 3:
        a.append(0)
    else:
        a.append(-1)
df_train.loc[:,('rate')] = a

#del df_test['stars']

In [39]:
# lowercase reviews
df_train.loc[:,('text')] = df_train.loc[:,('text')].str.lower()
df_test.loc[:,('text')] = df_test.loc[:,('text')].str.lower()

In [48]:
# replace non-alphanumeric characters with a space for each line
import re
train_txt = [re.sub('\W+',' ', r ) for r in df_train.loc[:,('text')]]
test_txt = [re.sub('\W+',' ', r ) for r in df_test.loc[:,('text')]]

In [54]:
train_txt = [i.replace('not ', 'not') for i in train_txt]
test_txt = [i.replace('not ', 'not') for i in test_txt]

In [55]:
# transform docs into lists of words by tokenizing
from nltk import word_tokenize
train_docs = [word_tokenize(r) for r in train_txt]
test_docs = [word_tokenize(r) for r in test_txt]

In [57]:
# create a list of unimportant words to filter out
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
stopset.add(unicode('would'))
stopset.add(unicode('they'))

In [58]:
# filter out stopwords and use stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
train_docs = [[ps.stem(w).encode('utf-8') for w in d if w not in stopset] for d in train_docs]
test_docs = [[ps.stem(w).encode('utf-8') for w in d if w not in stopset] for d in test_docs]

In [63]:
def filterLen(docs, minlen):
    r""" filter out terms that are too short. 
    docs is a list of lists, each inner list is a document represented as a list of words
    minlen is the minimum length of the word to keep
    """
    return [ [t for t in d if len(t) >= minlen ] for d in docs ]

# filter out words less than 3 characters
train_arr = filterLen(train_docs, 3)
test_arr = filterLen(test_docs, 3)

In [68]:
import numpy as np
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import scipy.sparse as sp

def cmer(name, c=3):
    r""" Given a name and parameter c, return the vector of c-mers associated with the name
    """
    name = name.lower()
    if len(name) < c:
        return [name]
    v = []
    for i in range(len(name)-c+1):
        v.append(name[i:(i+c)])
    return v

def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
    print(val)
    print(ind)
    print(ptr)
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat

def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )
        
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat
        
def namesToMatrix(names, c):
    docs = [cmer(n, c) for n in names]
    return build_matrix(docs)

In [75]:
all_docs = train_arr + test_arr

In [78]:
csr_mat = build_matrix(all_docs)

[ 4.  3.  2. ...,  1.  1.  1.]
[  48    5   18 ..., 2796   23 3426]
[      0      84     155 ..., 9362186 9362338 9362400]


In [84]:
train_csr = csr_mat[:100000]
test_csr = csr_mat[100000:200000]

In [95]:
train_stars = [s for s in df_train['stars']]
train_rate = [r for r in df_train['rate']]

In [98]:
from sklearn import linear_model
clf_sgd = linear_model.SGDClassifier()
clf_sgd.fit(train_csr, train_stars)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [132]:
pred_stars_sgd = []
for row in test_csr:
    r = row.todense() 
    p = clf_sgd.predict(r)
    pred_stars_sgd.append(p[0])

In [125]:
true_stars = [s for s in df_test['stars']]

In [134]:
#print true_stars
matches = 0
for i in range(len):
    if pred_stars_sgd[i] == true_stars[i]:
        matches += 1

In [135]:
print matches

59193
