In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn import metrics

from src.cleantext import *

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

In [None]:
#pip install nltk

# Import Yelp dataset

In [None]:
# Read the predicted-review CSV and discard the two identifier columns.
data = (
    pd.read_csv('../cap2/data/predicted_review.csv')
    .drop(['business_id', 'user_id'], axis=1)
)

In [None]:
# Replace the label column with booleans: True where the raw value equals the string 'True'.
# NOTE(review): if read_csv already parsed this column as bool, comparing against the
# string 'True' would give False everywhere -- confirm the raw dtype.
data['True(1)/Deceptive(0)'] = data['True(1)/Deceptive(0)'].eq('True').astype(bool)

In [None]:
# Show the Python type of the label stored under index label 0.
type(data['True(1)/Deceptive(0)'].loc[0])

**Separating into two DataFrames (true vs. deceptive)**

In [None]:
# Rows whose label compares equal to 1 (a boolean True equals 1).
true = data.loc[data['True(1)/Deceptive(0)'].eq(1)]

In [None]:
# Persist the true-labelled rows; the index is written too (default index=True).
true.to_csv(path_or_buf='true.csv')

In [None]:
# Rows whose label compares equal to 0 (a boolean False equals 0), saved to their own CSV.
decep = data.loc[data['True(1)/Deceptive(0)'].eq(0)]
decep.to_csv(path_or_buf='decep.csv')

In [10]:
# Reload the saved subset and drop the unnamed column holding the old index.
true = pd.read_csv('true.csv').drop('Unnamed: 0', axis=1)

In [11]:
# Reload the saved subset and drop the unnamed column holding the old index.
decep = pd.read_csv('decep.csv').drop('Unnamed: 0', axis=1)

In [42]:
# Count missing values per column.
true.isna().sum()

Review                  0
Stars                   0
True(1)/Deceptive(0)    0
dtype: int64

In [43]:
# Count missing values per column.
decep.isna().sum()

Review                  0
Stars                   0
True(1)/Deceptive(0)    0
dtype: int64

Sample 100,000 reviews from each class for faster processing

In [12]:
# Draw a reproducible random sample of 100,000 rows (fixed seed).
true_samp = true.sample(n=100000, random_state=123)

In [13]:
# Draw a reproducible random sample of 100,000 rows (fixed seed).
decep_samp = decep.sample(n=100000, random_state=123)

**Preprocessing the data**

clean text

In [14]:
# Replace the Review column with its cleaned version.
# cleanText is presumably provided by the star import from src.cleantext -- confirm.
true_samp['Review'] = cleanText(true_samp['Review'])

In [15]:
# Replace the Review column with its cleaned version.
# cleanText is presumably provided by the star import from src.cleantext -- confirm.
decep_samp['Review'] = cleanText(decep_samp['Review'])

tokenize

In [16]:
import nltk
# Fetch the NLTK 'punkt' package (presumably required by word_tokenize below -- confirm).
# The call's return value is the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/suchaya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Split every review string into a list of tokens.
# word_tokenize is not imported in this notebook; presumably it arrives through
# the star import from src.cleantext -- confirm.
true_samp['Review'] = true_samp.Review.apply(word_tokenize)

In [18]:
# Split every review string into a list of tokens.
# word_tokenize is not imported in this notebook; presumably it arrives through
# the star import from src.cleantext -- confirm.
decep_samp['Review'] = decep_samp.Review.apply(word_tokenize)

Remove stopwords

In [19]:
# Fetch the NLTK 'stopwords' package; the call's return value is the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suchaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Build a set of English stopwords so membership tests are O(1).
# `stopwords` is not imported here; presumably it comes from the star import -- confirm.
stopwords_ = {word for word in stopwords.words('english')}

In [21]:
# Drop every token that appears in the stopword set, keeping token order.
true_samp['Review'] = true_samp['Review'].apply(
    lambda tokens: list(filter(lambda tok: tok not in stopwords_, tokens))
)

In [22]:
# Drop every token that appears in the stopword set, keeping token order.
decep_samp['Review'] = decep_samp['Review'].apply(
    lambda tokens: list(filter(lambda tok: tok not in stopwords_, tokens))
)

Stemming

In [23]:
# Stem every token of every review with an English Snowball stemmer.
# SnowballStemmer is not imported here; presumably it comes from the star import -- confirm.
stemmer = SnowballStemmer("english")
true_samp['Review'] = true_samp['Review'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
decep_samp['Review'] = decep_samp['Review'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

Detokenize (join the tokens back into a single string per review)

In [24]:
# Join each token list back into one space-separated string.
true_samp['Review'] = true_samp['Review'].apply(' '.join)
decep_samp['Review'] = decep_samp['Review'].apply(' '.join)

In [25]:
# Checkpoint both preprocessed samples to disk; the index is written too (default index=True).
true_samp.to_csv(path_or_buf='true_samp.csv')
decep_samp.to_csv(path_or_buf='decep_samp.csv')

***Data Loaded to CSV checkpoint***

In [2]:
# Reload the checkpoint and drop the unnamed column holding the saved index.
true_samp = pd.read_csv('true_samp.csv').drop(columns='Unnamed: 0')

In [3]:
# Reload the checkpoint and drop the unnamed column holding the saved index.
decep_samp = pd.read_csv('decep_samp.csv').drop(columns='Unnamed: 0')

Vectorize

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(X, max_features=None, ngram_range=None):
    '''
    Fit a TF-IDF vectorizer on X and return the document-term weights as a DataFrame.

    X: Detokenized Column (iterable of strings, one per document)
    max_features: maximum features wanted (int); None keeps the whole vocabulary
    ngram_range: (min_n, max_n) tuple; None falls back to unigrams, i.e. (1, 1)

    Returns a dense pandas DataFrame with one row per document and one column per
    vocabulary term. The result carries a fresh 0..n-1 index, not the index of X.
    '''
    # None is not a valid ngram_range for TfidfVectorizer; use its documented default.
    if ngram_range is None:
        ngram_range = (1, 1)

    # Pass by keyword: positionally these would bind to `input` and `encoding`
    # (and recent scikit-learn releases reject positional arguments outright).
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)

    # toarray() yields a plain ndarray rather than the legacy np.matrix from todense().
    doc_tfidf_matrix = tfidf.fit_transform(X).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back only on older installations.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    vector = pd.DataFrame(doc_tfidf_matrix, columns=feature_names)
    return vector

In [5]:
# Keep only the review text and the star rating.
true_samp = true_samp.loc[:, ['Review', 'Stars']]

In [6]:
# Keep only the review text and the star rating.
decep_samp = decep_samp.loc[:, ['Review', 'Stars']]

In [7]:
# Show the rows whose Review is missing after the CSV round-trip.
true_samp[true_samp['Review'].isna()]

Unnamed: 0,Review,Stars
6377,,4.0
20727,,3.0
21103,,1.0
28024,,3.0
32129,,4.0
33934,,3.0
53287,,4.0
58479,,4.0
75875,,4.0
79572,,2.0


**Use the first 50k rows of each sample for faster processing and so the subset stays easy to keep track of**

In [8]:
# Replace missing reviews with the literal string 'None' so the vectorizer receives text.
true_samp['Review'] = true_samp['Review'].fillna(value='None')

In [9]:
# TF-IDF features (capped at 1000 terms) for the first 50,000 reviews.
true_samp_vec = vectorize(true_samp['Review'].iloc[:50000], max_features=1000)

In [10]:
# Replace missing reviews with the literal string 'None' so the vectorizer receives text.
decep_samp['Review'] = decep_samp['Review'].fillna(value='None')

In [11]:
# TF-IDF features (capped at 1000 terms) for the first 50,000 reviews.
decep_samp_vec = vectorize(decep_samp['Review'].iloc[:50000], max_features=1000)

Combine the TF-IDF vectors with the other feature (Stars)

In [None]:
# Append the star rating of the same first 50,000 rows as an extra column.
# Rows are aligned by index label.
true_frame = pd.concat(
    [true_samp_vec, true_samp['Stars'].iloc[:50000]],
    axis='columns',
)

In [None]:
# Display the combined feature frame for a visual check.
true_frame

In [None]:
# Append the star rating of the same first 50,000 rows as an extra column.
# The row slice must be applied to the Series: the previous form sliced the
# one-element list ['Stars'], so every row was concatenated and the rows beyond
# the vectorized 50,000 came out as NaN in all TF-IDF columns.
decep_frame = pd.concat([decep_samp_vec, decep_samp['Stars'][:50000]], axis = 1)

split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the true-review feature matrix, with a fixed seed for reproducibility.
X_true_train, X_true_test = train_test_split(
    true_frame.to_numpy(),
    test_size=0.2,
    random_state=123,
)

# First Simple Autoencoder Model

In [None]:
# Autoencoder trained only on true reviews: input -> 25 -> 3 -> 25 -> input.
# Input and target are the same matrix, so the network learns to reconstruct it.
# The layer width comes from X_true_train; the previous `x_true_train` and
# `x_normal` names were never defined and raised NameError.
n_features = X_true_train.shape[1]

model = Sequential()
model.add(Dense(25, input_dim=n_features, activation='relu'))
model.add(Dense(3, activation='relu'))  # bottleneck
model.add(Dense(25, activation='relu'))
model.add(Dense(n_features)) # Multiple output neurons: one per input feature
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_true_train,X_true_train,verbose=1,epochs=100)

In [None]:
# Reconstruction error (RMSE) of the autoencoder on three data sets.
# NOTE(review): true_frame and decep_frame are vectorized by separate vectorize()
# calls, each fitting its own TF-IDF vocabulary, so their columns may not refer
# to the same terms -- confirm before comparing score3 with the other scores.

# Held-out true reviews, which the model never saw during training.
pred = model.predict(X_true_test)
score1 = np.sqrt(metrics.mean_squared_error(X_true_test, pred))

# All true reviews (training rows plus the held-out rows).
pred = model.predict(true_frame.values)
score2 = np.sqrt(metrics.mean_squared_error(true_frame.values, pred))

# Deceptive reviews (fixes the undefined name `decept_frame`).
pred = model.predict(decep_frame.values)
score3 = np.sqrt(metrics.mean_squared_error(decep_frame.values, pred))

# Labels now match the data each score was computed on; score1 is the held-out set.
print(f"Out of Sample Normal Score (RMSE): {score1}")
print(f"Insample Normal Score (RMSE): {score2}")
print(f"Attack Underway Score (RMSE): {score3}")  # i.e. the deceptive-review score