In [0]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s gdrive/'Team Drives'/'Data Mining Team'/ gdata


In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
from textblob import TextBlob
import spacy
from tqdm import tqdm
import pickle
import keras
import re
from sklearn import metrics
import scipy as sp

In [0]:
# Load the pickled DataFrame used by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("gdata/ModifiedData_wTokenized.pkl", 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [0]:
# ! pandas.version


In [0]:
# Number of rows per label value in the 'Target' column.
target_counts = df['Target'].value_counts()
target_counts

In [0]:
#Train on other features
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
dfType = df.dtypes.apply(lambda x: x.name).to_dict()
df_ros, _ = ros.fit_resample(df, df['Target'])
df = pd.DataFrame(df_ros, columns=df.columns).astype(dfType)

In [0]:
from sklearn.model_selection import train_test_split

y = df['Target']
X = df.drop(['Target'], axis=1)

# Split the train data into X_train and y_train datasets in 80:20 ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

#X_train, X_combine, y_train, y_combine = train_test_split(
#    X_train, y_train, test_size=0.2, random_state=42)

print("Train data shape : " + str(X_train.shape))
#print("Combine data shape : " + str(X_combine.shape))
print("Test data shape : " + str(X_test.shape))
print(y_train.value_counts())
#print(y_combine.value_counts())
print(y_test.value_counts())

In [0]:
# Preview the first five rows of the training features.
# (X_train.shape and y_train.shape can be inspected the same way.)
X_train.head(n=5)

In [0]:
from sklearn.linear_model import SGDClassifier

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
# The step names are reused as parameter prefixes by the grid search.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

# Train on the raw review text.
train_reviews = X_train['reviewContent']
text_clf = text_clf.fit(train_reviews, y_train)


In [0]:
from sklearn.model_selection import GridSearchCV

# Search grid; keys follow the '<pipeline step>__<parameter>' convention.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# parameters = {'vect__ngram_range': [(1, 2)], 'clf__alpha': [1e-3], 'vect__min_df': [1e-05]}

# n_jobs=-1 lets the search use every available core.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(
    X_train['reviewContent'],
    y_train,
)


In [0]:
# Best cross-validated score, then the parameter combination that produced it.
for search_result in (gs_clf.best_score_, gs_clf.best_params_):
    print(search_result)

In [0]:
# Label the test reviews with the grid-searched Naive Bayes pipeline.
test_reviews = X_test['reviewContent']
predicted = gs_clf.predict(test_reviews)

In [0]:
# Raw predicted labels, followed by the per-class classification report
# computed against the true test labels.
print(predicted)
nb_report = metrics.classification_report(y_test, predicted)
print(nb_report)

In [0]:
# Persist the fitted grid search to the shared drive folder.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# import the standalone package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(gs_clf, 'gdata/Naive_Bayes_GS.skmodel')

In [0]:

from sklearn.linear_model import SGDClassifier

In [0]:
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weighting -> linear SVM (hinge loss) trained with SGD.
# SGDClassifier no longer accepts `n_iter`; max_iter=5 together with tol=None
# runs the same fixed five passes over the data without early stopping.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train['reviewContent'], y_train)


In [0]:
from sklearn.model_selection import GridSearchCV

# Search grid for the SVM pipeline; the classifier step is named 'clf-svm',
# hence the 'clf-svm__alpha' key.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 lets the search use every available core.
svm_search = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = svm_search.fit(X_train['reviewContent'], y_train)


In [0]:
# Accuracy of the grid-searched SVM pipeline on the test reviews.
# Predictions come from the tuned search result (as in the Naive Bayes cells)
# and are compared with the true labels: comparing them with the review text
# itself can never match and always yields 0.
predicted_svm = gs_clf_svm.predict(X_test['reviewContent'])
np.mean(predicted_svm == y_test)


In [0]:
# The names ‘vect’, ‘tfidf’ and ‘clf’ are arbitrary, but they are used later as parameter prefixes in the grid search.
# We will be using 'text_clf' going forward.


# text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

# text_clf = text_clf.fit(X_train['tokenized'], y_train)

In [0]:

# Next, we create a grid-search instance by passing it the classifier, the parameter grid,
# and n_jobs=-1, which tells it to use all available cores on the user's machine.

# gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

In [0]:
#Ref: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
#Don't fit test data. Only transform it.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training text only; the test text is merely
# transformed with that already-fitted vectorizer.
train_tokens = X_train['tokenized']
test_tokens = X_test['tokenized']
X_train_idf = vectorizer.fit_transform(train_tokens)
X_test_idf = vectorizer.transform(test_tokens)

# Shapes of the two TF-IDF matrices, then of the two label vectors.
for part in (X_train_idf, X_test_idf, y_train, y_test):
    print(part.shape)

In [0]:
from sklearn.linear_model import SGDClassifier

In [0]:
# Same SVM pipeline, refitted on a single text column.
# SGDClassifier no longer accepts `n_iter`; max_iter=5 together with tol=None
# runs the same fixed five passes over the data without early stopping.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])
# CountVectorizer needs an iterable of documents. Passing the whole DataFrame
# makes it iterate over the column names, so a single text column is selected.
# NOTE(review): 'tokenized' is assumed to be the intended column because the
# preceding TF-IDF cell uses it - confirm (the earlier SVM cell uses 'reviewContent').
text_clf_svm = text_clf_svm.fit(X_train['tokenized'], y_train)

In [0]:
#flagged = df.loc[df['flagged'] == "Y"].groupby('rating').mean()
# Correlation heatmap over the rows whose 'Target' equals 1.
flagged = df.loc[df['Target'] == 1]
# Select numeric/bool columns explicitly: recent pandas versions raise in
# corr() when the frame still contains text columns, whereas older versions
# silently skipped them. This selection works on both.
flagged_numeric = flagged.select_dtypes(include=['number', 'bool'])
sns.heatmap(data=flagged_numeric.corr(), annot=True)


In [0]:
#Combine models
# NOTE(review): X_combine, X_combine_idf, nbModel and modelXGB are not defined
# in the visible cells (the X_combine split is commented out above) - confirm
# where they come from before running this on a fresh kernel.
X_combine["csrPredict"]  = nbModel.predict(X_combine_idf)

# Columns that are not inputs of the feature model.
non_feature_columns = ['csrPredict', 'tokenized', 'date', 'reviewID',
                       'reviewerID', 'reviewContent', 'hotelID']
# 'featurePredict' only exists after this cell has run once; dropping it when
# present keeps the feature matrix identical on a re-run.
X_combine_features = (
    X_combine.drop(non_feature_columns, axis=1)
    .drop(['featurePredict'], axis=1, errors='ignore')
)

X_combine['featurePredict'] = modelXGB.predict(X_combine_features)



In [0]:
from sklearn.linear_model import SGDClassifier

# Fit a default SGD classifier on the two base-model prediction columns.
stack_columns = ['csrPredict', 'featurePredict']
y = y_combine
X = X_combine[stack_columns].copy()

sgd = SGDClassifier()
sgd.fit(X, y)

In [0]:
#Test accuracy of combined models
# NOTE(review): nbModel and modelXGB are not defined in the visible cells -
# confirm where they come from before running this on a fresh kernel.
X_test["csrPredict"]  = nbModel.predict(X_test_idf)

# Columns that are not inputs of the feature model (the duplicated
# 'tokenized' entry of the original list is removed).
non_feature_columns = ['csrPredict', 'tokenized', 'date', 'reviewID',
                       'reviewerID', 'reviewContent', 'hotelID']
# 'featurePredict' only exists after this cell has run once; dropping it when
# present keeps the feature matrix identical on a re-run.
X_test_features = (
    X_test.drop(non_feature_columns, axis=1)
    .drop(['featurePredict'], axis=1, errors='ignore')
)
X_test['featurePredict'] = modelXGB.predict(X_test_features)

y = y_test
X = X_test[['csrPredict', 'featurePredict']].copy()

In [0]:
# Classification report of the combined model's predictions against the
# true test labels.
y_pred = sgd.predict(X)
stack_report = metrics.classification_report(y_test, y_pred)
print(stack_report)

In [0]:
###LDA and Bag of words

import gensim
from gensim import corpora, models

# Each entry of the 'tokenized' column is split on whitespace into a token list.
processed_docs = list(map(str.split, X_train['tokenized']))

# Dictionary built from the token lists.
dictionary = corpora.Dictionary(processed_docs)


#lda_model_tfidf = gensim.models.LdaMulticore(X_train_idf, num_topics=10, 
#                                             id2word=dictionary, passes=2, workers=4)

#for idx, topic in lda_model_tfidf.print_topics(-1):
#    print('Topic: {} Word: {}'.format(idx, topic))

In [0]:
# Prune the dictionary (no_below=15, no_above=0.5, keep_n=100000) before
# building the corpora.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation of every training document.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

from gensim import corpora, models

# TF-IDF re-weighting of the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# LDA training is stochastic; random_state seeds it with the same value (42)
# used elsewhere in this notebook so topics are comparable between runs.
# NOTE(review): with several workers the result may still vary slightly
# between runs - confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10,
                                       id2word=dictionary, passes=2, workers=2,
                                       random_state=42)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10,
                                             id2word=dictionary, passes=2, workers=4,
                                             random_state=42)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))